From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 19:32:43 +0200
Subject: Adding upstream version 1:115.7.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 third_party/jpeg-xl/lib/jxl/ac_context.h           |  149 +
 third_party/jpeg-xl/lib/jxl/ac_strategy.cc         |  108 +
 third_party/jpeg-xl/lib/jxl/ac_strategy.h          |  261 +
 third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc    |  237 +
 third_party/jpeg-xl/lib/jxl/alpha.cc               |  115 +
 third_party/jpeg-xl/lib/jxl/alpha.h                |   66 +
 third_party/jpeg-xl/lib/jxl/alpha_test.cc          |  134 +
 third_party/jpeg-xl/lib/jxl/ans_common.cc          |  148 +
 third_party/jpeg-xl/lib/jxl/ans_common.h           |  143 +
 third_party/jpeg-xl/lib/jxl/ans_common_test.cc     |   43 +
 third_party/jpeg-xl/lib/jxl/ans_params.h           |   36 +
 third_party/jpeg-xl/lib/jxl/ans_test.cc            |  278 +
 third_party/jpeg-xl/lib/jxl/base/arch_macros.h     |   33 +
 third_party/jpeg-xl/lib/jxl/base/bits.h            |  147 +
 third_party/jpeg-xl/lib/jxl/base/byte_order.h      |  274 +
 third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc  |  157 +
 third_party/jpeg-xl/lib/jxl/base/cache_aligned.h   |   74 +
 .../jpeg-xl/lib/jxl/base/compiler_specific.h       |  157 +
 third_party/jpeg-xl/lib/jxl/base/data_parallel.cc  |   23 +
 third_party/jpeg-xl/lib/jxl/base/data_parallel.h   |  120 +
 third_party/jpeg-xl/lib/jxl/base/file_io.h         |  153 +
 third_party/jpeg-xl/lib/jxl/base/float.h           |   98 +
 third_party/jpeg-xl/lib/jxl/base/iaca.h            |   65 +
 third_party/jpeg-xl/lib/jxl/base/os_macros.h       |   50 +
 third_party/jpeg-xl/lib/jxl/base/override.h        |   29 +
 third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc   |   63 +
 third_party/jpeg-xl/lib/jxl/base/padded_bytes.h    |  197 +
 third_party/jpeg-xl/lib/jxl/base/printf_macros.h   |   34 +
 third_party/jpeg-xl/lib/jxl/base/profiler.cc       |  540 ++
 third_party/jpeg-xl/lib/jxl/base/profiler.h        |  170 +
 third_party/jpeg-xl/lib/jxl/base/random.cc         |   21 +
 third_party/jpeg-xl/lib/jxl/base/random.h          |   95 +
 .../jpeg-xl/lib/jxl/base/sanitizer_definitions.h   |   44 +
 third_party/jpeg-xl/lib/jxl/base/scope_guard.h     |   48 +
 third_party/jpeg-xl/lib/jxl/base/span.h            |   60 +
 third_party/jpeg-xl/lib/jxl/base/status.h          |  326 ++
 third_party/jpeg-xl/lib/jxl/base/tsc_timer.h       |  172 +
 third_party/jpeg-xl/lib/jxl/bit_reader_test.cc     |  262 +
 third_party/jpeg-xl/lib/jxl/bits_test.cc           |   87 +
 third_party/jpeg-xl/lib/jxl/blending.cc            |  152 +
 third_party/jpeg-xl/lib/jxl/blending.h             |   24 +
 third_party/jpeg-xl/lib/jxl/blending_test.cc       |   37 +
 third_party/jpeg-xl/lib/jxl/box_content_decoder.cc |  101 +
 third_party/jpeg-xl/lib/jxl/box_content_decoder.h  |   49 +
 .../jpeg-xl/lib/jxl/butteraugli/butteraugli.cc     | 1988 +++++++
 .../jpeg-xl/lib/jxl/butteraugli/butteraugli.h      |  209 +
 third_party/jpeg-xl/lib/jxl/butteraugli_test.cc    |  103 +
 third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc |  203 +
 third_party/jpeg-xl/lib/jxl/byte_order_test.cc     |   53 +
 third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc    |   21 +
 third_party/jpeg-xl/lib/jxl/chroma_from_luma.h     |  147 +
 third_party/jpeg-xl/lib/jxl/codec_in_out.h         |  116 +
 third_party/jpeg-xl/lib/jxl/coeff_order.cc         |  153 +
 third_party/jpeg-xl/lib/jxl/coeff_order.h          |   64 +
 third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h      |   47 +
 third_party/jpeg-xl/lib/jxl/coeff_order_test.cc    |   97 +
 .../jpeg-xl/lib/jxl/color_encoding_internal.cc     |  753 +++
 .../jpeg-xl/lib/jxl/color_encoding_internal.h      |  463 ++
 .../lib/jxl/color_encoding_internal_test.cc        |  157 +
 third_party/jpeg-xl/lib/jxl/color_management.cc    |  682 +++
 third_party/jpeg-xl/lib/jxl/color_management.h     |   40 +
 .../jpeg-xl/lib/jxl/color_management_test.cc       |  405 ++
 third_party/jpeg-xl/lib/jxl/common.h               |  245 +
 third_party/jpeg-xl/lib/jxl/compressed_dc.cc       |  318 ++
 third_party/jpeg-xl/lib/jxl/compressed_dc.h        |   34 +
 third_party/jpeg-xl/lib/jxl/convolve-inl.h         |  297 ++
 third_party/jpeg-xl/lib/jxl/convolve.h             |  105 +
 third_party/jpeg-xl/lib/jxl/convolve_separable5.cc |  261 +
 third_party/jpeg-xl/lib/jxl/convolve_separable7.cc |  285 +
 third_party/jpeg-xl/lib/jxl/convolve_slow.cc       |  212 +
 third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc |  194 +
 third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc |  185 +
 third_party/jpeg-xl/lib/jxl/convolve_test.cc       |  252 +
 third_party/jpeg-xl/lib/jxl/data_parallel_test.cc  |   87 +
 third_party/jpeg-xl/lib/jxl/dct-inl.h              |  334 ++
 third_party/jpeg-xl/lib/jxl/dct_block-inl.h        |  108 +
 third_party/jpeg-xl/lib/jxl/dct_for_test.h         |   99 +
 third_party/jpeg-xl/lib/jxl/dct_scales.cc          |   31 +
 third_party/jpeg-xl/lib/jxl/dct_scales.h           |  379 ++
 third_party/jpeg-xl/lib/jxl/dct_test.cc            |  389 ++
 third_party/jpeg-xl/lib/jxl/dct_util.h             |   86 +
 third_party/jpeg-xl/lib/jxl/dec_ans.cc             |  374 ++
 third_party/jpeg-xl/lib/jxl/dec_ans.h              |  462 ++
 third_party/jpeg-xl/lib/jxl/dec_bit_reader.h       |  354 ++
 third_party/jpeg-xl/lib/jxl/dec_cache.cc           |  229 +
 third_party/jpeg-xl/lib/jxl/dec_cache.h            |  261 +
 third_party/jpeg-xl/lib/jxl/dec_context_map.cc     |   86 +
 third_party/jpeg-xl/lib/jxl/dec_context_map.h      |   30 +
 third_party/jpeg-xl/lib/jxl/dec_external_image.cc  |  493 ++
 third_party/jpeg-xl/lib/jxl/dec_external_image.h   |   46 +
 .../jpeg-xl/lib/jxl/dec_external_image_gbench.cc   |   56 +
 third_party/jpeg-xl/lib/jxl/dec_frame.cc           |  878 ++++
 third_party/jpeg-xl/lib/jxl/dec_frame.h            |  329 ++
 third_party/jpeg-xl/lib/jxl/dec_group.cc           |  801 +++
 third_party/jpeg-xl/lib/jxl/dec_group.h            |   49 +
 third_party/jpeg-xl/lib/jxl/dec_group_border.cc    |  184 +
 third_party/jpeg-xl/lib/jxl/dec_group_border.h     |   47 +
 third_party/jpeg-xl/lib/jxl/dec_huffman.cc         |  255 +
 third_party/jpeg-xl/lib/jxl/dec_huffman.h          |   32 +
 third_party/jpeg-xl/lib/jxl/dec_modular.cc         |  774 +++
 third_party/jpeg-xl/lib/jxl/dec_modular.h          |  140 +
 third_party/jpeg-xl/lib/jxl/dec_noise.cc           |  131 +
 third_party/jpeg-xl/lib/jxl/dec_noise.h            |   32 +
 .../jpeg-xl/lib/jxl/dec_patch_dictionary.cc        |  347 ++
 third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h |  151 +
 third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h |  234 +
 third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h   |  853 +++
 .../jpeg-xl/lib/jxl/dec_transforms_testonly.cc     |   41 +
 .../jpeg-xl/lib/jxl/dec_transforms_testonly.h      |   32 +
 third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h          |  346 ++
 third_party/jpeg-xl/lib/jxl/dec_xyb.cc             |  329 ++
 third_party/jpeg-xl/lib/jxl/dec_xyb.h              |   89 +
 third_party/jpeg-xl/lib/jxl/decode.cc              | 2809 ++++++++++
 third_party/jpeg-xl/lib/jxl/decode_test.cc         | 5507 ++++++++++++++++++++
 third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc      |  169 +
 third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h       |  217 +
 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc     | 1168 +++++
 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h      |   74 +
 .../jpeg-xl/lib/jxl/enc_adaptive_quantization.cc   | 1145 ++++
 .../jpeg-xl/lib/jxl/enc_adaptive_quantization.h    |   66 +
 third_party/jpeg-xl/lib/jxl/enc_ans.cc             | 1688 ++++++
 third_party/jpeg-xl/lib/jxl/enc_ans.h              |  143 +
 third_party/jpeg-xl/lib/jxl/enc_ans_params.h       |   76 +
 .../jpeg-xl/lib/jxl/enc_ar_control_field.cc        |  325 ++
 third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h |   49 +
 third_party/jpeg-xl/lib/jxl/enc_aux_out.cc         |  205 +
 third_party/jpeg-xl/lib/jxl/enc_aux_out.h          |  163 +
 third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc      |  201 +
 third_party/jpeg-xl/lib/jxl/enc_bit_writer.h       |  129 +
 .../jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc  |   99 +
 .../jpeg-xl/lib/jxl/enc_butteraugli_comparator.h   |   59 +
 .../jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc       |  211 +
 .../jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h        |   25 +
 third_party/jpeg-xl/lib/jxl/enc_cache.cc           |  218 +
 third_party/jpeg-xl/lib/jxl/enc_cache.h            |   93 +
 .../jpeg-xl/lib/jxl/enc_chroma_from_luma.cc        |  409 ++
 third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h |   68 +
 third_party/jpeg-xl/lib/jxl/enc_cluster.cc         |  295 ++
 third_party/jpeg-xl/lib/jxl/enc_cluster.h          |   63 +
 third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc     |  291 ++
 third_party/jpeg-xl/lib/jxl/enc_coeff_order.h      |   54 +
 .../jpeg-xl/lib/jxl/enc_color_management.cc        | 1293 +++++
 third_party/jpeg-xl/lib/jxl/enc_color_management.h |   90 +
 third_party/jpeg-xl/lib/jxl/enc_comparator.cc      |  130 +
 third_party/jpeg-xl/lib/jxl/enc_comparator.h       |   52 +
 third_party/jpeg-xl/lib/jxl/enc_context_map.cc     |  141 +
 third_party/jpeg-xl/lib/jxl/enc_context_map.h      |   35 +
 third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc     |  626 +++
 third_party/jpeg-xl/lib/jxl/enc_detect_dots.h      |   67 +
 third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc  |   71 +
 third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h   |   34 +
 third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc   |  274 +
 third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h    |   46 +
 third_party/jpeg-xl/lib/jxl/enc_external_image.cc  |  183 +
 third_party/jpeg-xl/lib/jxl/enc_external_image.h   |   45 +
 .../jpeg-xl/lib/jxl/enc_external_image_gbench.cc   |   46 +
 .../jpeg-xl/lib/jxl/enc_external_image_test.cc     |   79 +
 third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc   | 3860 ++++++++++++++
 third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h    |   72 +
 third_party/jpeg-xl/lib/jxl/enc_fields.cc          |  239 +
 third_party/jpeg-xl/lib/jxl/enc_fields.h           |   37 +
 third_party/jpeg-xl/lib/jxl/enc_file.cc            |  141 +
 third_party/jpeg-xl/lib/jxl/enc_file.h             |   31 +
 third_party/jpeg-xl/lib/jxl/enc_frame.cc           | 1745 +++++++
 third_party/jpeg-xl/lib/jxl/enc_frame.h            |   78 +
 third_party/jpeg-xl/lib/jxl/enc_gaborish.cc        |   61 +
 third_party/jpeg-xl/lib/jxl/enc_gaborish.h         |   26 +
 third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc   |   77 +
 third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h    |   36 +
 third_party/jpeg-xl/lib/jxl/enc_group.cc           |  426 ++
 third_party/jpeg-xl/lib/jxl/enc_group.h            |   32 +
 third_party/jpeg-xl/lib/jxl/enc_heuristics.cc      |  948 ++++
 third_party/jpeg-xl/lib/jxl/enc_heuristics.h       |   81 +
 third_party/jpeg-xl/lib/jxl/enc_huffman.cc         |  214 +
 third_party/jpeg-xl/lib/jxl/enc_huffman.h          |   22 +
 third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc    |  328 ++
 third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h     |   52 +
 third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc       |  406 ++
 third_party/jpeg-xl/lib/jxl/enc_icc_codec.h        |   33 +
 third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc    |  154 +
 third_party/jpeg-xl/lib/jxl/enc_image_bundle.h     |   25 +
 third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h        |   54 +
 third_party/jpeg-xl/lib/jxl/enc_linalg.cc          |   52 +
 third_party/jpeg-xl/lib/jxl/enc_linalg.h           |   24 +
 third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc     |  118 +
 third_party/jpeg-xl/lib/jxl/enc_modular.cc         | 1762 +++++++
 third_party/jpeg-xl/lib/jxl/enc_modular.h          |   92 +
 third_party/jpeg-xl/lib/jxl/enc_noise.cc           |  374 ++
 third_party/jpeg-xl/lib/jxl/enc_noise.h            |   34 +
 third_party/jpeg-xl/lib/jxl/enc_optimize.cc        |  163 +
 third_party/jpeg-xl/lib/jxl/enc_optimize.h         |  218 +
 third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc   |  109 +
 third_party/jpeg-xl/lib/jxl/enc_params.h           |  225 +
 .../jpeg-xl/lib/jxl/enc_patch_dictionary.cc        |  813 +++
 third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h |  109 +
 third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc    |   89 +
 third_party/jpeg-xl/lib/jxl/enc_photon_noise.h     |   22 +
 .../jpeg-xl/lib/jxl/enc_photon_noise_test.cc       |   51 +
 .../jpeg-xl/lib/jxl/enc_progressive_split.cc       |   82 +
 .../jpeg-xl/lib/jxl/enc_progressive_split.h        |  131 +
 third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc   |  214 +
 third_party/jpeg-xl/lib/jxl/enc_quant_weights.h    |   37 +
 third_party/jpeg-xl/lib/jxl/enc_splines.cc         |   98 +
 third_party/jpeg-xl/lib/jxl/enc_splines.h          |   38 +
 third_party/jpeg-xl/lib/jxl/enc_toc.cc             |   45 +
 third_party/jpeg-xl/lib/jxl/enc_toc.h              |   31 +
 third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h   |  827 +++
 third_party/jpeg-xl/lib/jxl/enc_transforms.cc      |   41 +
 third_party/jpeg-xl/lib/jxl/enc_transforms.h       |   32 +
 third_party/jpeg-xl/lib/jxl/enc_xyb.cc             |  520 ++
 third_party/jpeg-xl/lib/jxl/enc_xyb.h              |   56 +
 third_party/jpeg-xl/lib/jxl/encode.cc              | 2128 ++++++++
 third_party/jpeg-xl/lib/jxl/encode_internal.h      |  275 +
 third_party/jpeg-xl/lib/jxl/encode_test.cc         | 1405 +++++
 third_party/jpeg-xl/lib/jxl/entropy_coder.cc       |   70 +
 third_party/jpeg-xl/lib/jxl/entropy_coder.h        |   45 +
 third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc  |   68 +
 third_party/jpeg-xl/lib/jxl/epf.cc                 |  146 +
 third_party/jpeg-xl/lib/jxl/epf.h                  |   33 +
 third_party/jpeg-xl/lib/jxl/exif.h                 |   87 +
 .../lib/jxl/fake_parallel_runner_testonly.h        |   79 +
 third_party/jpeg-xl/lib/jxl/fast_dct-inl.h         |  238 +
 third_party/jpeg-xl/lib/jxl/fast_dct.cc            |   37 +
 third_party/jpeg-xl/lib/jxl/fast_dct.h             |    9 +
 third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h      | 2137 ++++++++
 third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h       |  180 +
 third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h      | 4811 +++++++++++++++++
 third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h       |  419 ++
 third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h       |  985 ++++
 third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h        |   80 +
 third_party/jpeg-xl/lib/jxl/fast_dct_test.cc       |  378 ++
 third_party/jpeg-xl/lib/jxl/fast_math-inl.h        |  236 +
 third_party/jpeg-xl/lib/jxl/fast_math_test.cc      |  288 +
 third_party/jpeg-xl/lib/jxl/field_encodings.h      |  134 +
 third_party/jpeg-xl/lib/jxl/fields.cc              |  642 +++
 third_party/jpeg-xl/lib/jxl/fields.h               |  377 ++
 third_party/jpeg-xl/lib/jxl/fields_test.cc         |  429 ++
 third_party/jpeg-xl/lib/jxl/frame_header.cc        |  494 ++
 third_party/jpeg-xl/lib/jxl/frame_header.h         |  503 ++
 third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc  |   37 +
 third_party/jpeg-xl/lib/jxl/gauss_blur.cc          |  623 +++
 third_party/jpeg-xl/lib/jxl/gauss_blur.h           |   94 +
 third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc   |  126 +
 third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc     |  453 ++
 third_party/jpeg-xl/lib/jxl/gradient_test.cc       |  207 +
 third_party/jpeg-xl/lib/jxl/headers.cc             |  194 +
 third_party/jpeg-xl/lib/jxl/headers.h              |   97 +
 third_party/jpeg-xl/lib/jxl/huffman_table.cc       |  161 +
 third_party/jpeg-xl/lib/jxl/huffman_table.h        |   28 +
 third_party/jpeg-xl/lib/jxl/iaca_test.cc           |   21 +
 third_party/jpeg-xl/lib/jxl/icc_codec.cc           |  389 ++
 third_party/jpeg-xl/lib/jxl/icc_codec.h            |   57 +
 third_party/jpeg-xl/lib/jxl/icc_codec_common.cc    |  190 +
 third_party/jpeg-xl/lib/jxl/icc_codec_common.h     |  106 +
 third_party/jpeg-xl/lib/jxl/icc_codec_test.cc      |  207 +
 third_party/jpeg-xl/lib/jxl/image.cc               |  251 +
 third_party/jpeg-xl/lib/jxl/image.h                |  497 ++
 third_party/jpeg-xl/lib/jxl/image_bundle.cc        |  125 +
 third_party/jpeg-xl/lib/jxl/image_bundle.h         |  254 +
 third_party/jpeg-xl/lib/jxl/image_bundle_test.cc   |   37 +
 third_party/jpeg-xl/lib/jxl/image_metadata.cc      |  472 ++
 third_party/jpeg-xl/lib/jxl/image_metadata.h       |  425 ++
 third_party/jpeg-xl/lib/jxl/image_ops.h            |  805 +++
 third_party/jpeg-xl/lib/jxl/image_ops_test.cc      |  164 +
 third_party/jpeg-xl/lib/jxl/image_test_utils.h     |  257 +
 third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h      |   90 +
 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc  |  145 +
 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h   |   19 +
 .../jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc   | 1050 ++++
 .../jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h    |   35 +
 .../jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h   |   72 +
 .../lib/jxl/jpeg/dec_jpeg_serialization_state.h    |   96 +
 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc  |  384 ++
 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h   |   31 +
 .../jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc   | 1053 ++++
 .../jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h    |   36 +
 .../lib/jxl/jpeg/enc_jpeg_huffman_decode.cc        |  103 +
 .../jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h |   41 +
 third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc      |  451 ++
 third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h       |  216 +
 third_party/jpeg-xl/lib/jxl/jxl.syms               |    5 +
 third_party/jpeg-xl/lib/jxl/jxl.version            |   17 +
 third_party/jpeg-xl/lib/jxl/jxl_inspection.h       |   22 +
 third_party/jpeg-xl/lib/jxl/jxl_osx.syms           |    1 +
 third_party/jpeg-xl/lib/jxl/jxl_test.cc            | 1537 ++++++
 third_party/jpeg-xl/lib/jxl/lehmer_code.h          |  102 +
 third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc    |   98 +
 third_party/jpeg-xl/lib/jxl/libjxl.pc.in           |   13 +
 third_party/jpeg-xl/lib/jxl/loop_filter.cc         |   98 +
 third_party/jpeg-xl/lib/jxl/loop_filter.h          |   76 +
 third_party/jpeg-xl/lib/jxl/luminance.cc           |   26 +
 third_party/jpeg-xl/lib/jxl/luminance.h            |   22 +
 third_party/jpeg-xl/lib/jxl/matrix_ops.h           |   84 +
 .../jpeg-xl/lib/jxl/memory_manager_internal.cc     |   18 +
 .../jpeg-xl/lib/jxl/memory_manager_internal.h      |  101 +
 .../lib/jxl/modular/encoding/context_predict.h     |  626 +++
 .../jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc     |  107 +
 .../jpeg-xl/lib/jxl/modular/encoding/dec_ma.h      |   66 +
 .../lib/jxl/modular/encoding/enc_debug_tree.cc     |  124 +
 .../lib/jxl/modular/encoding/enc_debug_tree.h      |   27 +
 .../lib/jxl/modular/encoding/enc_encoding.cc       |  562 ++
 .../lib/jxl/modular/encoding/enc_encoding.h        |   47 +
 .../jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc     | 1023 ++++
 .../jpeg-xl/lib/jxl/modular/encoding/enc_ma.h      |  157 +
 .../jpeg-xl/lib/jxl/modular/encoding/encoding.cc   |  622 +++
 .../jpeg-xl/lib/jxl/modular/encoding/encoding.h    |  135 +
 .../jpeg-xl/lib/jxl/modular/encoding/ma_common.h   |   28 +
 .../jpeg-xl/lib/jxl/modular/modular_image.cc       |   77 +
 .../jpeg-xl/lib/jxl/modular/modular_image.h        |  118 +
 third_party/jpeg-xl/lib/jxl/modular/options.h      |  117 +
 .../lib/jxl/modular/transform/enc_palette.cc       |  606 +++
 .../lib/jxl/modular/transform/enc_palette.h        |   22 +
 .../jpeg-xl/lib/jxl/modular/transform/enc_rct.cc   |   73 +
 .../jpeg-xl/lib/jxl/modular/transform/enc_rct.h    |   17 +
 .../lib/jxl/modular/transform/enc_squeeze.cc       |  141 +
 .../lib/jxl/modular/transform/enc_squeeze.h        |   20 +
 .../lib/jxl/modular/transform/enc_transform.cc     |   46 +
 .../lib/jxl/modular/transform/enc_transform.h      |   22 +
 .../jpeg-xl/lib/jxl/modular/transform/palette.cc   |  176 +
 .../jpeg-xl/lib/jxl/modular/transform/palette.h    |  129 +
 .../jpeg-xl/lib/jxl/modular/transform/rct.cc       |  153 +
 .../jpeg-xl/lib/jxl/modular/transform/rct.h        |   20 +
 .../jpeg-xl/lib/jxl/modular/transform/squeeze.cc   |  478 ++
 .../jpeg-xl/lib/jxl/modular/transform/squeeze.h    |   90 +
 .../jpeg-xl/lib/jxl/modular/transform/transform.cc |   98 +
 .../jpeg-xl/lib/jxl/modular/transform/transform.h  |  148 +
 third_party/jpeg-xl/lib/jxl/modular_test.cc        |  541 ++
 third_party/jpeg-xl/lib/jxl/noise.h                |   60 +
 third_party/jpeg-xl/lib/jxl/opsin_image_test.cc    |  123 +
 third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc  |   57 +
 third_party/jpeg-xl/lib/jxl/opsin_params.cc        |   44 +
 third_party/jpeg-xl/lib/jxl/opsin_params.h         |   86 +
 third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc   |  126 +
 third_party/jpeg-xl/lib/jxl/passes_state.cc        |   70 +
 third_party/jpeg-xl/lib/jxl/passes_state.h         |  133 +
 third_party/jpeg-xl/lib/jxl/passes_test.cc         |  402 ++
 .../jpeg-xl/lib/jxl/patch_dictionary_internal.h    |   31 +
 .../jpeg-xl/lib/jxl/patch_dictionary_test.cc       |   58 +
 third_party/jpeg-xl/lib/jxl/preview_test.cc        |   68 +
 third_party/jpeg-xl/lib/jxl/quant_weights.cc       | 1239 +++++
 third_party/jpeg-xl/lib/jxl/quant_weights.h        |  448 ++
 third_party/jpeg-xl/lib/jxl/quant_weights_test.cc  |  240 +
 third_party/jpeg-xl/lib/jxl/quantizer-inl.h        |   74 +
 third_party/jpeg-xl/lib/jxl/quantizer.cc           |  156 +
 third_party/jpeg-xl/lib/jxl/quantizer.h            |  182 +
 third_party/jpeg-xl/lib/jxl/quantizer_test.cc      |   81 +
 .../jpeg-xl/lib/jxl/rational_polynomial-inl.h      |   98 +
 .../jpeg-xl/lib/jxl/rational_polynomial_test.cc    |  238 +
 .../render_pipeline/low_memory_render_pipeline.cc  |  865 +++
 .../render_pipeline/low_memory_render_pipeline.h   |  111 +
 .../lib/jxl/render_pipeline/render_pipeline.cc     |  132 +
 .../lib/jxl/render_pipeline/render_pipeline.h      |  139 +
 .../jxl/render_pipeline/render_pipeline_stage.h    |  171 +
 .../jxl/render_pipeline/render_pipeline_test.cc    |  562 ++
 .../jxl/render_pipeline/simple_render_pipeline.cc  |  266 +
 .../jxl/render_pipeline/simple_render_pipeline.h   |   37 +
 .../lib/jxl/render_pipeline/stage_blending.cc      |  247 +
 .../lib/jxl/render_pipeline/stage_blending.h       |   24 +
 .../jxl/render_pipeline/stage_chroma_upsampling.cc |  129 +
 .../jxl/render_pipeline/stage_chroma_upsampling.h  |   27 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc   |  524 ++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_epf.h    |   31 +
 .../lib/jxl/render_pipeline/stage_from_linear.cc   |  191 +
 .../lib/jxl/render_pipeline/stage_from_linear.h    |   20 +
 .../lib/jxl/render_pipeline/stage_gaborish.cc      |  122 +
 .../lib/jxl/render_pipeline/stage_gaborish.h       |   25 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc |  311 ++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_noise.h  |   32 +
 .../lib/jxl/render_pipeline/stage_patches.cc       |   48 +
 .../lib/jxl/render_pipeline/stage_patches.h        |   22 +
 .../lib/jxl/render_pipeline/stage_splines.cc       |   63 +
 .../lib/jxl/render_pipeline/stage_splines.h        |   21 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc  |   52 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_spot.h   |   21 +
 .../lib/jxl/render_pipeline/stage_to_linear.cc     |  202 +
 .../lib/jxl/render_pipeline/stage_to_linear.h      |   21 +
 .../lib/jxl/render_pipeline/stage_tone_mapping.cc  |  151 +
 .../lib/jxl/render_pipeline/stage_tone_mapping.h   |   37 +
 .../lib/jxl/render_pipeline/stage_upsampling.cc    |  187 +
 .../lib/jxl/render_pipeline/stage_upsampling.h     |   26 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_write.cc |  601 +++
 .../jpeg-xl/lib/jxl/render_pipeline/stage_write.h  |   31 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc   |  176 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h    |   26 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc |   85 +
 .../jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h  |   25 +
 .../render_pipeline/test_render_pipeline_stages.h  |  101 +
 third_party/jpeg-xl/lib/jxl/roundtrip_test.cc      |  839 +++
 third_party/jpeg-xl/lib/jxl/sanitizers.h           |  242 +
 third_party/jpeg-xl/lib/jxl/simd_util-inl.h        |  349 ++
 third_party/jpeg-xl/lib/jxl/simd_util_test.cc      |   84 +
 third_party/jpeg-xl/lib/jxl/speed_tier_test.cc     |  108 +
 third_party/jpeg-xl/lib/jxl/splines.cc             |  694 +++
 third_party/jpeg-xl/lib/jxl/splines.h              |  148 +
 third_party/jpeg-xl/lib/jxl/splines_gbench.cc      |   52 +
 third_party/jpeg-xl/lib/jxl/splines_test.cc        |  348 ++
 third_party/jpeg-xl/lib/jxl/test_image.cc          |  453 ++
 third_party/jpeg-xl/lib/jxl/test_image.h           |   94 +
 third_party/jpeg-xl/lib/jxl/test_utils.cc          |  673 +++
 third_party/jpeg-xl/lib/jxl/test_utils.h           |  175 +
 third_party/jpeg-xl/lib/jxl/testing.h              |   73 +
 third_party/jpeg-xl/lib/jxl/tf_gbench.cc           |  143 +
 third_party/jpeg-xl/lib/jxl/toc.cc                 |  105 +
 third_party/jpeg-xl/lib/jxl/toc.h                  |   55 +
 third_party/jpeg-xl/lib/jxl/toc_test.cc            |   92 +
 .../jpeg-xl/lib/jxl/transfer_functions-inl.h       |  413 ++
 third_party/jpeg-xl/lib/jxl/transpose-inl.h        |  203 +
 third_party/jpeg-xl/lib/jxl/version.h.in           |   39 +
 third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h  |  103 +
 .../jpeg-xl/lib/jxl/xorshift128plus_test.cc        |  378 ++
 410 files changed, 109850 insertions(+)
 create mode 100644 third_party/jpeg-xl/lib/jxl/ac_context.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/ac_strategy.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/ac_strategy.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/alpha.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/alpha.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/alpha_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/ans_common.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/ans_common.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/ans_common_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/ans_params.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/ans_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/arch_macros.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/bits.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/byte_order.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/data_parallel.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/file_io.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/float.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/iaca.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/os_macros.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/override.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/printf_macros.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/profiler.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/profiler.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/random.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/random.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/scope_guard.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/span.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/status.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/bit_reader_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/bits_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/blending.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/blending.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/blending_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/box_content_decoder.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/box_content_decoder.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/byte_order_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/chroma_from_luma.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/codec_in_out.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/coeff_order_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/color_management.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/color_management.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/color_management_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/common.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/compressed_dc.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/compressed_dc.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_slow.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/convolve_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/data_parallel_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct_block-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct_for_test.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct_scales.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct_scales.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dct_util.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_ans.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_ans.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_bit_reader.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_cache.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_cache.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_context_map.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_context_map.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_external_image.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_external_image.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_frame.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_frame.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group_border.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_group_border.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_huffman.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_huffman.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_modular.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_modular.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_noise.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_noise.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_xyb.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/dec_xyb.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/decode.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/decode_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ans.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ans.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ans_params.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_aux_out.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_bit_writer.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cache.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cache.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cluster.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_cluster.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_coeff_order.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_color_management.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_color_management.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_comparator.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_comparator.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_context_map.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_context_map.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_detect_dots.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fields.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_fields.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_file.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_file.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_frame.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_frame.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gaborish.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_group.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_group.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_heuristics.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_icc_codec.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_image_bundle.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_linalg.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_linalg.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_modular.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_modular.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_noise.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_noise.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_optimize.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_optimize.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_params.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_photon_noise.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_progressive_split.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_quant_weights.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_splines.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_splines.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_toc.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_toc.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_transforms.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_transforms.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_xyb.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/enc_xyb.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/encode.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/encode_internal.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/encode_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/entropy_coder.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/entropy_coder.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/epf.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/epf.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/exif.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_dct_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_math-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fast_math_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/field_encodings.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fields.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/fields.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/fields_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/frame_header.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/frame_header.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/gradient_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/headers.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/headers.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/huffman_table.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/huffman_table.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/iaca_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec_common.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/icc_codec_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/image.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/image.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_bundle.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_bundle.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_bundle_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_metadata.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_metadata.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_ops.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_ops_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/image_test_utils.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jxl.syms
 create mode 100644 third_party/jpeg-xl/lib/jxl/jxl.version
 create mode 100644 third_party/jpeg-xl/lib/jxl/jxl_inspection.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/jxl_osx.syms
 create mode 100644 third_party/jpeg-xl/lib/jxl/jxl_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/lehmer_code.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/libjxl.pc.in
 create mode 100644 third_party/jpeg-xl/lib/jxl/loop_filter.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/loop_filter.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/luminance.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/luminance.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/matrix_ops.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/modular_image.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/modular_image.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/options.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/palette.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/rct.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular/transform/transform.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/modular_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/noise.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_params.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/opsin_params.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/passes_state.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/passes_state.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/passes_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/preview_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/quant_weights.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/quant_weights.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/quant_weights_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/quantizer_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/roundtrip_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/sanitizers.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/simd_util-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/simd_util_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/speed_tier_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/splines.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/splines.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/splines_gbench.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/splines_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/test_image.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/test_image.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/test_utils.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/test_utils.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/testing.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/tf_gbench.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/toc.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/toc.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/toc_test.cc
 create mode 100644 third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/transpose-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/version.h.in
 create mode 100644 third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h
 create mode 100644 third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc

(limited to 'third_party/jpeg-xl/lib/jxl')

diff --git a/third_party/jpeg-xl/lib/jxl/ac_context.h b/third_party/jpeg-xl/lib/jxl/ac_context.h
new file mode 100644
index 0000000000..a2b9e046d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_context.h
@@ -0,0 +1,149 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AC_CONTEXT_H_
+#define LIB_JXL_AC_CONTEXT_H_
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+
+namespace jxl {
+
+// Block context used for scanning order, number of non-zeros, AC coefficients.
+// Equal to the channel.
+constexpr uint32_t kDCTOrderContextStart = 0;
+
+// Number of buckets used per block context for the number of nonzeros.
+// BlockCtxMap::NonZeroContext clamps the count to 64 and maps it to a bucket
+// in [0, 36]: counts below 8 map to themselves, the rest to 4 + count / 2.
+constexpr uint32_t kNonZeroBuckets = 37;
+
+static const uint16_t kCoeffFreqContext[64] = {  // indexed by rescaled k
+    0xBAD, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+    15,    15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+    23,    23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26,
+    27,    27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30,
+};  // 0xBAD: presumably unused, ZeroDensityContext asserts k > 0
+
+static const uint16_t kCoeffNumNonzeroContext[64] = {  // by nonzeros_left
+    0xBAD, 0,   31,  62,  62,  93,  93,  93,  93,  123, 123, 123, 123,
+    152,   152, 152, 152, 152, 152, 152, 152, 180, 180, 180, 180, 180,
+    180,   180, 180, 180, 180, 180, 180, 206, 206, 206, 206, 206, 206,
+    206,   206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+    206,   206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+};  // 0xBAD: presumably unused, the caller asserts nonzeros_left > 0
+
+// Supremum of ZeroDensityContext() + 1 when rescaled nonzeros_left + k < 64.
+constexpr int kZeroDensityContextCount = 458;
+// Supremum of ZeroDensityContext() + 1 for any nonzeros_left, k in [1, 63].
+constexpr int kZeroDensityContextLimit = 474;
+
+/* Maps (|nonzeros_left|, |k|, |prev|) to a pre-clustered entropy context.
+ *
+ * Ideally, each combination of |nonzeros_left| and |k| should go to its own
+ * bucket, but that implies (64 * 63 / 2) == 2016 buckets. If there is another
+ * dimension (e.g. block context), then the number of primary clusters becomes
+ * too big.
+ *
+ * To solve this problem, |nonzeros_left| and |k| values are clustered. It is
+ * known that their sum is at most 64; consequently, the total number of
+ * buckets is at most A(64) * B(64).
+ */
+// TODO(user): investigate why disabling pre-clustering makes entropy code
+// less dense. Perhaps we would need to add HQ clustering algorithm that would
+// be able to squeeze better by spending more CPU cycles.
+static JXL_INLINE size_t ZeroDensityContext(size_t nonzeros_left, size_t k,
+                                            size_t covered_blocks,
+                                            size_t log2_covered_blocks,
+                                            size_t prev) {
+  JXL_DASSERT((1u << log2_covered_blocks) == covered_blocks);
+  nonzeros_left = (nonzeros_left + covered_blocks - 1) >> log2_covered_blocks;
+  k >>= log2_covered_blocks;  // rounds down; nonzeros_left is rounded up
+  JXL_DASSERT(k > 0);
+  JXL_DASSERT(k < 64);
+  JXL_DASSERT(nonzeros_left > 0);
+  // Asserting nonzeros_left + k < 65 here causes crashes in debug mode with
+  // invalid input, since the (hot) decoding loop does not check this condition.
+  // As no out-of-bound memory reads are issued even if that condition is
+  // broken, we check this simpler condition which holds anyway. The decoder
+  // will still mark a file in which that condition happens as not valid at the
+  // end of the decoding loop, as `nzeros` will not be `0`.
+  JXL_DASSERT(nonzeros_left < 64);
+  return (kCoeffNumNonzeroContext[nonzeros_left] + kCoeffFreqContext[k]) * 2 +
+         prev;  // NOTE(review): |prev| is presumably 0 or 1 - confirm
+}
+
+struct BlockCtxMap {  // (dc_idx, qf, ord, c) -> block context, via ctx_map
+  std::vector<int> dc_thresholds[3];
+  std::vector<uint32_t> qf_thresholds;
+  std::vector<uint8_t> ctx_map;
+  size_t num_ctxs, num_dc_ctxs;
+
+  static constexpr uint8_t kDefaultCtxMap[] = {
+      // Default ctx map clusters all the large transforms together.
+      0, 1, 2, 2, 3,  3,  4,  5,  6,  6,  6,  6,  6,   // c == 1
+      7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,  // c == 0
+      7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,  // c >= 2
+  };
+  static_assert(3 * kNumOrders ==
+                    sizeof(kDefaultCtxMap) / sizeof *kDefaultCtxMap,
+                "Update default context map");
+
+  size_t Context(int dc_idx, uint32_t qf, size_t ord, size_t c) const {
+    size_t qf_idx = 0;  // number of thresholds strictly below qf
+    for (uint32_t t : qf_thresholds) {
+      if (qf > t) qf_idx++;
+    }
+    size_t idx = c < 2 ? c ^ 1 : 2;  // 0 -> 1, 1 -> 0, anything else -> 2
+    idx = idx * kNumOrders + ord;
+    idx = idx * (qf_thresholds.size() + 1) + qf_idx;
+    idx = idx * num_dc_ctxs + dc_idx;
+    return ctx_map[idx];  // not bounds-checked
+  }
+  // Offset of the zero-density contexts of |block_ctx|; they are placed after
+  // the num_ctxs * kNonZeroBuckets contexts for the number of non-zeros.
+  constexpr uint32_t ZeroDensityContextsOffset(uint32_t block_ctx) const {
+    return num_ctxs * kNonZeroBuckets + kZeroDensityContextCount * block_ctx;
+  }
+
+  // Context map for AC coefficients consists of 2 blocks:
+  //  |num_ctxs x                : context for number of non-zeros in the block
+  //   kNonZeroBuckets|            computed from block context and predicted
+  //                               value (based on top and left values)
+  //  |num_ctxs x                : context for AC coefficient symbols,
+  //   kZeroDensityContextCount|   computed from block context,
+  //                               number of non-zeros left and
+  //                               index in scan order
+  constexpr uint32_t NumACContexts() const {
+    return num_ctxs * (kNonZeroBuckets + kZeroDensityContextCount);
+  }
+
+  // Non-zero context is based on number of non-zeros and block context.
+  // For better clustering, contexts with same number of non-zeros are grouped.
+  inline uint32_t NonZeroContext(uint32_t non_zeros, uint32_t block_ctx) const {
+    uint32_t ctx;
+    if (non_zeros >= 64) non_zeros = 64;  // clamp, so that ctx <= 36
+    if (non_zeros < 8) {
+      ctx = non_zeros;
+    } else {
+      ctx = 4 + non_zeros / 2;  // 8..64 -> 8..36
+    }
+    return ctx * num_ctxs + block_ctx;
+  }
+
+  BlockCtxMap() {
+    ctx_map.assign(std::begin(kDefaultCtxMap), std::end(kDefaultCtxMap));
+    num_ctxs = *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;  // 15
+    num_dc_ctxs = 1;
+  }
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AC_CONTEXT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy.cc
new file mode 100644
index 0000000000..ada3bcb6f5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.cc
@@ -0,0 +1,108 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ac_strategy.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <numeric>  // iota
+#include <type_traits>
+#include <utility>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+// Tries to generalize zig-zag order to non-square blocks. In a square block,
+// the frequency along the (i + j == const) diagonals is roughly the same. For
+// historical reasons, consecutive diagonals are traversed in alternating
+// directions - the so-called "zig-zag" (or "snake") order.
+template <bool is_lut>  // true: out[pos] = rank; false: out[rank] = pos
+static void CoeffOrderAndLut(AcStrategy acs, coeff_order_t* out) {
+  size_t cx = acs.covered_blocks_x();
+  size_t cy = acs.covered_blocks_y();
+  CoefficientLayout(&cy, &cx);
+
+  // CoefficientLayout ensures cx >= cy.
+  // We compute the zigzag order for a cx x cx block, then discard all the
+  // lines that are not multiple of the ratio between cx and cy.
+  size_t xs = cx / cy;  // ratio between cx and cy
+  size_t xsm = xs - 1;  // used as a bit mask (xs presumably a power of 2)
+  size_t xss = CeilLog2Nonzero(xs);  // shift amount replacing division by xs
+  // First half of the block: diagonals i = 0 .. cx * kBlockDim - 1.
+  size_t cur = cx * cy;  // first cx * cy ranks go to x < cx && y < cy
+  for (size_t i = 0; i < cx * kBlockDim; i++) {
+    for (size_t j = 0; j <= i; j++) {
+      size_t x = j;
+      size_t y = i - j;
+      if (i % 2) std::swap(x, y);  // alternate the direction of diagonals
+      if ((y & xsm) != 0) continue;  // discard lines not a multiple of xs
+      y >>= xss;
+      size_t val = 0;
+      if (x < cx && y < cy) {
+        val = y * cx + x;
+      } else {
+        val = cur++;
+      }
+      if (is_lut) {
+        out[y * cx * kBlockDim + x] = val;
+      } else {
+        out[val] = y * cx * kBlockDim + x;
+      }
+    }
+  }
+  // Second half: the remaining diagonals, towards the opposite corner.
+  for (size_t ip = cx * kBlockDim - 1; ip > 0; ip--) {
+    size_t i = ip - 1;  // i runs from cx * kBlockDim - 2 down to 0
+    for (size_t j = 0; j <= i; j++) {
+      size_t x = cx * kBlockDim - 1 - (i - j);
+      size_t y = cx * kBlockDim - 1 - j;
+      if (i % 2) std::swap(x, y);
+      if ((y & xsm) != 0) continue;
+      y >>= xss;
+      size_t val = cur++;
+      if (is_lut) {
+        out[y * cx * kBlockDim + x] = val;
+      } else {
+        out[val] = y * cx * kBlockDim + x;
+      }
+    }
+  }
+}
+
+void AcStrategy::ComputeNaturalCoeffOrder(coeff_order_t* order) const {
+  CoeffOrderAndLut</*is_lut=*/false>(*this, order);  // rank -> position
+}
+void AcStrategy::ComputeNaturalCoeffOrderLut(coeff_order_t* lut) const {
+  CoeffOrderAndLut</*is_lut=*/true>(*this, lut);  // position -> rank
+}
+
+// Out-of-class definitions of the static members; needed before C++17.
+constexpr size_t AcStrategy::kMaxCoeffBlocks;
+constexpr size_t AcStrategy::kMaxBlockDim;
+constexpr size_t AcStrategy::kMaxCoeffArea;
+
+AcStrategyImage::AcStrategyImage(size_t xsize, size_t ysize)
+    : layers_(xsize, ysize) {
+  row_ = layers_.Row(0);  // cached pointer to row 0 of layers_
+  stride_ = layers_.PixelsPerRow();  // presumably the row pitch, in pixels
+}
+
+size_t AcStrategyImage::CountBlocks(AcStrategy::Type type) const {
+  size_t ret = 0;  // number of entries equal to (type << 1) | 1
+  for (size_t y = 0; y < layers_.ysize(); y++) {
+    const uint8_t* JXL_RESTRICT row = layers_.ConstRow(y);
+    for (size_t x = 0; x < layers_.xsize(); x++) {
+      if (row[x] == ((static_cast<uint8_t>(type) << 1) | 1)) ret++;
+    }
+  }
+  return ret;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.h b/third_party/jpeg-xl/lib/jxl/ac_strategy.h
new file mode 100644
index 0000000000..7d21167e6e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.h
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AC_STRATEGY_H_
+#define LIB_JXL_AC_STRATEGY_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/base.h>  // kMaxVectorSize
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+
+// Defines the different kinds of transforms, and heuristics to choose between
+// them.
+// `AcStrategy` represents what transform should be used, and which sub-block of
+// that transform we are currently in. Note that DCT4x4 is applied on all four
+// 4x4 sub-blocks of an 8x8 block.
+// `AcStrategyImage` defines which strategy should be used for each 8x8 block
+// of the image. The upper 7 bits of each byte hold the strategy type; the
+// lowest bit is set only for the first (top-left) block of that strategy.
+
+namespace jxl {
+
+class AcStrategy {
+ public:
+  // Extremal values for the number of blocks/coefficients of a single strategy.
+  static constexpr size_t kMaxCoeffBlocks = 32;
+  static constexpr size_t kMaxBlockDim = kBlockDim * kMaxCoeffBlocks;
+  // Maximum number of coefficients in a block. Guaranteed to be a multiple of
+  // the vector size.
+  static constexpr size_t kMaxCoeffArea = kMaxBlockDim * kMaxBlockDim;
+  static_assert((kMaxCoeffArea * sizeof(float)) % hwy::kMaxVectorSize == 0,
+                "Coefficient area is not a multiple of vector size");
+
+  // Raw strategy types.
+  enum Type : uint32_t {
+    // Regular block size DCT
+    DCT = 0,
+    // Encode pixels without transforming
+    IDENTITY = 1,
+    // Use 2-by-2 DCT
+    DCT2X2 = 2,
+    // Use 4-by-4 DCT
+    DCT4X4 = 3,
+    // Use 16-by-16 DCT
+    DCT16X16 = 4,
+    // Use 32-by-32 DCT
+    DCT32X32 = 5,
+    // Use 16-by-8 DCT
+    DCT16X8 = 6,
+    // Use 8-by-16 DCT
+    DCT8X16 = 7,
+    // Use 32-by-8 DCT
+    DCT32X8 = 8,
+    // Use 8-by-32 DCT
+    DCT8X32 = 9,
+    // Use 32-by-16 DCT
+    DCT32X16 = 10,
+    // Use 16-by-32 DCT
+    DCT16X32 = 11,
+    // 4x8 and 8x4 DCT
+    DCT4X8 = 12,
+    DCT8X4 = 13,
+    // Corner-DCT.
+    AFV0 = 14,
+    AFV1 = 15,
+    AFV2 = 16,
+    AFV3 = 17,
+    // Larger DCTs
+    DCT64X64 = 18,
+    DCT64X32 = 19,
+    DCT32X64 = 20,
+    DCT128X128 = 21,
+    DCT128X64 = 22,
+    DCT64X128 = 23,
+    DCT256X256 = 24,
+    DCT256X128 = 25,
+    DCT128X256 = 26,
+    // Marker for num of valid strategies.
+    kNumValidStrategies
+  };
+
+  static constexpr uint32_t TypeBit(const Type type) {
+    return 1u << static_cast<uint32_t>(type);
+  }
+
+  // Returns true if this block is the first 8x8 block (i.e. top-left) of a
+  // possibly multi-block strategy.
+  JXL_INLINE bool IsFirstBlock() const { return is_first_; }
+
+  JXL_INLINE bool IsMultiblock() const {
+    constexpr uint32_t bits =
+        TypeBit(Type::DCT16X16) | TypeBit(Type::DCT32X32) |
+        TypeBit(Type::DCT16X8) | TypeBit(Type::DCT8X16) |
+        TypeBit(Type::DCT32X8) | TypeBit(Type::DCT8X32) |
+        TypeBit(Type::DCT16X32) | TypeBit(Type::DCT32X16) |
+        TypeBit(Type::DCT32X64) | TypeBit(Type::DCT64X32) |
+        TypeBit(Type::DCT64X64) | TypeBit(DCT64X128) | TypeBit(DCT128X64) |
+        TypeBit(DCT128X128) | TypeBit(DCT128X256) | TypeBit(DCT256X128) |
+        TypeBit(DCT256X256);
+    JXL_DASSERT(Strategy() < kNumValidStrategies);
+    return ((1u << static_cast<uint32_t>(Strategy())) & bits) != 0;
+  }
+
+  // Returns the raw strategy value. Should only be used for tokenization.
+  JXL_INLINE uint8_t RawStrategy() const {
+    return static_cast<uint8_t>(strategy_);
+  }
+
+  JXL_INLINE Type Strategy() const { return strategy_; }
+
+  // Returns true if raw_strategy lies in [0, kNumValidStrategies).
+  static JXL_INLINE constexpr bool IsRawStrategyValid(int raw_strategy) {
+    return raw_strategy < static_cast<int32_t>(kNumValidStrategies) &&
+           raw_strategy >= 0;
+  }
+  static JXL_INLINE AcStrategy FromRawStrategy(uint8_t raw_strategy) {
+    return FromRawStrategy(static_cast<Type>(raw_strategy));
+  }
+  static JXL_INLINE AcStrategy FromRawStrategy(Type raw_strategy) {
+    JXL_DASSERT(IsRawStrategyValid(static_cast<uint32_t>(raw_strategy)));
+    return AcStrategy(raw_strategy, /*is_first=*/true);
+  }
+
+  // "Natural order" means the order of increasing "anisotropic" frequency of
+  // the continuous version of the DCT basis.
+  // Round-trip, with `order` and `lut` filled by the two functions below:
+  //  X = order[lut[X]]
+  //  X = lut[order[X]]
+  void ComputeNaturalCoeffOrder(coeff_order_t* order) const;
+  void ComputeNaturalCoeffOrderLut(coeff_order_t* lut) const;
+
+  // Number of 8x8 blocks that this strategy covers along x. Depends only on the
+  // strategy type; is_first_ is not consulted.
+  JXL_INLINE size_t covered_blocks_x() const {
+    static constexpr uint8_t kLut[] = {1, 1, 1, 1,  2, 4,  1,  2,  1,
+                                       4, 2, 4, 1,  1, 1,  1,  1,  1,
+                                       8, 4, 8, 16, 8, 16, 32, 16, 32};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+  // Number of 8x8 blocks that this strategy covers along y.
+  JXL_INLINE size_t covered_blocks_y() const {
+    static constexpr uint8_t kLut[] = {1, 1, 1, 1,  2,  4, 2,  1,  4,
+                                       1, 4, 2, 1,  1,  1, 1,  1,  1,
+                                       8, 8, 4, 16, 16, 8, 32, 32, 16};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+  // log2 of covered_blocks_x() * covered_blocks_y().
+  JXL_INLINE size_t log2_covered_blocks() const {
+    static constexpr uint8_t kLut[] = {0, 0, 0, 0, 2, 4, 1,  1, 2,
+                                       2, 3, 3, 0, 0, 0, 0,  0, 0,
+                                       6, 5, 5, 8, 7, 7, 10, 9, 9};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+ private:
+  friend class AcStrategyRow;
+  JXL_INLINE AcStrategy(Type strategy, bool is_first)
+      : strategy_(strategy), is_first_(is_first) {
+    JXL_DASSERT(IsMultiblock() || is_first == true);
+  }
+
+  Type strategy_;
+  bool is_first_;
+};
+
+// Read-only view of one row of an AcStrategyImage.
+class AcStrategyRow {
+ public:
+  explicit AcStrategyRow(const uint8_t* row) : row_(row) {}
+  AcStrategy operator[](size_t x) const {
+    const uint8_t code = row_[x];  // (strategy type << 1) | is_first
+    return AcStrategy(static_cast<AcStrategy::Type>(code >> 1), code & 1);
+  }
+ private:
+  const uint8_t* JXL_RESTRICT row_;
+};
+// Stores one byte per 8x8 block: (strategy type << 1) | is_first_block.
+class AcStrategyImage {
+ public:
+  AcStrategyImage() = default;
+  AcStrategyImage(size_t xsize, size_t ysize);
+  AcStrategyImage(AcStrategyImage&&) = default;
+  AcStrategyImage& operator=(AcStrategyImage&&) = default;
+
+  void FillDCT8(const Rect& rect) {
+    FillPlane<uint8_t>((static_cast<uint8_t>(AcStrategy::Type::DCT) << 1) | 1,
+                       &layers_, rect);
+  }
+  void FillDCT8() { FillDCT8(Rect(layers_)); }
+
+  void FillInvalid() { FillImage(INVALID, &layers_); }
+  // Sets `type` at top-left block (x, y); asserts that it fits in the image.
+  void Set(size_t x, size_t y, AcStrategy::Type type) {
+#if JXL_ENABLE_ASSERT
+    AcStrategy acs = AcStrategy::FromRawStrategy(type);
+#endif  // JXL_ENABLE_ASSERT
+    JXL_ASSERT(y + acs.covered_blocks_y() <= layers_.ysize());
+    JXL_ASSERT(x + acs.covered_blocks_x() <= layers_.xsize());
+    JXL_CHECK(SetNoBoundsCheck(x, y, type, /*check=*/false));
+  }
+  // Writes all blocks covered by `type`; if `check`, fails on non-INVALID ones.
+  Status SetNoBoundsCheck(size_t x, size_t y, AcStrategy::Type type,
+                          bool check = true) {
+    AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+        size_t pos = (y + iy) * stride_ + x + ix;
+        if (check && row_[pos] != INVALID) {
+          return JXL_FAILURE("Invalid AC strategy: block overlap");
+        }
+        row_[pos] =
+            (static_cast<uint8_t>(type) << 1) | ((iy | ix) == 0 ? 1 : 0);
+      }
+    }
+    return true;
+  }
+  bool IsValid(size_t x, size_t y) const {
+    return row_[y * stride_ + x] != INVALID;
+  }
+  AcStrategyRow ConstRow(size_t y, size_t x_prefix = 0) const {
+    return AcStrategyRow(layers_.ConstRow(y) + x_prefix);
+  }
+
+  AcStrategyRow ConstRow(const Rect& rect, size_t y) const {
+    return ConstRow(rect.y0() + y, rect.x0());
+  }
+
+  size_t PixelsPerRow() const { return layers_.PixelsPerRow(); }
+
+  size_t xsize() const { return layers_.xsize(); }
+  size_t ysize() const { return layers_.ysize(); }
+
+  // Count the number of blocks of a given type.
+  size_t CountBlocks(AcStrategy::Type type) const;
+
+ private:
+  ImageB layers_;
+  uint8_t* JXL_RESTRICT row_ = nullptr;  // null until sized construction
+  size_t stride_ = 0;
+
+  // A value that does not represent a valid combined AC strategy
+  // value. Used as a sentinel.
+  static constexpr uint8_t INVALID = 0xFF;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AC_STRATEGY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc
new file mode 100644
index 0000000000..d366aa3f82
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc
@@ -0,0 +1,237 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ac_strategy.h"
+
+#include <string.h>
+
+#include <cmath>
+#include <hwy/aligned_allocator.h>
+#include <hwy/base.h>  // HWY_ALIGN_MAX
+#include <hwy/tests/test_util-inl.h>
+#include <utility>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_transforms_testonly.h"
+#include "lib/jxl/enc_transforms.h"
+
+namespace jxl {
+namespace {
+
+// Test that DCT -> IDCT is a noop.
+class AcStrategyRoundtrip : public ::hwy::TestWithParamTargetAndT<int> {
+ protected:
+  void Run() {
+    const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
+    const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    // One allocation split into four kMaxCoeffArea-sized regions (see below).
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
+    float* scratch_space = mem.get();
+    float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea;
+    float* idct = coeffs + AcStrategy::kMaxCoeffArea;
+    Rng rng(type * 65537 + 13);
+    // Impulse test: exactly one input sample is non-zero per iteration.
+    for (size_t j = 0; j < 64; j++) {
+      size_t i = (acs.log2_covered_blocks()
+                      ? rng.UniformU(0, 64u << acs.log2_covered_blocks())
+                      : j);
+      float* input = idct + AcStrategy::kMaxCoeffArea;
+      std::fill_n(input, AcStrategy::kMaxCoeffArea, 0);
+      input[i] = 0.2f;
+      TransformFromPixels(type, input, acs.covered_blocks_x() * 8, coeffs,
+                          scratch_space);
+      ASSERT_NEAR(coeffs[0], 0.2 / (64 << acs.log2_covered_blocks()), 1e-6)
+          << " i = " << i;
+      TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
+                        scratch_space);
+      for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) {
+        ASSERT_NEAR(idct[j], j == i ? 0.2f : 0, 2e-6)
+            << "j = " << j << " i = " << i << " acs " << type;
+      }
+    }
+    // Test DC: LowestFrequenciesFromDC -> DCFromLowestFrequencies round trip.
+    std::fill_n(idct, AcStrategy::kMaxCoeffArea, 0);
+    for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+      for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+        float* dc = idct + AcStrategy::kMaxCoeffArea;
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2;
+        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs);
+        DCFromLowestFrequencies(type, coeffs, idct, acs.covered_blocks_x() * 8);
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2;
+        for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) {
+          ASSERT_NEAR(idct[j], dc[j], 1e-6)
+              << "j = " << j << " x = " << x << " y = " << y << " acs " << type;
+        }
+      }
+    }
+  }
+};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(
+    AcStrategyRoundtrip,
+    ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies)));
+
+TEST_P(AcStrategyRoundtrip, Test) { Run(); }
+
+// Test that DC(2x2) -> DCT coefficients -> IDCT -> downsampled IDCT is a noop.
+class AcStrategyRoundtripDownsample
+    : public ::hwy::TestWithParamTargetAndT<int> {
+ protected:
+  void Run() {
+    const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
+    const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    // One allocation split into four kMaxCoeffArea-sized regions (see below).
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
+    float* scratch_space = mem.get();
+    float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea;
+    std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
+    float* idct = coeffs + AcStrategy::kMaxCoeffArea;
+    Rng rng(type * 65537 + 13);
+    // Blocks with x > 4 or y > 4 are skipped when rng.Bernoulli(0.9f) is true.
+    for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+      for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+        if (x > 4 || y > 4) {
+          if (rng.Bernoulli(0.9f)) continue;
+        }
+        float* dc = idct + AcStrategy::kMaxCoeffArea;
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f;
+        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs);
+        TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
+                          scratch_space);
+        std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f;
+        // Downsample: average each 8x8 block of idct and compare it with dc.
+        for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) {
+          for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) {
+            float sum = 0;
+            for (size_t iy = 0; iy < 8; iy++) {
+              for (size_t ix = 0; ix < 8; ix++) {
+                sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() +
+                            dx * 8 + ix];
+              }
+            }
+            sum /= 64.0f;
+            ASSERT_NEAR(sum, dc[dy * 8 * acs.covered_blocks_x() + dx], 1e-6)
+                << "acs " << type;
+          }
+        }
+      }
+    }
+  }
+};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(
+    AcStrategyRoundtripDownsample,
+    ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies)));
+
+TEST_P(AcStrategyRoundtripDownsample, Test) { Run(); }
+
+// Test that IDCT(block with zeros in the non-topleft corner) -> downsampled
+// IDCT is the same as IDCT -> DC(2x2) of the same block.
+class AcStrategyDownsample : public ::hwy::TestWithParamTargetAndT<int> {
+ protected:
+  void Run() {
+    const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
+    const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    size_t cx = acs.covered_blocks_y();
+    size_t cy = acs.covered_blocks_x();
+    CoefficientLayout(&cy, &cx);
+    // NOTE(review): cx is seeded from covered_blocks_y(), cy from _x(); confirm.
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
+    float* scratch_space = mem.get();
+    float* idct = scratch_space + AcStrategy::kMaxCoeffArea;
+    float* idct_acs_downsampled = idct + AcStrategy::kMaxCoeffArea;
+    Rng rng(type * 65537 + 13);
+    // Positions with x > 4 or y > 4 are skipped when rng.Bernoulli(0.9f) is true.
+    for (size_t y = 0; y < cy; y++) {
+      for (size_t x = 0; x < cx; x++) {
+        if (x > 4 || y > 4) {
+          if (rng.Bernoulli(0.9f)) continue;
+        }
+        float* coeffs = idct + AcStrategy::kMaxCoeffArea;
+        std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0);
+        coeffs[y * cx * 8 + x] = 0.2f;
+        TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
+                          scratch_space);
+        std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0);
+        coeffs[y * cx * 8 + x] = 0.2f;
+        DCFromLowestFrequencies(type, coeffs, idct_acs_downsampled,
+                                acs.covered_blocks_x() * 8);
+        // Downsample: average each 8x8 block of idct and compare the results.
+        for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) {
+          for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) {
+            float sum = 0;
+            for (size_t iy = 0; iy < 8; iy++) {
+              for (size_t ix = 0; ix < 8; ix++) {
+                sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() +
+                            dx * 8 + ix];
+              }
+            }
+            sum /= 64;
+            ASSERT_NEAR(
+                sum, idct_acs_downsampled[dy * 8 * acs.covered_blocks_x() + dx],
+                1e-6)
+                << " acs " << type;
+          }
+        }
+      }
+    }
+  }
+};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(
+    AcStrategyDownsample,
+    ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies)));
+
+TEST_P(AcStrategyDownsample, Test) { Run(); }
+
+class AcStrategyTargetTest : public ::hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(AcStrategyTargetTest);
+
+TEST_P(AcStrategyTargetTest, RoundtripAFVDCT) {
+  HWY_ALIGN_MAX float idct[16];
+  for (size_t i = 0; i < 16; i++) {
+    HWY_ALIGN_MAX float pixels[16] = {};
+    pixels[i] = 1;
+    HWY_ALIGN_MAX float coeffs[16] = {};
+    // Forward then inverse transform of a unit impulse at position i.
+    AFVDCT4x4(pixels, coeffs);
+    AFVIDCT4x4(coeffs, idct);
+    for (size_t j = 0; j < 16; j++) {
+      EXPECT_NEAR(idct[j], pixels[j], 1e-6);
+    }
+  }
+}
+
+TEST_P(AcStrategyTargetTest, BenchmarkAFV) {
+  const AcStrategy::Type type = AcStrategy::Type::AFV0;
+  HWY_ALIGN_MAX float pixels[64] = {1};
+  HWY_ALIGN_MAX float coeffs[64] = {};
+  HWY_ALIGN_MAX float scratch_space[64] = {};
+  for (size_t i = 0; i < 1 << 14; i++) {  // 16384 round trips
+    TransformToPixels(type, coeffs, pixels, 8, scratch_space);
+    TransformFromPixels(type, pixels, 8, coeffs, scratch_space);
+  }
+  EXPECT_NEAR(pixels[0], 0.0, 1E-6);
+}
+
+TEST_P(AcStrategyTargetTest, BenchmarkAFVDCT) {
+  HWY_ALIGN_MAX float pixels[64] = {1};
+  HWY_ALIGN_MAX float coeffs[64] = {};
+  for (size_t i = 0; i < 1 << 14; i++) {  // 16384 round trips
+    AFVDCT4x4(pixels, coeffs);
+    AFVIDCT4x4(coeffs, pixels);
+  }
+  EXPECT_NEAR(pixels[0], 1.0, 1E-6);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/alpha.cc b/third_party/jpeg-xl/lib/jxl/alpha.cc
new file mode 100644
index 0000000000..48d7e7ee92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/alpha.cc
@@ -0,0 +1,115 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/alpha.h"
+
+#include <string.h>
+
+#include <algorithm>
+
+namespace jxl {
+// Limits x to the range [0, 1].
+static float Clamp(float x) { return std::max(std::min(1.0f, x), 0.0f); }
+
+void PerformAlphaBlending(const AlphaBlendingInputLayer& bg,
+                          const AlphaBlendingInputLayer& fg,
+                          const AlphaBlendingOutput& out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp) {
+  if (alpha_is_premultiplied) {  // fg colors are used without scaling by fga
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fga = clamp ? Clamp(fg.a[x]) : fg.a[x];
+      out.r[x] = (fg.r[x] + bg.r[x] * (1.f - fga));
+      out.g[x] = (fg.g[x] + bg.g[x] * (1.f - fga));
+      out.b[x] = (fg.b[x] + bg.b[x] * (1.f - fga));
+      out.a[x] = (1.f - (1.f - fga) * (1.f - bg.a[x]));  // written last
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fga = clamp ? Clamp(fg.a[x]) : fg.a[x];
+      const float new_a = 1.f - (1.f - fga) * (1.f - bg.a[x]);
+      const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f);  // no div by 0
+      out.r[x] = (fg.r[x] * fga + bg.r[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.g[x] = (fg.g[x] * fga + bg.g[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.b[x] = (fg.b[x] * fga + bg.b[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.a[x] = new_a;  // written last: bg.a[x] is still read above
+    }
+  }
+}
+void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
+                          const float* fga, float* out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp) {
+  // With `clamp`, the foreground alpha is limited to [0, 1] before blending.
+  if (bg == bga && fg == fga) {
+    // The plane being blended is the alpha plane itself.
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fa = clamp ? Clamp(fga[x]) : fga[x];
+      out[x] = (1.f - (1.f - fa) * (1.f - bga[x]));
+    }
+  } else if (alpha_is_premultiplied) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fa = clamp ? Clamp(fga[x]) : fga[x];
+      out[x] = (fg[x] + bg[x] * (1.f - fa));
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fa = clamp ? Clamp(fga[x]) : fga[x];
+      const float new_a = 1.f - (1.f - fa) * (1.f - bga[x]);
+      const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f);  // no div by 0
+      out[x] = (fg[x] * fa + bg[x] * bga[x] * (1.f - fa)) * rnew_a;
+    }
+  }
+}
+
+void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
+                             float* out, size_t num_pixels, bool clamp) {
+  // When the foreground plane is the alpha plane itself, the background is
+  // copied through unchanged.
+  if (fg == fga) {
+    memcpy(out, bg, num_pixels * sizeof(*out));
+    return;
+  }
+  // out = bg + fg * alpha, where alpha is limited to [0, 1] if `clamp` is set.
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float weight = clamp ? Clamp(fga[i]) : fga[i];
+    out[i] = bg[i] + fg[i] * weight;
+  }
+}
+
+void PerformMulBlending(const float* bg, const float* fg, float* out,
+                        size_t num_pixels, bool clamp) {
+  // Per-sample product of background and foreground:
+  //   out[i] = bg[i] * fg[i]
+  // When `clamp` is set, the foreground factor is first limited to [0, 1].
+  // The choice depends only on `clamp`, so it is the same for every sample.
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float factor = clamp ? Clamp(fg[i]) : fg[i];
+    out[i] = bg[i] * factor;
+  }
+  // Nothing is written when num_pixels is 0.
+}
+
+void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                      float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                      size_t num_pixels) {
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float alpha = std::max(kSmallAlpha, a[i]);  // floored, never zero
+    r[i] *= alpha;
+    g[i] *= alpha;
+    b[i] *= alpha;
+  }
+}
+
+void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                        float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                        size_t num_pixels) {
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float inv_alpha = 1.f / std::max(kSmallAlpha, a[i]);  // finite
+    r[i] *= inv_alpha;
+    g[i] *= inv_alpha;
+    b[i] *= inv_alpha;
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/alpha.h b/third_party/jpeg-xl/lib/jxl/alpha.h
new file mode 100644
index 0000000000..efb76c800f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/alpha.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ALPHA_H_
+#define LIB_JXL_ALPHA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// A very small value to avoid divisions by zero when converting to
+// unpremultiplied alpha. Page 21 of the technical introduction to OpenEXR
+// (https://www.openexr.com/documentation/TechnicalIntroduction.pdf) recommends
+// "a power of two" that is "less than half of the smallest positive 16-bit
+// floating-point value". That smallest value happens to be the denormal number
+// 2^-24, so 2^-26 should be a good choice.
+static constexpr float kSmallAlpha = 1.f / (1u << 26u);
+
+struct AlphaBlendingInputLayer {
+  const float* r;
+  const float* g;
+  const float* b;
+  const float* a;
+};
+
+struct AlphaBlendingOutput {
+  float* r;
+  float* g;
+  float* b;
+  float* a;
+};
+
+// Note: The pointers in `out` are allowed to alias those in `bg` or `fg`.
+// No pointer shall be null.
+void PerformAlphaBlending(const AlphaBlendingInputLayer& bg,
+                          const AlphaBlendingInputLayer& fg,
+                          const AlphaBlendingOutput& out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp);
+// Single-plane variant; with bg == bga and fg == fga it blends the alpha plane.
+void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
+                          const float* fga, float* out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp);
+
+void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
+                             float* out, size_t num_pixels, bool clamp);
+
+void PerformMulBlending(const float* bg, const float* fg, float* out,
+                        size_t num_pixels, bool clamp);
+
+void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                      float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                      size_t num_pixels);
+void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                        float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                        size_t num_pixels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ALPHA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/alpha_test.cc b/third_party/jpeg-xl/lib/jxl/alpha_test.cc
new file mode 100644
index 0000000000..ddafd829ec
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/alpha_test.cc
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/alpha.h"
+
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::_;
+using ::testing::ElementsAre;
+using ::testing::FloatNear;
+
+TEST(AlphaTest, BlendingWithNonPremultiplied) {
+  const float bg_rgb[3] = {100, 110, 120};
+  const float bg_a = 180.f / 255;
+  const float fg_rgb[3] = {25, 21, 23};
+  const float fg_a = 15420.f / 65535;
+  const float fg_a2 = 2.0f;  // outside [0, 1]; used with clamp=true below
+  float out_rgb[3];
+  float out_a;
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/false, /*clamp=*/false);
+  EXPECT_THAT(out_rgb,
+              ElementsAre(FloatNear(77.2f, .05f), FloatNear(83.0f, .05f),
+                          FloatNear(90.6f, .05f)));
+  EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5);
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/false, /*clamp=*/true);
+  EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f),
+                                   FloatNear(fg_rgb[1], .05f),
+                                   FloatNear(fg_rgb[2], .05f)));
+  EXPECT_NEAR(out_a, 1.0f, 1e-5);
+}
+
+TEST(AlphaTest, BlendingWithPremultiplied) {
+  const float bg_rgb[3] = {100, 110, 120};
+  const float bg_a = 180.f / 255;
+  const float fg_rgb[3] = {25, 21, 23};
+  const float fg_a = 15420.f / 65535;
+  const float fg_a2 = 2.0f;  // outside [0, 1]; used with clamp=true below
+  float out_rgb[3];
+  float out_a;
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/true, /*clamp=*/false);
+  EXPECT_THAT(out_rgb,
+              ElementsAre(FloatNear(101.5f, .05f), FloatNear(105.1f, .05f),
+                          FloatNear(114.8f, .05f)));
+  EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5);
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/true, /*clamp=*/true);
+  EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f),
+                                   FloatNear(fg_rgb[1], .05f),
+                                   FloatNear(fg_rgb[2], .05f)));
+  EXPECT_NEAR(out_a, 1.0f, 1e-5);
+}
+
+TEST(AlphaTest, Mul) {
+  const float bg = 100;
+  const float fg = 25;
+  float out;
+  PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/false);
+  EXPECT_THAT(out, FloatNear(fg * bg, .05f));
+  PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/true);  // fg clamps to 1
+  EXPECT_THAT(out, FloatNear(bg, .05f));
+}
+
+TEST(AlphaTest, PremultiplyAndUnpremultiply) {
+  const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f};
+  float r[] = {120, 130, 140, 150};
+  float g[] = {124, 134, 144, 154};
+  float b[] = {127, 137, 147, 157};
+  // alpha[0] == 0 is floored at kSmallAlpha, so the round trip still inverts.
+  PremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(
+      r, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(130 * 63.f / 255, 1e-5f),
+                     FloatNear(140 * 127.f / 255, 1e-5f), 150));
+  EXPECT_THAT(
+      g, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(134 * 63.f / 255, 1e-5f),
+                     FloatNear(144 * 127.f / 255, 1e-5f), 154));
+  EXPECT_THAT(
+      b, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(137 * 63.f / 255, 1e-5f),
+                     FloatNear(147 * 127.f / 255, 1e-5f), 157));
+
+  UnpremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(FloatNear(120, 1e-4f), FloatNear(130, 1e-4f),
+                             FloatNear(140, 1e-4f), FloatNear(150, 1e-4f)));
+  EXPECT_THAT(g, ElementsAre(FloatNear(124, 1e-4f), FloatNear(134, 1e-4f),
+                             FloatNear(144, 1e-4f), FloatNear(154, 1e-4f)));
+  EXPECT_THAT(b, ElementsAre(FloatNear(127, 1e-4f), FloatNear(137, 1e-4f),
+                             FloatNear(147, 1e-4f), FloatNear(157, 1e-4f)));
+}
+
+TEST(AlphaTest, UnpremultiplyAndPremultiply) {
+  const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f};
+  float r[] = {50, 60, 70, 80};
+  float g[] = {54, 64, 74, 84};
+  float b[] = {57, 67, 77, 87};
+  // The alpha == 0 entry is divided by kSmallAlpha; its value is unchecked (_).
+  UnpremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(_, FloatNear(60 * 255.f / 63, 1e-4f),
+                             FloatNear(70 * 255.f / 127, 1e-4f), 80));
+  EXPECT_THAT(g, ElementsAre(_, FloatNear(64 * 255.f / 63, 1e-4f),
+                             FloatNear(74 * 255.f / 127, 1e-4f), 84));
+  EXPECT_THAT(b, ElementsAre(_, FloatNear(67 * 255.f / 63, 1e-4f),
+                             FloatNear(77 * 255.f / 127, 1e-4f), 87));
+
+  PremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(FloatNear(50, 1e-4f), FloatNear(60, 1e-4f),
+                             FloatNear(70, 1e-4f), FloatNear(80, 1e-4f)));
+  EXPECT_THAT(g, ElementsAre(FloatNear(54, 1e-4f), FloatNear(64, 1e-4f),
+                             FloatNear(74, 1e-4f), FloatNear(84, 1e-4f)));
+  EXPECT_THAT(b, ElementsAre(FloatNear(57, 1e-4f), FloatNear(67, 1e-4f),
+                             FloatNear(77, 1e-4f), FloatNear(87, 1e-4f)));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.cc b/third_party/jpeg-xl/lib/jxl/ans_common.cc
new file mode 100644
index 0000000000..d2cf897ec4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_common.cc
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ans_common.h"
+
+#include <numeric>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+// Returns `length` counts that sum to `total_count` and differ by at most 1.
+std::vector<int32_t> CreateFlatHistogram(int length, int total_count) {
+  JXL_ASSERT(length > 0);
+  JXL_ASSERT(length <= total_count);
+  const int count = total_count / length;  // every bin gets at least this
+  std::vector<int32_t> result(length, count);
+  const int rem_counts = total_count % length;
+  for (int i = 0; i < rem_counts; ++i) {
+    ++result[i];  // the first `rem_counts` bins absorb the remainder
+  }
+  return result;
+}
+
+// First, all trailing non-occurring symbols are removed from the distribution;
+// if this leaves the distribution empty, a dummy symbol with max weight is
+// added. This ensures that the resulting distribution sums to total table size.
+// Then, `entry_size` is chosen to be the largest power of two so that
+// `table_size` = ANS_TAB_SIZE/`entry_size` is at least as big as the
+// distribution size.
+// Note that each entry will only ever contain two different symbols, and
+// consecutive ranges of offsets, which allows us to use a compact
+// representation.
+// Each entry is initialized with only the (symbol=i, offset) pairs; then
+// positions for which the entry overflows (i.e. distribution[i] > entry_size)
+// or is not full are computed, and put into a stack in increasing order.
+// Missing symbols in the distribution are padded with 0 (because `table_size`
+// >= number of symbols). The `cutoff` value for each entry is initialized to
+// the number of occupied slots in that entry (i.e. `distribution[i]`). While
+// the overflowing-symbol stack is not empty (which implies that the
+// underflowing-symbol stack also is not), the top overfull and underfull
+// positions are popped from the stack; the empty slots in the underfull entry
+// are then filled with as many slots as needed from the overfull entry; such
+// slots are placed after the slots in the overfull entry, and `offsets[1]` is
+// computed accordingly. The formerly underfull entry is thus now neither
+// underfull nor overfull, and represents exactly two symbols. The overfull
+// entry might be either overfull or underfull, and is pushed into the
+// corresponding stack.
+void InitAliasTable(std::vector<int32_t> distribution, uint32_t range,
+                    size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a) {
+  while (!distribution.empty() && distribution.back() == 0) {
+    distribution.pop_back();  // drop trailing zero-count symbols
+  }
+  // Ensure that a valid table is always returned, even for an empty
+  // alphabet. Otherwise, a specially-crafted stream might crash the
+  // decoder.
+  if (distribution.empty()) {
+    distribution.emplace_back(range);
+  }
+  const size_t table_size = 1 << log_alpha_size;
+#if JXL_ENABLE_ASSERT
+  int sum = std::accumulate(distribution.begin(), distribution.end(), 0);
+#endif  // JXL_ENABLE_ASSERT
+  JXL_ASSERT(static_cast<uint32_t>(sum) == range);
+  // range must be a power of two
+  JXL_ASSERT((range & (range - 1)) == 0);
+  JXL_ASSERT(distribution.size() <= table_size);
+  JXL_ASSERT(table_size <= range);
+  const uint32_t entry_size = range >> log_alpha_size;  // this is exact
+  // Special case for single-symbol distributions: it ensures that the state
+  // does not change when decoding from such a distribution. Since offset0 is
+  // hardcoded to 0, the general case cannot easily (if at all) produce this.
+  // NOTE(review): the check uses ANS_TAB_SIZE rather than `range` - confirm.
+  for (size_t sym = 0; sym < distribution.size(); sym++) {
+    if (distribution[sym] == ANS_TAB_SIZE) {
+      for (size_t i = 0; i < table_size; i++) {
+        a[i].right_value = sym;
+        a[i].cutoff = 0;
+        a[i].offsets1 = entry_size * i;
+        a[i].freq0 = 0;
+        a[i].freq1_xor_freq0 = ANS_TAB_SIZE;
+      }
+      return;
+    }
+  }
+  std::vector<uint32_t> underfull_posn;
+  std::vector<uint32_t> overfull_posn;
+  std::vector<uint32_t> cutoffs(1 << log_alpha_size);
+  // Classify every position by comparing its count against entry_size.
+  for (size_t i = 0; i < distribution.size(); i++) {
+    cutoffs[i] = distribution[i];
+    if (cutoffs[i] > entry_size) {
+      overfull_posn.push_back(i);
+    } else if (cutoffs[i] < entry_size) {
+      underfull_posn.push_back(i);
+    }
+  }
+  for (uint32_t i = distribution.size(); i < table_size; i++) {
+    cutoffs[i] = 0;
+    underfull_posn.push_back(i);
+  }
+  // Move surplus from overfull entries into underfull ones until balanced.
+  while (!overfull_posn.empty()) {
+    uint32_t overfull_i = overfull_posn.back();
+    overfull_posn.pop_back();
+    JXL_ASSERT(!underfull_posn.empty());
+    uint32_t underfull_i = underfull_posn.back();
+    underfull_posn.pop_back();
+    uint32_t underfull_by = entry_size - cutoffs[underfull_i];
+    cutoffs[overfull_i] -= underfull_by;
+    // overfull positions have their original symbols
+    a[underfull_i].right_value = overfull_i;
+    a[underfull_i].offsets1 = cutoffs[overfull_i];
+    // Slots in the right part of entry underfull_i were taken from the end
+    // of the symbols in entry overfull_i.
+    if (cutoffs[overfull_i] < entry_size) {
+      underfull_posn.push_back(overfull_i);
+    } else if (cutoffs[overfull_i] > entry_size) {
+      overfull_posn.push_back(overfull_i);
+    }
+  }
+  for (uint32_t i = 0; i < table_size; i++) {
+    // cutoffs[i] is properly initialized but the clang-analyzer doesn't infer
+    // it since it is partially initialized across two for-loops.
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    if (cutoffs[i] == entry_size) {
+      a[i].right_value = i;
+      a[i].offsets1 = 0;
+      a[i].cutoff = 0;
+    } else {
+      // Note that, if cutoff is not equal to entry_size,
+      // a[i].offsets1 was initialized with (overfull cutoff) -
+      // (entry_size - a[i].cutoff). Thus, subtracting
+      // a[i].cutoff cannot make it negative.
+      a[i].offsets1 -= cutoffs[i];
+      a[i].cutoff = cutoffs[i];
+    }
+    const size_t freq0 = i < distribution.size() ? distribution[i] : 0;
+    const size_t i1 = a[i].right_value;
+    const size_t freq1 = i1 < distribution.size() ? distribution[i1] : 0;
+    a[i].freq0 = static_cast<uint16_t>(freq0);
+    a[i].freq1_xor_freq0 = static_cast<uint16_t>(freq1 ^ freq0);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.h b/third_party/jpeg-xl/lib/jxl/ans_common.h
new file mode 100644
index 0000000000..fb5058e310
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_common.h
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ANS_COMMON_H_
+#define LIB_JXL_ANS_COMMON_H_
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <hwy/cache_control.h>  // Prefetch
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns the precision (number of bits) that should be used to store
+// a histogram count such that Log2Floor(count) == logcount; never negative.
+static JXL_INLINE uint32_t GetPopulationCountPrecision(uint32_t logcount,
+                                                       uint32_t shift) {
+  int32_t r = std::min<int>(
+      logcount, int(shift) - int((ANS_LOG_TAB_SIZE - logcount) >> 1));
+  if (r < 0) return 0;  // clamp negative results to zero bits
+  return r;
+}
+
+// Returns a histogram where the counts are positive, differ by at most 1,
+// and add up to total_count. The bigger counts (if any) are at the beginning
+// of the histogram.
+std::vector<int32_t> CreateFlatHistogram(int length, int total_count);
+
+// An alias table implements a mapping from the [0, ANS_TAB_SIZE) range into
+// the [0, ANS_MAX_ALPHABET_SIZE) range, satisfying the following conditions:
+// - each symbol occurs as many times as specified by any valid distribution
+//   of frequencies of the symbols. A valid distribution here is an array of
+//   ANS_MAX_ALPHABET_SIZE that contains numbers in the range [0, ANS_TAB_SIZE],
+//   and whose sum is ANS_TAB_SIZE.
+// - lookups can be done in constant time, and also return how many smaller
+//   input values map into the same symbol, according to some well-defined order
+//   of input values.
+// - the space used by the alias table is given by a small constant times the
+//   index of the largest symbol with nonzero probability in the distribution.
+// Each of the entries in the table covers a range of `entry_size` values in the
+// [0, ANS_TAB_SIZE) range; consecutive entries represent consecutive
+// sub-ranges. In the range covered by entry `i`, the first `cutoff` values map
+// to symbol `i`, while the others map to symbol `right_value`.
+//
+// TODO(veluca): consider making the order used for computing offsets easier to
+// define - it is currently defined by the algorithm to compute the alias table.
+// Beware of breaking the implicit assumption that symbols that come after the
+// cutoff value should have an offset at least as big as the cutoff.
+
+struct AliasTable {
+  struct Symbol {
+    size_t value;   // symbol selected by Lookup
+    size_t offset;  // position of the input among that symbol's slots
+    size_t freq;    // freq0 or freq1, whichever belongs to `value`
+  };
+
+// Working set size matters here (~64 tables x 256 entries).
+// offsets0 is always zero (beginning of [0] side among the same symbol).
+// offsets1 is an offset of (pos >= cutoff) side decremented by cutoff.
+#pragma pack(push, 1)
+  struct Entry {
+    uint8_t cutoff;       // < kEntrySizeMinus1 when used by ANS.
+    uint8_t right_value;  // < alphabet size.
+    uint16_t freq0;
+
+    // Only used if `greater` (see Lookup)
+    uint16_t offsets1;         // <= ANS_TAB_SIZE
+    uint16_t freq1_xor_freq0;  // for branchless ternary in Lookup
+  };
+#pragma pack(pop)
+
+  // Dividing `value` by `entry_size` determines `i`, the entry which is
+  // responsible for the input. If the remainder is below `cutoff`, then the
+  // mapped symbol is `i`; since `offsets[0]` (always zero, hence not stored)
+  // counts the occurrences of `i` "before" the start of this entry, the
+  // offset of the input is just the remainder. Otherwise (remainder >=
+  // `cutoff`) the mapped symbol is `right_value`; since `offsets1` stores the
+  // number of occurrences of `right_value` "before" this entry, minus the
+  // `cutoff` value, the input offset is then `remainder + offsets1`.
+  static JXL_INLINE Symbol Lookup(const Entry* JXL_RESTRICT table, size_t value,
+                                  size_t log_entry_size,
+                                  size_t entry_size_minus_1) {
+    const size_t i = value >> log_entry_size;
+    const size_t pos = value & entry_size_minus_1;
+
+#if JXL_BYTE_ORDER_LITTLE
+    uint64_t entry;
+    memcpy(&entry, &table[i].cutoff, sizeof(entry));  // whole packed Entry
+    const size_t cutoff = entry & 0xFF;              // = MOVZX
+    const size_t right_value = (entry >> 8) & 0xFF;  // = MOVZX
+    const size_t freq0 = (entry >> 16) & 0xFFFF;
+#else
+    // Generates multiple loads with complex addressing.
+    const size_t cutoff = table[i].cutoff;
+    const size_t right_value = table[i].right_value;
+    const size_t freq0 = table[i].freq0;
+#endif
+
+    const bool greater = pos >= cutoff;
+
+#if JXL_BYTE_ORDER_LITTLE
+    const uint64_t conditional = greater ? entry : 0;  // = CMOV
+    const size_t offsets1_or_0 = (conditional >> 32) & 0xFFFF;
+    const size_t freq1_xor_freq0_or_0 = conditional >> 48;
+#else
+    const size_t offsets1_or_0 = greater ? table[i].offsets1 : 0;
+    const size_t freq1_xor_freq0_or_0 = greater ? table[i].freq1_xor_freq0 : 0;
+#endif
+
+    // WARNING: moving this code may interfere with CMOV heuristics.
+    Symbol s;
+    s.value = greater ? right_value : i;
+    s.offset = offsets1_or_0 + pos;
+    s.freq = freq0 ^ freq1_xor_freq0_or_0;  // = greater ? freq1 : freq0
+    // XOR avoids implementation-defined conversion from unsigned to signed.
+    // Alternatives considered: BEXTR is 2 cycles on HSW, SET+shift causes
+    // spills, simple ternary has a long dependency chain.
+
+    return s;
+  }
+  // Calls hwy::Prefetch on the entry that Lookup would read for `value`.
+  static HWY_INLINE void Prefetch(const Entry* JXL_RESTRICT table, size_t value,
+                                  size_t log_entry_size) {
+    const size_t i = value >> log_entry_size;
+    hwy::Prefetch(table + i);
+  }
+};
+
+// Computes an alias table for a given distribution.
+void InitAliasTable(std::vector<int32_t> distribution, uint32_t range,
+                    size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ANS_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ans_common_test.cc b/third_party/jpeg-xl/lib/jxl/ans_common_test.cc
new file mode 100644
index 0000000000..487b6cf5bd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_common_test.cc
@@ -0,0 +1,43 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ans_common.h"
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+// Checks each symbol i gets distribution[i] lookups with offsets 0..n-1.
+void VerifyAliasDistribution(const std::vector<int>& distribution,
+                             uint32_t range) {
+  constexpr size_t log_alpha_size = 8;  // the Lookup call below hardcodes 8 too
+  AliasTable::Entry table[1 << log_alpha_size];
+  InitAliasTable(distribution, range, log_alpha_size, table);
+  std::vector<std::vector<uint32_t>> offsets(distribution.size());
+  for (uint32_t i = 0; i < range; i++) {
+    AliasTable::Symbol s = AliasTable::Lookup(
+        table, i, ANS_LOG_TAB_SIZE - 8, (1 << (ANS_LOG_TAB_SIZE - 8)) - 1);
+    offsets[s.value].push_back(s.offset);
+  }
+  for (uint32_t i = 0; i < distribution.size(); i++) {
+    ASSERT_EQ(static_cast<size_t>(distribution[i]), offsets[i].size());
+    std::sort(offsets[i].begin(), offsets[i].end());
+    for (uint32_t j = 0; j < offsets[i].size(); j++) {
+      ASSERT_EQ(offsets[i][j], j);  // sorted offsets must be exactly 0, 1, 2...
+    }
+  }
+}
+
+TEST(ANSCommonTest, AliasDistributionSmoke) {
+  VerifyAliasDistribution({ANS_TAB_SIZE / 2, ANS_TAB_SIZE / 2}, ANS_TAB_SIZE);
+  VerifyAliasDistribution({ANS_TAB_SIZE}, ANS_TAB_SIZE);
+  VerifyAliasDistribution({0, 0, 0, ANS_TAB_SIZE, 0}, ANS_TAB_SIZE);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ans_params.h b/third_party/jpeg-xl/lib/jxl/ans_params.h
new file mode 100644
index 0000000000..4bbc284c0b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_params.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ANS_PARAMS_H_
+#define LIB_JXL_ANS_PARAMS_H_
+
+// Common parameters that are needed for both the ANS entropy encoding and
+// decoding methods.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+// TODO(veluca): decide if 12 is the best constant here (valid range is up to
+// 16). This requires recomputing the Huffman tables in {enc,dec}_ans.cc
+// 14 gives a 0.2% improvement at d1 and makes d8 slightly worse. This is
+// likely not worth the increase in encoder complexity.
+#define ANS_LOG_TAB_SIZE 12u
+#define ANS_TAB_SIZE (1 << ANS_LOG_TAB_SIZE)
+#define ANS_TAB_MASK (ANS_TAB_SIZE - 1)
+
+// Largest possible symbol to be encoded by either ANS or prefix coding.
+#define PREFIX_MAX_ALPHABET_SIZE 4096
+#define ANS_MAX_ALPHABET_SIZE 256
+
+// Max number of bits for prefix coding.
+#define PREFIX_MAX_BITS 15
+
+#define ANS_SIGNATURE 0x13  // Initial state, used as CRC.
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ANS_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ans_test.cc b/third_party/jpeg-xl/lib/jxl/ans_test.cc
new file mode 100644
index 0000000000..06bc46477f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_test.cc
@@ -0,0 +1,278 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+// Encode/decode round trip of `input_values`; `alphabet_size` is unused here.
+void RoundtripTestcase(int n_histograms, int alphabet_size,
+                       const std::vector<Token>& input_values) {
+  constexpr uint16_t kMagic1 = 0x9e33;
+  constexpr uint16_t kMagic2 = 0x8b04;
+
+  BitWriter writer;
+  // Space for magic bytes.
+  BitWriter::Allotment allotment_magic1(&writer, 16);
+  writer.Write(16, kMagic1);
+  allotment_magic1.ReclaimAndCharge(&writer, 0, nullptr);
+
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  std::vector<std::vector<Token>> input_values_vec;
+  input_values_vec.push_back(input_values);
+
+  BuildAndEncodeHistograms(HistogramParams(), n_histograms, input_values_vec,
+                           &codes, &context_map, &writer, 0, nullptr);
+  WriteTokens(input_values_vec[0], codes, context_map, &writer, 0, nullptr);
+
+  // Magic bytes + padding
+  BitWriter::Allotment allotment_magic2(&writer, 24);
+  writer.Write(16, kMagic2);
+  writer.ZeroPadToByte();
+  allotment_magic2.ReclaimAndCharge(&writer, 0, nullptr);
+
+  // We do not truncate the output. Reading past the end reads out zeroes
+  // anyway.
+  BitReader br(writer.GetSpan());
+
+  ASSERT_EQ(br.ReadBits(16), kMagic1);
+  // The decoded context map must equal the one produced by the encoder.
+  std::vector<uint8_t> dec_context_map;
+  ANSCode decoded_codes;
+  ASSERT_TRUE(
+      DecodeHistograms(&br, n_histograms, &decoded_codes, &dec_context_map));
+  ASSERT_EQ(dec_context_map, context_map);
+  ANSSymbolReader reader(&decoded_codes, &br);
+  // Every token must decode to the value that was written for it.
+  for (const Token& symbol : input_values) {
+    uint32_t read_symbol =
+        reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+    ASSERT_EQ(read_symbol, symbol.value);
+  }
+  ASSERT_TRUE(reader.CheckANSFinalState());
+
+  ASSERT_EQ(br.ReadBits(16), kMagic2);
+  EXPECT_TRUE(br.Close());
+}
+
+TEST(ANSTest, EmptyRoundtrip) {
+  RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, std::vector<Token>());
+}
+
+TEST(ANSTest, SingleSymbolRoundtrip) {
+  for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) {
+    RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, {{0, i}});  // a single token
+  }
+  for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) {
+    RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE,
+                      std::vector<Token>(1024, {0, i}));  // 1024 equal tokens
+  }
+}
+
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+constexpr size_t kReps = 3;  // reduced repetition count in sanitizer builds
+#else
+constexpr size_t kReps = 10;  // default repetition count for random tests
+#endif
+
+void RoundtripRandomStream(int alphabet_size, size_t reps = kReps,
+                           size_t num = 1 << 18) {
+  constexpr int kNumHistograms = 3;
+  Rng rng(0);  // one generator shared by all repetitions
+  for (size_t i = 0; i < reps; i++) {
+    std::vector<Token> symbols;  // `num` tokens with Rng-drawn context/value
+    for (size_t j = 0; j < num; j++) {
+      int context = rng.UniformI(0, kNumHistograms);
+      int value = rng.UniformU(0, alphabet_size);
+      symbols.emplace_back(context, value);
+    }
+    RoundtripTestcase(kNumHistograms, alphabet_size, symbols);
+  }
+}
+
+void RoundtripRandomUnbalancedStream(int alphabet_size) {
+  constexpr int kNumHistograms = 3;
+  constexpr int kPrecision = 1 << 10;
+  Rng rng(0);
+  for (size_t i = 0; i < kReps; i++) {
+    std::vector<int> distributions[kNumHistograms] = {};
+    for (int j = 0; j < kNumHistograms; j++) {
+      distributions[j].resize(kPrecision);
+      int symbol = 0;
+      int remaining = 1;
+      for (int k = 0; k < kPrecision; k++) {
+        if (remaining == 0) {
+          if (symbol < alphabet_size - 1) symbol++;
+          // There is no meaning behind this distribution: it's anything that
+          // will create a nonuniform distribution and won't have too few
+          // symbols usually. Also we want different distributions we get to be
+          // sufficiently dissimilar.
+          remaining = rng.UniformU(0, kPrecision - k + 1);
+        }
+        distributions[j][k] = symbol;
+        remaining--;
+      }
+    }
+    std::vector<Token> symbols;  // NOTE(review): `distributions` is never read
+    for (int j = 0; j < 1 << 18; j++) {
+      int context = rng.UniformI(0, kNumHistograms);
+      int value = rng.UniformU(0, kPrecision);
+      symbols.emplace_back(context, value);
+    }
+    RoundtripTestcase(kNumHistograms + 1, alphabet_size, symbols);
+  }
+}
+
+TEST(ANSTest, RandomStreamRoundtrip3Small) { RoundtripRandomStream(3, 1, 16); }
+
+TEST(ANSTest, RandomStreamRoundtrip3) { RoundtripRandomStream(3); }
+
+TEST(ANSTest, RandomStreamRoundtripBig) {
+  RoundtripRandomStream(ANS_MAX_ALPHABET_SIZE);
+}
+
+TEST(ANSTest, RandomUnbalancedStreamRoundtrip3) {
+  RoundtripRandomUnbalancedStream(3);
+}
+
+TEST(ANSTest, RandomUnbalancedStreamRoundtripBig) {
+  RoundtripRandomUnbalancedStream(ANS_MAX_ALPHABET_SIZE);
+}
+
+TEST(ANSTest, UintConfigRoundtrip) {
+  for (size_t log_alpha_size = 5; log_alpha_size <= 8; log_alpha_size++) {
+    std::vector<HybridUintConfig> uint_config, uint_config_dec;
+    for (size_t i = 0; i < log_alpha_size; i++) {
+      for (size_t j = 0; j <= i; j++) {
+        for (size_t k = 0; k <= i - j; k++) {  // keeps j + k <= i
+          uint_config.emplace_back(i, j, k);
+        }
+      }
+    }
+    uint_config.emplace_back(log_alpha_size, 0, 0);
+    uint_config_dec.resize(uint_config.size());  // one decoded slot per config
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, 10 * uint_config.size());
+    EncodeUintConfigs(uint_config, &writer, log_alpha_size);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    writer.ZeroPadToByte();
+    BitReader br(writer.GetSpan());
+    EXPECT_TRUE(DecodeUintConfigs(log_alpha_size, &uint_config_dec, &br));
+    EXPECT_TRUE(br.Close());
+    for (size_t i = 0; i < uint_config.size(); i++) {
+      EXPECT_EQ(uint_config[i].split_token, uint_config_dec[i].split_token);
+      EXPECT_EQ(uint_config[i].msb_in_token, uint_config_dec[i].msb_in_token);
+      EXPECT_EQ(uint_config[i].lsb_in_token, uint_config_dec[i].lsb_in_token);
+    }
+  }
+}
+
+void TestCheckpointing(bool ans, bool lz77) {
+  std::vector<std::vector<Token>> input_values(1);
+  for (size_t i = 0; i < 1024; i++) {
+    input_values[0].push_back(Token(0, i % 4));
+  }
+  // up to lz77 window size.
+  for (size_t i = 0; i < (1 << 20) - 1022; i++) {
+    input_values[0].push_back(Token(0, (i % 5) + 4));
+  }
+  // Ensure that when the window wraps around, new values are different.
+  input_values[0].push_back(Token(0, 0));
+  for (size_t i = 0; i < 1024; i++) {
+    input_values[0].push_back(Token(0, i % 4));
+  }
+
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  HistogramParams params;
+  params.lz77_method = lz77 ? HistogramParams::LZ77Method::kLZ77
+                            : HistogramParams::LZ77Method::kNone;
+  params.force_huffman = !ans;
+
+  BitWriter writer;
+  {
+    auto input_values_copy = input_values;  // encode a copy, keep the original
+    BuildAndEncodeHistograms(params, 1, input_values_copy, &codes, &context_map,
+                             &writer, 0, nullptr);
+    WriteTokens(input_values_copy[0], codes, context_map, &writer, 0, nullptr);
+    writer.ZeroPadToByte();
+  }
+
+  // We do not truncate the output. Reading past the end reads out zeroes
+  // anyway.
+  BitReader br(writer.GetSpan());
+  Status status = true;
+  {
+    BitReaderScopedCloser bc(&br, &status);
+
+    std::vector<uint8_t> dec_context_map;
+    ANSCode decoded_codes;
+    ASSERT_TRUE(DecodeHistograms(&br, 1, &decoded_codes, &dec_context_map));
+    ASSERT_EQ(dec_context_map, context_map);
+    ANSSymbolReader reader(&decoded_codes, &br);
+    // Every kInterval tokens: restore the checkpoint, re-read them, continue.
+    ANSSymbolReader::Checkpoint checkpoint;
+    size_t br_pos = 0;
+    constexpr size_t kInterval = ANSSymbolReader::kMaxCheckpointInterval - 2;
+    for (size_t i = 0; i < input_values[0].size(); i++) {
+      if (i % kInterval == 0 && i > 0) {
+        reader.Restore(checkpoint);
+        ASSERT_TRUE(br.Close());
+        br = BitReader(writer.GetSpan());
+        br.SkipBits(br_pos);  // back to the bit position saved with checkpoint
+        for (size_t j = i - kInterval; j < i; j++) {
+          Token symbol = input_values[0][j];
+          uint32_t read_symbol =
+              reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+          ASSERT_EQ(read_symbol, symbol.value) << "j = " << j;
+        }
+      }
+      if (i % kInterval == 0) {
+        reader.Save(&checkpoint);
+        br_pos = br.TotalBitsConsumed();
+      }
+      Token symbol = input_values[0][i];
+      uint32_t read_symbol =
+          reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+      ASSERT_EQ(read_symbol, symbol.value) << "i = " << i;
+    }
+    ASSERT_TRUE(reader.CheckANSFinalState());
+  }
+  EXPECT_TRUE(status);
+}
+
+TEST(ANSTest, TestCheckpointingANS) {
+  TestCheckpointing(/*ans=*/true, /*lz77=*/false);
+}
+
+TEST(ANSTest, TestCheckpointingPrefix) {
+  TestCheckpointing(/*ans=*/false, /*lz77=*/false);
+}
+
+TEST(ANSTest, TestCheckpointingANSLZ77) {
+  TestCheckpointing(/*ans=*/true, /*lz77=*/true);
+}
+
+TEST(ANSTest, TestCheckpointingPrefixLZ77) {
+  TestCheckpointing(/*ans=*/false, /*lz77=*/true);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/arch_macros.h b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h
new file mode 100644
index 0000000000..a98301915e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_ARCH_MACROS_H_
+#define LIB_JXL_BASE_ARCH_MACROS_H_
+
+// Defines the JXL_ARCH_* macros.
+
+namespace jxl {
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define JXL_ARCH_X64 1
+#else
+#define JXL_ARCH_X64 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define JXL_ARCH_PPC 1
+#else
+#define JXL_ARCH_PPC 0
+#endif
+
+#if defined(__aarch64__) || defined(__arm__)
+#define JXL_ARCH_ARM 1
+#else
+#define JXL_ARCH_ARM 0
+#endif
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_ARCH_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/bits.h b/third_party/jpeg-xl/lib/jxl/base/bits.h
new file mode 100644
index 0000000000..9f86118e72
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/bits.h
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BITS_H_
+#define LIB_JXL_BASE_BITS_H_
+
+// Specialized instructions for processing register-sized bit arrays.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace jxl {
+
+// Empty struct used as a size tag type.
+template <size_t N>
+struct SizeTag {};
+
+template <typename T>
+constexpr bool IsSigned() {
+  return T(0) > T(-1);  // for unsigned T, T(-1) wraps to the maximum value
+}
+
+// Count of zero bits above the most-significant 1. Undefined for x == 0.
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+  unsigned long index;
+  _BitScanReverse(&index, x);
+  return 31 - index;
+#else
+  return static_cast<size_t>(__builtin_clz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+  unsigned long index;
+  _BitScanReverse64(&index, x);
+  return 63 - index;
+#else   // JXL_ARCH_X64
+  // _BitScanReverse64 not available: scan the two 32-bit halves instead
+  uint32_t msb = static_cast<uint32_t>(x >> 32u);
+  unsigned long index;
+  if (msb == 0) {
+    uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+    _BitScanReverse(&index, lsb);
+    return 63 - index;
+  } else {
+    _BitScanReverse(&index, msb);
+    return 31 - index;
+  }
+#endif  // JXL_ARCH_X64
+#else
+  return static_cast<size_t>(__builtin_clzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(const T x) {
+  static_assert(!IsSigned<T>(), "Num0BitsAboveMS1Bit_Nonzero: use unsigned");
+  return Num0BitsAboveMS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Count of zero bits below the least-significant 1. Undefined for x == 0.
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+  unsigned long index;
+  _BitScanForward(&index, x);
+  return index;
+#else
+  return static_cast<size_t>(__builtin_ctz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+  unsigned long index;
+  _BitScanForward64(&index, x);
+  return index;
+#else   // JXL_ARCH_X64
+  // _BitScanForward64 not available: scan the two 32-bit halves instead
+  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+  unsigned long index;
+  if (lsb == 0) {
+    uint32_t msb = static_cast<uint32_t>(x >> 32u);
+    _BitScanForward(&index, msb);
+    return 32 + index;
+  } else {
+    _BitScanForward(&index, lsb);
+    return index;
+  }
+#endif  // JXL_ARCH_X64
+#else
+  return static_cast<size_t>(__builtin_ctzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit_Nonzero(T x) {
+  static_assert(!IsSigned<T>(), "Num0BitsBelowLS1Bit_Nonzero: use unsigned");
+  return Num0BitsBelowLS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Like Num0BitsAboveMS1Bit_Nonzero, but returns the bit width of T for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsAboveMS1Bit(const T x) {
+  return (x == 0) ? sizeof(T) * 8 : Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Like Num0BitsBelowLS1Bit_Nonzero, but returns the bit width of T for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit(const T x) {
+  return (x == 0) ? sizeof(T) * 8 : Num0BitsBelowLS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded down. Undefined results for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t FloorLog2Nonzero(const T x) {
+  return (sizeof(T) * 8 - 1) ^ Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded up. Undefined results for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t CeilLog2Nonzero(const T x) {
+  const size_t floor_log2 = FloorLog2Nonzero(x);
+  if ((x & (x - 1)) == 0) return floor_log2;  // power of two
+  return floor_log2 + 1;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_BITS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/byte_order.h b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
new file mode 100644
index 0000000000..8966834e08
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
@@ -0,0 +1,274 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BYTE_ORDER_H_
+#define LIB_JXL_BASE_BYTE_ORDER_H_
+
+#include <jxl/types.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include "lib/jxl/base/compiler_specific.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>  // _byteswap_*
+#endif
+
+#if (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#define JXL_BYTE_ORDER_LITTLE 1
+#else
+// This means that we don't know that the byte order is little endian, in
+// this case we use endian-neutral code that works for both little- and
+// big-endian.
+#define JXL_BYTE_ORDER_LITTLE 0
+#endif
+
+// Returns whether the system is little-endian (least-significant byte first).
+#if JXL_BYTE_ORDER_LITTLE
+static constexpr bool IsLittleEndian() { return true; }
+#else
+static inline bool IsLittleEndian() {
+  const uint32_t multibyte = 1;
+  uint8_t byte;
+  memcpy(&byte, &multibyte, 1);
+  return byte == 1;
+}
+#endif
+
+// True iff data stored in `endianness` order must be byte-swapped on this host.
+static inline bool SwapEndianness(JxlEndianness endianness) {
+  return endianness == (IsLittleEndian() ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN);
+}
+
+#if JXL_COMPILER_MSVC
+#define JXL_BSWAP16(x) _byteswap_ushort(x)
+#define JXL_BSWAP32(x) _byteswap_ulong(x)
+#define JXL_BSWAP64(x) _byteswap_uint64(x)
+#else
+#define JXL_BSWAP16(x) __builtin_bswap16(x)
+#define JXL_BSWAP32(x) __builtin_bswap32(x)
+#define JXL_BSWAP64(x) __builtin_bswap64(x)
+#endif
+
+// Reads two bytes at p as a big-endian 16-bit value (zero-extended).
+static JXL_INLINE uint32_t LoadBE16(const uint8_t* p) {
+  const uint32_t hi = p[0], lo = p[1];
+  return lo | (hi << 8);
+}
+
+// Reads two bytes at p as a little-endian 16-bit value (zero-extended).
+static JXL_INLINE uint32_t LoadLE16(const uint8_t* p) {
+  const uint32_t lo = p[0], hi = p[1];
+  return lo | (hi << 8);
+}
+
+// Reads four bytes at p (no alignment requirement) as a big-endian uint32_t.
+static JXL_INLINE uint32_t LoadBE32(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: copy the raw bytes, then reverse them.
+  uint32_t raw;
+  memcpy(&raw, p, sizeof(raw));
+  return JXL_BSWAP32(raw);
+#else
+  // Host byte order unknown: accumulate bytes, most significant (p[0]) first.
+  uint32_t value = 0;
+  for (int i = 0; i < 4; ++i) value = (value << 8) | p[i];
+  return value;
+#endif
+}
+
+// Reads eight bytes at p (no alignment requirement) as a big-endian uint64_t.
+static JXL_INLINE uint64_t LoadBE64(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: copy the raw bytes, then reverse them with the
+  // compiler's byte-swap intrinsic.
+  uint64_t raw;
+  memcpy(&raw, p, sizeof(raw));
+  return JXL_BSWAP64(raw);
+#else
+  // Host byte order unknown, so build the value arithmetically: each step
+  // shifts the accumulator left by one byte and appends the next byte, which
+  // puts p[0] in the most significant position and p[7] in the least.
+  uint64_t value = 0;
+  for (int i = 0; i < 8; ++i) {
+    value <<= 8;
+    value |= p[i];
+  }
+  return value;
+#endif
+}
+
+// Reads four bytes at p (no alignment requirement) as a little-endian uint32_t.
+static JXL_INLINE uint32_t LoadLE32(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: the in-memory representation is already correct.
+  uint32_t value;
+  memcpy(&value, p, sizeof(value));
+  return value;
+#else
+  // Host byte order unknown: accumulate bytes, most significant (p[3]) first.
+  uint32_t value = 0;
+  for (int i = 3; i >= 0; --i) value = (value << 8) | p[i];
+  return value;
+#endif
+}
+
+// Reads eight bytes at p (no alignment requirement) as a little-endian
+// uint64_t.
+static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: the in-memory representation is already correct.
+  uint64_t value;
+  memcpy(&value, p, sizeof(value));
+  return value;
+#else
+  // Host byte order unknown: walk from p[7] (most significant) down to p[0],
+  // shifting the accumulator left by one byte before appending each byte.
+  uint64_t value = 0;
+  for (int i = 7; i >= 0; --i) {
+    value <<= 8;
+    value |= p[i];
+  }
+  return value;
+#endif
+}
+
+// Reads a big-endian 32-bit float: load the bits, then bit-copy into a float.
+static JXL_INLINE float LoadBEFloat(const uint8_t* p) {
+  const uint32_t bits = LoadBE32(p);
+  float value;
+  memcpy(&value, &bits, sizeof(value));
+  return value;
+}
+
+// Reads a little-endian 32-bit float: load the bits, then bit-copy into a float.
+static JXL_INLINE float LoadLEFloat(const uint8_t* p) {
+  const uint32_t bits = LoadLE32(p);
+  float value;
+  memcpy(&value, &bits, sizeof(value));
+  return value;
+}
+
+static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) {
+  p[0] = static_cast<uint8_t>(native >> 8);  // bits 8..15 go first
+  p[1] = static_cast<uint8_t>(native);       // then bits 0..7
+}
+
+static JXL_INLINE void StoreLE16(const uint32_t native, uint8_t* p) {
+  p[0] = static_cast<uint8_t>(native);       // bits 0..7 go first
+  p[1] = static_cast<uint8_t>(native >> 8);  // then bits 8..15
+}
+
+// Writes `native` to p[0..3] in big-endian order (no alignment requirement).
+static JXL_INLINE void StoreBE32(const uint32_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  const uint32_t swapped = JXL_BSWAP32(native);  // reverse for the LE host
+  memcpy(p, &swapped, sizeof(swapped));
+#else
+  // Host byte order unknown: emit bytes by shifting, most significant first.
+  for (int i = 0; i < 4; ++i) {
+    p[i] = static_cast<uint8_t>(native >> (8 * (3 - i)));
+  }
+#endif
+}
+
+// Writes `native` to p[0..7] in big-endian order (no alignment requirement).
+static JXL_INLINE void StoreBE64(const uint64_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: reverse the bytes, then copy them out as-is.
+  const uint64_t swapped = JXL_BSWAP64(native);
+  memcpy(p, &swapped, sizeof(swapped));
+#else
+  // Host byte order unknown: emit bytes by shifting, most significant first.
+  // p[0] receives bits 56..63 and p[7] receives bits 0..7.
+  int shift = 56;
+  for (int i = 0; i < 8; ++i) {
+    p[i] = static_cast<uint8_t>(native >> shift);
+    shift -= 8;
+  }
+#endif
+}
+
+// Writes `native` to p[0..3] in little-endian order (no alignment requirement).
+static JXL_INLINE void StoreLE32(const uint32_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: the in-memory representation is already correct.
+  memcpy(p, &native, sizeof(native));
+#else
+  // Host byte order unknown: emit bytes by shifting, least significant first.
+  for (int i = 0; i < 4; ++i) {
+    p[i] = static_cast<uint8_t>(native >> (8 * i));
+  }
+#endif
+}
+
+// Writes `native` to p[0..7] in little-endian order (no alignment requirement).
+static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  // Little-endian host: the in-memory representation is already correct.
+  memcpy(p, &native, sizeof(native));
+#else
+  // Host byte order unknown: emit bytes by shifting, least significant first.
+  // p[0] receives bits 0..7 and p[7] receives bits 56..63; `remaining` loses
+  // its lowest byte after each store.
+  uint64_t remaining = native;
+  for (int i = 0; i < 8; ++i) {
+    p[i] = static_cast<uint8_t>(remaining);
+    remaining >>= 8;
+  }
+#endif
+}
+
+// Returns x with the byte order of its 4-byte representation reversed.
+static JXL_INLINE float BSwapFloat(float x) {
+  uint32_t bits;
+  memcpy(&bits, &x, sizeof(bits));
+  bits = JXL_BSWAP32(bits);
+  memcpy(&x, &bits, sizeof(x));
+  return x;
+}
+
+// Big/Little Endian order.
+struct OrderBE {};
+struct OrderLE {};
+
+// Wrappers for calling from generic code.
+static JXL_INLINE void Store16(OrderBE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreBE16(native, p);
+}
+
+static JXL_INLINE void Store16(OrderLE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreLE16(native, p);
+}
+
+static JXL_INLINE void Store32(OrderBE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreBE32(native, p);
+}
+
+static JXL_INLINE void Store32(OrderLE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreLE32(native, p);
+}
+
+static JXL_INLINE uint32_t Load16(OrderBE /*tag*/, const uint8_t* p) {
+  return LoadBE16(p);
+}
+
+static JXL_INLINE uint32_t Load16(OrderLE /*tag*/, const uint8_t* p) {
+  return LoadLE16(p);
+}
+
+static JXL_INLINE uint32_t Load32(OrderBE /*tag*/, const uint8_t* p) {
+  return LoadBE32(p);
+}
+
+static JXL_INLINE uint32_t Load32(OrderLE /*tag*/, const uint8_t* p) {
+  return LoadLE32(p);
+}
+
+#endif  // LIB_JXL_BASE_BYTE_ORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
new file mode 100644
index 0000000000..9a9cc585a1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/cache_aligned.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Disabled: slower than malloc + alignment.
+#define JXL_USE_MMAP 0
+
+#if JXL_USE_MMAP
+#include <sys/mman.h>
+#endif
+
+#include <algorithm>  // std::max
+#include <atomic>
+#include <hwy/base.h>  // kMaxVectorSize
+#include <limits>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace {
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+  void* allocated;        // pointer from malloc/mmap; handed to free/munmap
+  size_t allocated_size;  // total bytes requested from malloc/mmap
+  uint8_t left_padding[hwy::kMaxVectorSize];  // bytes right before the payload
+};
+#pragma pack(pop)
+
+std::atomic<uint64_t> num_allocations{0};
+std::atomic<uint64_t> bytes_in_use{0};
+std::atomic<uint64_t> max_bytes_in_use{0};
+
+}  // namespace
+
+// Avoids linker errors in pre-C++17 builds.
+constexpr size_t CacheAligned::kPointerSize;
+constexpr size_t CacheAligned::kCacheLineSize;
+constexpr size_t CacheAligned::kAlignment;
+constexpr size_t CacheAligned::kAlias;
+
+void CacheAligned::PrintStats() {
+  fprintf(
+      stderr, "Allocations: %" PRIuS " (max bytes in use: %E)\n",
+      static_cast<size_t>(num_allocations.load(std::memory_order_relaxed)),
+      static_cast<double>(max_bytes_in_use.load(std::memory_order_relaxed)));
+}
+
+size_t CacheAligned::NextOffset() {
+  // Cycles through the kAlias / kAlignment distinct multiples of kAlignment.
+  static std::atomic<uint32_t> counter{0};
+  constexpr uint32_t kSlots = kAlias / kAlignment;
+  return kAlignment * (counter.fetch_add(1, std::memory_order_relaxed) % kSlots);
+}
+
+void* CacheAligned::Allocate(const size_t payload_size, size_t offset) {
+  JXL_ASSERT(payload_size <= std::numeric_limits<size_t>::max() / 2);
+  JXL_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset |                  |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty (offset == 0).
+  if (offset == 0) {
+    // SVE/RVV vectors can be large, so we cannot rely on them (including the
+    // padding at the end of AllocationHeader) to fit in kAlignment.
+    offset = hwy::RoundUpTo(sizeof(AllocationHeader), kAlignment);
+  }
+
+#if JXL_USE_MMAP
+  const size_t allocated_size = offset + payload_size;
+  const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE;
+  void* allocated =
+      mmap(nullptr, allocated_size, PROT_READ | PROT_WRITE, flags, -1, 0);
+  if (allocated == MAP_FAILED) return nullptr;
+  const uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated);
+#else
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void* allocated = malloc(allocated_size);
+  if (allocated == nullptr) return nullptr;
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  aligned &= ~(kAlias - 1);
+#endif
+
+#if 0
+  // No effect.
+  uintptr_t page_aligned = reinterpret_cast<uintptr_t>(allocated);
+  page_aligned &= ~(4096 - 1);
+  if (madvise(reinterpret_cast<void*>(page_aligned), allocated_size,
+              MADV_WILLNEED) != 0) {
+    JXL_NOTIFY_ERROR("madvise failed");
+  }
+#elif 0
+  // INCREASES both first and subsequent decode times.
+  if (mlock(allocated, allocated_size) != 0) {
+    JXL_NOTIFY_ERROR("mlock failed");
+  }
+#endif
+
+  // Update statistics (#allocations, bytes in use and its running maximum).
+  num_allocations.fetch_add(1, std::memory_order_relaxed);
+  const uint64_t prev_bytes =
+      bytes_in_use.fetch_add(allocated_size, std::memory_order_acq_rel);
+  uint64_t expected_max = max_bytes_in_use.load(std::memory_order_acquire);
+  for (;;) {  // CAS retry: a failed exchange reloads expected_max
+    const uint64_t desired =
+        std::max(expected_max, prev_bytes + allocated_size);
+    if (max_bytes_in_use.compare_exchange_strong(expected_max, desired,
+                                                 std::memory_order_acq_rel)) {
+      break;
+    }
+  }
+
+  const uintptr_t payload = aligned + offset;  // still aligned
+
+  // Stash `allocated` and allocated_size inside header for use by Free().
+  AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
+  header->allocated = allocated;
+  header->allocated_size = allocated_size;
+
+  return JXL_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), 64);
+}
+
+// Releases memory returned by Allocate(); a null pointer is ignored.
+void CacheAligned::Free(const void* aligned_pointer) {
+  if (aligned_pointer == nullptr) return;
+  // Allocate() placed the header immediately before the payload.
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  JXL_ASSERT(payload % kAlignment == 0);
+  const AllocationHeader* header =
+      reinterpret_cast<const AllocationHeader*>(payload) - 1;
+
+  // fetch_sub works on the counter's 64-bit type; negating the size_t first
+  // would wrap at 32 bits on 32-bit targets and add ~4 GiB instead.
+  bytes_in_use.fetch_sub(header->allocated_size, std::memory_order_acq_rel);
+
+#if JXL_USE_MMAP
+  munmap(header->allocated, header->allocated_size);
+#else
+  free(header->allocated);
+#endif
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
new file mode 100644
index 0000000000..e57df14837
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_CACHE_ALIGNED_H_
+#define LIB_JXL_BASE_CACHE_ALIGNED_H_
+
+// Memory allocator with support for alignment + misalignment.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Aligned allocator (static functions only) tuned to the cache line size.
+class CacheAligned {
+ public:
+  static void PrintStats();  // Writes allocation statistics to stderr.
+
+  static constexpr size_t kPointerSize = sizeof(void*);
+  static constexpr size_t kCacheLineSize = 64;
+  // To avoid RFOs, match L2 fill size (pairs of lines).
+  static constexpr size_t kAlignment = 2 * kCacheLineSize;
+  // Minimum multiple for which cache set conflicts and/or loads blocked by
+  // preceding stores can occur.
+  static constexpr size_t kAlias = 2048;
+
+  // Returns a cyclical multiple of kAlignment below kAlias, for Allocate.
+  static size_t NextOffset();
+
+  // Returns null or memory whose address is congruent to `offset` (mod kAlias);
+  // offset == 0 is replaced internally by the header size rounded up to
+  // kAlignment. This reduces cache conflicts and load/store stalls for large
+  // allocations. At least `payload_size` (may be zero) bytes are accessible.
+  static void* Allocate(size_t payload_size, size_t offset);
+
+  static void* Allocate(const size_t payload_size) {
+    return Allocate(payload_size, NextOffset());
+  }
+
+  static void Free(const void* aligned_pointer);  // Null is a no-op.
+};
+
+// Avoids the need for a function pointer (deleter) in CacheAlignedUniquePtr.
+struct CacheAlignedDeleter {
+  void operator()(uint8_t* aligned_pointer) const {
+    return CacheAligned::Free(aligned_pointer);
+  }
+};
+
+using CacheAlignedUniquePtr = std::unique_ptr<uint8_t[], CacheAlignedDeleter>;
+
+// Does not invoke constructors.
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) {
+  return CacheAlignedUniquePtr(
+      static_cast<uint8_t*>(CacheAligned::Allocate(bytes)),
+      CacheAlignedDeleter());
+}
+
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes,
+                                                  const size_t offset) {
+  return CacheAlignedUniquePtr(
+      static_cast<uint8_t*>(CacheAligned::Allocate(bytes, offset)),
+      CacheAlignedDeleter());
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_CACHE_ALIGNED_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
new file mode 100644
index 0000000000..abe1261f48
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+#define LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+
+// Macros for compiler version + nonstandard keywords, e.g. __builtin_expect.
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "lib/jxl/base/sanitizer_definitions.h"
+
+// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected,
+// otherwise 100 * major + minor version. Note that other packages check for
+// #ifdef COMPILER_MSVC, so we cannot use that same name.
+
+#ifdef _MSC_VER
+#define JXL_COMPILER_MSVC _MSC_VER
+#else
+#define JXL_COMPILER_MSVC 0
+#endif
+
+#ifdef __GNUC__
+#define JXL_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define JXL_COMPILER_GCC 0
+#endif
+
+#ifdef __clang__
+#define JXL_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+// Clang pretends to be GCC for compatibility.
+#undef JXL_COMPILER_GCC
+#define JXL_COMPILER_GCC 0
+#else
+#define JXL_COMPILER_CLANG 0
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_RESTRICT __restrict
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_RESTRICT __restrict__
+#else
+#define JXL_RESTRICT
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_INLINE __forceinline
+#define JXL_NOINLINE __declspec(noinline)
+#else
+#define JXL_INLINE inline __attribute__((always_inline))
+#define JXL_NOINLINE __attribute__((noinline))
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_NORETURN __declspec(noreturn)
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_NORETURN __attribute__((noreturn))
+#else
+#define JXL_NORETURN
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_UNREACHABLE __assume(false)
+#elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405
+#define JXL_UNREACHABLE __builtin_unreachable()
+#else
+#define JXL_UNREACHABLE
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_MAYBE_UNUSED
+#else
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define JXL_MAYBE_UNUSED __attribute__((unused))
+#endif
+
+// MSAN execution won't hurt if some code is not inlined, but this can greatly
+// improve compilation time. Unfortunately this macro can not be used just
+// everywhere - inside header files it leads to "multiple definition" error;
+// though it would be better not to have JXL_INLINE in header overall.
+#if JXL_MEMORY_SANITIZER || JXL_ADDRESS_SANITIZER || JXL_THREAD_SANITIZER
+#define JXL_MAYBE_INLINE JXL_MAYBE_UNUSED
+#else
+#define JXL_MAYBE_INLINE JXL_INLINE
+#endif
+
+#if JXL_COMPILER_MSVC
+// Unsupported, __assume is not the same.
+#define JXL_LIKELY(expr) expr
+#define JXL_UNLIKELY(expr) expr
+#else
+#define JXL_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if JXL_COMPILER_CLANG
+// Early versions of Clang did not support __builtin_assume_aligned.
+#define JXL_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif JXL_COMPILER_GCC
+#define JXL_HAS_ASSUME_ALIGNED 1
+#else
+#define JXL_HAS_ASSUME_ALIGNED 0
+#endif
+
+#if JXL_HAS_ASSUME_ALIGNED
+#define JXL_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define JXL_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
+#endif
+
+#ifdef __has_attribute
+#define JXL_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define JXL_HAVE_ATTRIBUTE(x) 0
+#endif
+
+// Raises warnings if the function return value is unused. Should appear as the
+// first part of a function definition/declaration.
+#if JXL_HAVE_ATTRIBUTE(nodiscard)
+#define JXL_MUST_USE_RESULT [[nodiscard]]
+#elif JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(warn_unused_result)
+#define JXL_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define JXL_MUST_USE_RESULT
+#endif
+
+// Disable certain -fsanitize flags for functions that are expected to include
+// things like unsigned integer overflow. For example use in the function
+// declaration JXL_NO_SANITIZE("unsigned-integer-overflow") to silence unsigned
+// integer overflow ubsan messages.
+#if JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(no_sanitize)
+#define JXL_NO_SANITIZE(X) __attribute__((no_sanitize(X)))
+#else
+#define JXL_NO_SANITIZE(X)
+#endif
+
+#if JXL_HAVE_ATTRIBUTE(__format__)
+#define JXL_FORMAT(idx_fmt, idx_arg) \
+  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define JXL_FORMAT(idx_fmt, idx_arg)
+#endif
+
+#if JXL_COMPILER_MSVC
+using ssize_t = intptr_t;
+#endif
+
+#endif  // LIB_JXL_BASE_COMPILER_SPECIFIC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
new file mode 100644
index 0000000000..20a911255c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+namespace jxl {
+
+// static
+JxlParallelRetCode ThreadPool::SequentialRunnerStatic(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  // Announce a single worker; a non-zero init result is returned unchanged.
+  const JxlParallelRetCode status = init(jpegxl_opaque, 1);
+  if (status != 0) return status;
+
+  // Run every task in order on the calling thread, always as thread 0.
+  for (uint32_t task = start_range; task < end_range; ++task) func(jpegxl_opaque, task, 0);
+  return 0;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.h b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
new file mode 100644
index 0000000000..ba7e7adfad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
@@ -0,0 +1,120 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_DATA_PARALLEL_H_
+#define LIB_JXL_BASE_DATA_PARALLEL_H_
+
+// Portable, low-overhead C++11 ThreadPool alternative to OpenMP for
+// data-parallel computations.
+
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#if JXL_COMPILER_MSVC
+// suppress warnings about the const & applied to function types
+#pragma warning(disable : 4180)
+#endif
+
+namespace jxl {
+
+class ThreadPool {  // Runs data-parallel loops through a JxlParallelRunner.
+ public:
+  ThreadPool(JxlParallelRunner runner, void* runner_opaque)
+      : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic),
+        runner_opaque_(runner ? runner_opaque : static_cast<void*>(this)) {}
+
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  JxlParallelRunner runner() const { return runner_; }
+  void* runner_opaque() const { return runner_opaque_; }
+
+  // Runs init_func(num_threads) followed by data_func(task, thread) on worker
+  // thread(s) for every task in [begin, end). init_func() must return a Status
+  // indicating whether the initialization succeeded.
+  // "thread" is an integer smaller than num_threads.
+  // Not thread-safe - no two calls to Run may overlap.
+  // Subsequent calls will reuse the same threads.
+  // An empty range (begin == end) succeeds without invoking either function.
+  // Precondition: begin <= end.
+  template <class InitFunc, class DataFunc>
+  Status Run(uint32_t begin, uint32_t end, const InitFunc& init_func,
+             const DataFunc& data_func, const char* caller = "") {
+    JXL_ASSERT(begin <= end);
+    if (begin == end) return true;
+    RunCallState<InitFunc, DataFunc> call_state(init_func, data_func);
+    // The runner_ uses the C convention and returns 0 on success (non-zero on
+    // error), so we convert it to a Status.
+    return (*runner_)(runner_opaque_, static_cast<void*>(&call_state),
+                      &call_state.CallInitFunc, &call_state.CallDataFunc, begin,
+                      end) == 0;
+  }
+
+  // Use this as init_func when no initialization is needed.
+  static Status NoInit(size_t num_threads) { return true; }
+
+ private:
+  // Holds the (borrowed) functors of one Run() call; its address is passed to
+  // runner_ as the jpegxl_opaque pointer and cast back in the Call* thunks.
+  template <class InitFunc, class DataFunc>
+  class RunCallState final {
+   public:
+    RunCallState(const InitFunc& init_func, const DataFunc& data_func)
+        : init_func_(init_func), data_func_(data_func) {}
+
+    // JxlParallelRunInit interface.
+    static int CallInitFunc(void* jpegxl_opaque, size_t num_threads) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      // Returns -1 when the internal init function returns false Status to
+      // indicate an error.
+      return self->init_func_(num_threads) ? 0 : -1;
+    }
+
+    // JxlParallelRunFunction interface.
+    static void CallDataFunc(void* jpegxl_opaque, uint32_t value,
+                             size_t thread_id) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      return self->data_func_(value, thread_id);
+    }
+
+   private:
+    const InitFunc& init_func_;
+    const DataFunc& data_func_;
+  };
+
+  // Default JxlParallelRunner used when no runner is provided by the
+  // caller. This runner doesn't use any threading and thread_id is always 0.
+  static JxlParallelRetCode SequentialRunnerStatic(
+      void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+      JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+  // The caller supplied runner function and its opaque void*.
+  const JxlParallelRunner runner_;
+  void* const runner_opaque_;
+};
+
+// Runs [begin, end) on `pool`, or on a temporary sequential pool if it is null.
+template <class InitFunc, class DataFunc>
+Status RunOnPool(ThreadPool* pool, const uint32_t begin, const uint32_t end,
+                 const InitFunc& init_func, const DataFunc& data_func,
+                 const char* caller) {
+  if (pool != nullptr) {
+    return pool->Run(begin, end, init_func, data_func, caller);
+  }
+  ThreadPool sequential_pool(nullptr, nullptr);
+  return sequential_pool.Run(begin, end, init_func, data_func, caller);
+}
+
+}  // namespace jxl
+#if JXL_COMPILER_MSVC
+#pragma warning(default : 4180)
+#endif
+
+#endif  // LIB_JXL_BASE_DATA_PARALLEL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/file_io.h b/third_party/jpeg-xl/lib/jxl/base/file_io.h
new file mode 100644
index 0000000000..64d5860915
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/file_io.h
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FILE_IO_H_
+#define LIB_JXL_BASE_FILE_IO_H_
+
+// Helper functions for reading/writing files.
+
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include <list>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns everything from the last '.' of `filename` onward (dot included), or
+// an empty string when there is no dot. Hidden files such as ".bashrc" are
+// therefore reported as all-extension, and a dot in a directory component is
+// picked up if the final component has none.
+static inline std::string Extension(const std::string& filename) {
+  const std::string::size_type dot = filename.find_last_of('.');
+  return dot == std::string::npos ? std::string() : filename.substr(dot);
+}
+
+// RAII FILE* owner; "-" maps to stdin/stdout, which are never closed here.
+class FileWrapper {
+ public:
+  FileWrapper(const FileWrapper& other) = delete;
+  FileWrapper& operator=(const FileWrapper& other) = delete;
+
+  explicit FileWrapper(const std::string& pathname, const char* mode)
+      : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout)
+                              : fopen(pathname.c_str(), mode)),
+        close_on_delete_(pathname != "-") {
+#ifdef _WIN32
+    struct __stat64 s = {};
+    const int err = _stat64(pathname.c_str(), &s);
+    const bool is_file = (s.st_mode & S_IFREG) != 0;
+#else
+    struct stat s = {};
+    const int err = stat(pathname.c_str(), &s);
+    const bool is_file = S_ISREG(s.st_mode);
+#endif
+    // "-" denotes a standard stream: ignore any on-disk file that happens to
+    // be named "-", so that size() stays -1 (unknown) for stdin/stdout.
+    if (close_on_delete_ && err == 0 && is_file) size_ = s.st_size;
+  }
+
+  ~FileWrapper() {
+    if (file_ != nullptr && close_on_delete_) {
+      const int err = fclose(file_);
+      JXL_CHECK(err == 0);
+    }
+  }
+
+  // We intend to use FileWrapper as a replacement of FILE.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator FILE*() const { return file_; }
+
+  int64_t size() const { return size_; }  // bytes; -1 if not a regular file
+
+ private:
+  FILE* const file_;
+  bool close_on_delete_ = true;
+  int64_t size_ = -1;
+};
+
+template <typename ContainerType>
+static inline Status ReadFile(const std::string& pathname,
+                              ContainerType* JXL_RESTRICT bytes) {
+  FileWrapper f(pathname, "rb");  // "-" makes FileWrapper use stdin
+  if (f == nullptr)
+    return JXL_FAILURE("Failed to open file for reading: %s", pathname.c_str());
+
+  // Get size of file in bytes; FileWrapper reports -1 when it is unknown.
+  const int64_t size = f.size();
+  if (size < 0) {
+    // Size is unknown, loop reading 16 KiB chunks until EOF.
+    bytes->clear();
+    std::list<std::vector<uint8_t>> chunks;
+
+    size_t total_size = 0;
+    while (true) {
+      std::vector<uint8_t> chunk(16 * 1024);
+      const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f);
+      if (ferror(f) || bytes_read > chunk.size()) {
+        return JXL_FAILURE("Error reading %s", pathname.c_str());
+      }
+
+      chunk.resize(bytes_read);  // keep only the bytes actually read
+      total_size += bytes_read;
+      if (bytes_read != 0) {
+        chunks.emplace_back(std::move(chunk));
+      }
+      if (feof(f)) {
+        break;
+      }
+    }
+    bytes->resize(total_size);  // then copy the chunks back to back, in order
+    size_t pos = 0;
+    for (const auto& chunk : chunks) {
+      // Needed in case ContainerType is std::string, whose data() is const.
+      char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+      memcpy(bytes_writable + pos, chunk.data(), chunk.size());
+      pos += chunk.size();
+    }
+  } else {
+    // Size is known, read the file directly into the output container.
+    bytes->resize(static_cast<size_t>(size));
+    size_t pos = 0;
+    while (pos < bytes->size()) {
+      // Needed in case ContainerType is std::string, whose data() is const.
+      char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+      const size_t bytes_read =
+          fread(bytes_writable + pos, 1, bytes->size() - pos, f);
+      if (bytes_read == 0) return JXL_FAILURE("Failed to read");
+      pos += bytes_read;
+    }
+    JXL_ASSERT(pos == bytes->size());
+  }
+  return true;
+}
+
+// Writes all of `bytes` to `pathname` ("-" means stdout), retrying short writes.
+template <typename ContainerType>
+static inline Status WriteFile(const ContainerType& bytes,
+                               const std::string& pathname) {
+  FileWrapper f(pathname, "wb");
+  if (f == nullptr) {
+    return JXL_FAILURE("Failed to open file for writing: %s", pathname.c_str());
+  }
+  const size_t total = bytes.size();
+  size_t done = 0;
+  while (done < total) {
+    const size_t n = fwrite(bytes.data() + done, 1, total - done, f);
+    if (n == 0) return JXL_FAILURE("Failed to write");
+    done += n;
+  }
+  JXL_ASSERT(done == total);
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_FILE_IO_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/float.h b/third_party/jpeg-xl/lib/jxl/base/float.h
new file mode 100644
index 0000000000..90bdeedf54
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/float.h
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FLOAT_H_
+#define LIB_JXL_BASE_FLOAT_H_
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+namespace {
+// Based on highway scalar implementation, for testing
+float LoadFloat16(uint16_t bits16) {  // 1 sign, 5 exponent, 10 mantissa bits
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+  // Returns the float value encoded by the 16 input bits.
+  // Subnormal or zero
+  if (biased_exp == 0) {
+    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    return sign ? -subnormal : subnormal;
+  }
+  // NOTE(review): biased_exp == 31 (Inf/NaN in binary16) is not special-cased.
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);  // rebias 15 -> 127
+  const uint32_t mantissa32 = mantissa << (23 - 10);  // widen 10 -> 23 bits
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+  // memcpy reinterprets the assembled 32 bits as a float.
+  float result;
+  memcpy(&result, &bits32, 4);
+  return result;
+}
+}  // namespace
+
+template <typename SaveFloatAtFn>  // Converts count samples via callback(i, v).
+static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count,
+                                      size_t stride, JxlDataType type,
+                                      bool little_endian, float scale,
+                                      SaveFloatAtFn callback) {
+  switch (type) {  // sample i is read at byte offset stride * i from src
+    case JXL_TYPE_FLOAT:  // passed through; `scale` is not applied
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadLEFloat(src + stride * i));
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadBEFloat(src + stride * i));
+        }
+      }
+      return true;
+
+    case JXL_TYPE_UINT8:  // single byte: little_endian is not consulted
+      for (size_t i = 0; i < count; ++i) {
+        callback(i, src[stride * i] * scale);
+      }
+      return true;
+
+    case JXL_TYPE_UINT16:  // integer sample multiplied by `scale`
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadLE16(src + stride * i) * scale);
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadBE16(src + stride * i) * scale);
+        }
+      }
+      return true;
+
+    case JXL_TYPE_FLOAT16:  // decoded by LoadFloat16; `scale` is not applied
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadFloat16(LoadLE16(src + stride * i)));
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadFloat16(LoadBE16(src + stride * i)));
+        }
+      }
+      return true;
+
+    default:  // every other JxlDataType value is rejected
+      return JXL_FAILURE("Unsupported sample format");
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_FLOAT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/iaca.h b/third_party/jpeg-xl/lib/jxl/base/iaca.h
new file mode 100644
index 0000000000..e5732dae5c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/iaca.h
@@ -0,0 +1,65 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_IACA_H_
+#define LIB_JXL_BASE_IACA_H_
+
+#include "lib/jxl/base/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer) analyzes instruction latencies, but only for
+// code between special markers. These functions embed such markers in an
+// executable, but only for reading via IACA - they deliberately trigger a
+// crash if executed to ensure they are removed in normal builds.
+
+#ifndef JXL_IACA_ENABLED
+#define JXL_IACA_ENABLED 0
+#endif
+
+namespace jxl {
+
+// Call before the region of interest; empty unless JXL_IACA_ENABLED (GCC/Clang).
+static JXL_INLINE void BeginIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+  asm volatile(
+      // UD2 "instruction" raises an invalid opcode exception.
+      ".byte 0x0F, 0x0B\n\t"
+      // Magic sequence recognized by IACA (MOV + addr32 fs:NOP). This actually
+      // clobbers EBX, but we don't care because the code won't be run, and we
+      // want IACA to observe the same code the compiler would have generated
+      // without this marker.
+      "movl $111, %%ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      :
+      :
+      // (Allegedly) clobbering memory may prevent reordering.
+      : "memory");
+#endif
+}
+
+// Call after the region of interest; empty unless JXL_IACA_ENABLED (GCC/Clang).
+static JXL_INLINE void EndIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+  asm volatile(
+      // End marker: same MOV + addr32 fs:NOP pattern as BeginIACA, value 222.
+      "movl $222, %%ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      // UD2
+      ".byte 0x0F, 0x0B\n\t"
+      :
+      :
+      // (Allegedly) clobbering memory may prevent reordering.
+      : "memory");
+#endif
+}
+
+// Add to a scope to mark a region: BeginIACA on entry, EndIACA on exit.
+struct ScopeIACA {
+  JXL_INLINE ScopeIACA() { BeginIACA(); }
+  JXL_INLINE ~ScopeIACA() { EndIACA(); }
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_IACA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/os_macros.h b/third_party/jpeg-xl/lib/jxl/base/os_macros.h
new file mode 100644
index 0000000000..84d0b82bf5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/os_macros.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OS_MACROS_H_
+#define LIB_JXL_BASE_OS_MACROS_H_
+
+// Defines the JXL_OS_* macros.
+
+#if defined(_WIN32) || defined(_WIN64)
+#define JXL_OS_WIN 1
+#else
+#define JXL_OS_WIN 0
+#endif
+
+#ifdef __linux__
+#define JXL_OS_LINUX 1
+#else
+#define JXL_OS_LINUX 0
+#endif
+
+#ifdef __APPLE__
+#define JXL_OS_MAC 1
+#else
+#define JXL_OS_MAC 0
+#endif
+
+#define JXL_OS_IOS 0
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#undef JXL_OS_IOS
+#define JXL_OS_IOS 1
+#endif
+#endif
+
+#ifdef __FreeBSD__
+#define JXL_OS_FREEBSD 1
+#else
+#define JXL_OS_FREEBSD 0
+#endif
+
+#ifdef __HAIKU__
+#define JXL_OS_HAIKU 1
+#else
+#define JXL_OS_HAIKU 0
+#endif
+
+#endif  // LIB_JXL_BASE_OS_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/override.h b/third_party/jpeg-xl/lib/jxl/base/override.h
new file mode 100644
index 0000000000..1f8b657974
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/override.h
@@ -0,0 +1,29 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OVERRIDE_H_
+#define LIB_JXL_BASE_OVERRIDE_H_
+
+// 'Trool' for command line arguments: force enable/disable, or use default.
+
+namespace jxl {
+
+// No effect if kDefault, otherwise forces a feature (typically a FrameHeader
+// flag) on or off. See ApplyOverride for how the three values are resolved.
+enum class Override : int { kOn = 1, kOff = 0, kDefault = -1 };
+
+static inline Override OverrideFromBool(bool flag) {  // never yields kDefault
+  return flag ? Override::kOn : Override::kOff;
+}
+
+static inline bool ApplyOverride(Override o, bool default_condition) {
+  if (o == Override::kOn) return true;    // forced on
+  if (o == Override::kOff) return false;  // forced off
+  return default_condition;  // kDefault: keep the caller's own decision
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_OVERRIDE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
new file mode 100644
index 0000000000..11e4bff6fe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/padded_bytes.h"
+
+namespace jxl {
+
+void PaddedBytes::IncreaseCapacityTo(size_t capacity) {
+  JXL_ASSERT(capacity > capacity_);
+  // Grow by at least 1.5x (and to >= 64 bytes) so appends stay amortized O(1).
+  size_t new_capacity = std::max(capacity, 3 * capacity_ / 2);
+  new_capacity = std::max<size_t>(64, new_capacity);
+
+  // BitWriter writes up to 7 bytes past the end.
+  CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8);
+  if (new_data == nullptr) {
+    // Allocation failed: discard all data, old buffer included, so that
+    // data() == nullptr and the failure is noticed by callers.
+    data_.reset();
+    size_ = capacity_ = 0;
+    return;
+  }
+  if (data_ == nullptr) {
+    // First allocation: ensure first byte is initialized (won't be copied).
+    new_data[0] = 0;
+  } else {
+    // Subsequent resize: copy existing data to new location.
+    memcpy(new_data.get(), data_.get(), size_);
+    // Ensure that the first new byte is initialized, to allow write_bits to
+    // safely append to the newly-resized PaddedBytes.
+    new_data[size_] = 0;
+  }
+  capacity_ = new_capacity;
+  std::swap(new_data, data_);
+}
+
+void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) {
+  JXL_DASSERT(new_begin <= new_end);
+  const size_t new_size = static_cast<size_t>(new_end - new_begin);
+  // Replaces the contents with [new_begin, new_end), which may alias data().
+  // memcpy requires non-overlapping ranges, and resizing might invalidate the
+  // new range. Neither happens if the new range is completely to the left or
+  // right of the _allocated_ range (irrespective of size_).
+  const uint8_t* allocated_end = begin() + capacity_;
+  const bool outside = new_end <= begin() || new_begin >= allocated_end;
+  if (outside) {
+    resize(new_size);  // grow or shrink
+    memcpy(data(), new_begin, new_size);
+    return;
+  }
+
+  // There is overlap. The new size cannot be larger because we own the memory
+  // and the new range cannot include anything outside the allocated range.
+  JXL_ASSERT(new_size <= capacity_);
+
+  // memmove allows overlap and capacity_ is sufficient.
+  memmove(data(), new_begin, new_size);
+  size_ = new_size;  // shrink
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
new file mode 100644
index 0000000000..4534ddf863
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
@@ -0,0 +1,197 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PADDED_BYTES_H_
+#define LIB_JXL_BASE_PADDED_BYTES_H_
+
+// std::vector replacement with padding to reduce bounds checks in WriteBits
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>  // max
+#include <initializer_list>
+#include <utility>  // swap
+
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Provides a subset of the std::vector interface with some differences:
+// - allows BitWriter to write 64 bits at a time without bounds checking;
+// - ONLY zero-initializes the first byte (required by BitWriter);
+// - ensures cache-line alignment.
+class PaddedBytes {
+ public:
+  // Required for output params.
+  PaddedBytes() : size_(0), capacity_(0) {}
+
+  explicit PaddedBytes(size_t size) : size_(size), capacity_(0) {
+    if (size != 0) IncreaseCapacityTo(size);
+  }
+
+  PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) {
+    if (size != 0) {
+      IncreaseCapacityTo(size);
+    }
+    if (size_ != 0) {  // size_ is reset to 0 when the allocation failed
+      memset(data(), value, size);
+    }
+  }
+
+  PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) {
+    if (size_ != 0) IncreaseCapacityTo(size_);
+    if (data() != nullptr) memcpy(data(), other.data(), size_);
+  }
+  PaddedBytes& operator=(const PaddedBytes& other) {
+    // Self-assignment is safe.
+    resize(other.size());
+    if (data() != nullptr) memmove(data(), other.data(), size_);
+    return *this;
+  }
+
+  // default is not OK - need to set other.size_ to 0!
+  PaddedBytes(PaddedBytes&& other) noexcept
+      : size_(other.size_),
+        capacity_(other.capacity_),
+        data_(std::move(other.data_)) {
+    other.size_ = other.capacity_ = 0;
+  }
+  PaddedBytes& operator=(PaddedBytes&& other) noexcept {
+    size_ = other.size_;
+    capacity_ = other.capacity_;
+    data_ = std::move(other.data_);
+
+    if (&other != this) {  // on self-move, keep our own size and capacity
+      other.size_ = other.capacity_ = 0;
+    }
+    return *this;
+  }
+
+  void swap(PaddedBytes& other) {
+    std::swap(size_, other.size_);
+    std::swap(capacity_, other.capacity_);
+    std::swap(data_, other.data_);
+  }
+
+  void reserve(size_t capacity) {
+    if (capacity > capacity_) IncreaseCapacityTo(capacity);
+  }
+
+  // NOTE: unlike vector, this does not initialize the new data!
+  // However, we guarantee that write_bits can safely append after
+  // the resize, as we zero-initialize the first new byte of data.
+  // If size < capacity(), does not invalidate the memory.
+  void resize(size_t size) {
+    if (size > capacity_) IncreaseCapacityTo(size);
+    size_ = (data() == nullptr) ? 0 : size;
+  }
+
+  // resize(size) plus explicit initialization of the new data with `value`.
+  void resize(size_t size, uint8_t value) {
+    size_t old_size = size_;
+    resize(size);
+    if (size_ > old_size) {
+      memset(data() + old_size, value, size_ - old_size);
+    }
+  }
+
+  // Amortized constant complexity due to exponential growth.
+  void push_back(uint8_t x) {
+    if (size_ == capacity_) {
+      IncreaseCapacityTo(capacity_ + 1);
+      if (data() == nullptr) return;
+    }
+
+    data_[size_++] = x;
+  }
+
+  size_t size() const { return size_; }
+  size_t capacity() const { return capacity_; }
+
+  uint8_t* data() { return data_.get(); }
+  const uint8_t* data() const { return data_.get(); }
+
+  // std::vector operations implemented in terms of the public interface above.
+
+  void clear() { resize(0); }
+  bool empty() const { return size() == 0; }
+
+  void assign(std::initializer_list<uint8_t> il) {
+    resize(il.size());  // leaves size() == 0 if il is empty or allocation failed
+    if (size() != 0) memcpy(data(), il.begin(), size());  // data() may be null
+  }
+
+  // Replaces data() with [new_begin, new_end); potentially reallocates.
+  void assign(const uint8_t* new_begin, const uint8_t* new_end);
+
+  uint8_t* begin() { return data(); }
+  const uint8_t* begin() const { return data(); }
+  uint8_t* end() { return begin() + size(); }
+  const uint8_t* end() const { return begin() + size(); }
+
+  uint8_t& operator[](const size_t i) {
+    BoundsCheck(i);
+    return data()[i];
+  }
+  const uint8_t& operator[](const size_t i) const {
+    BoundsCheck(i);
+    return data()[i];
+  }
+
+  uint8_t& back() {
+    JXL_ASSERT(size() != 0);
+    return data()[size() - 1];
+  }
+  const uint8_t& back() const {
+    JXL_ASSERT(size() != 0);
+    return data()[size() - 1];
+  }
+
+  template <typename T>
+  void append(const T& other) {
+    append(reinterpret_cast<const uint8_t*>(other.data()),
+           reinterpret_cast<const uint8_t*>(other.data()) + other.size());
+  }
+
+  void append(const uint8_t* begin, const uint8_t* end) {
+    if (end - begin <= 0) return;  // nothing to append
+    const size_t old_size = size();
+    resize(size() + (end - begin));
+    // resize() leaves size() == 0 if allocation failed: nowhere to copy to.
+    if (size() != 0) memcpy(data() + old_size, begin, end - begin);
+  }
+
+ private:
+  void BoundsCheck(size_t i) const {
+    // <= is safe due to padding and required by BitWriter.
+    JXL_ASSERT(i <= size());
+  }
+
+  // Copies existing data to newly allocated "data_". If allocation fails,
+  // data() == nullptr and size_ = capacity_ = 0.
+  // The new capacity will be at least 1.5 times the old capacity. This ensures
+  // that we avoid quadratic behaviour.
+  void IncreaseCapacityTo(size_t capacity);
+
+  size_t size_;
+  size_t capacity_;
+  CacheAlignedUniquePtr data_;
+};
+
+template <typename T>  // Bounds-checked copy of s into *out at *byte_pos.
+static inline void Append(const T& s, PaddedBytes* out,
+                          size_t* JXL_RESTRICT byte_pos) {
+  JXL_CHECK(*byte_pos <= out->size() && s.size() <= out->size() - *byte_pos);
+  memcpy(out->data() + *byte_pos, s.data(), s.size());  // range checked above
+  *byte_pos += s.size();  // advance past the bytes just written
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_PADDED_BYTES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/printf_macros.h b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h
new file mode 100644
index 0000000000..3215052afd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PRINTF_MACROS_H_
+#define LIB_JXL_BASE_PRINTF_MACROS_H_
+
+// Format string macros. This header should be included after any other system
+// library header, since those may unconditionally define these macros,
+// depending on the platform.
+
+// PRIuS and PRIdS macros to print size_t and ssize_t respectively.
+#if !defined(PRIdS)
+#if defined(_WIN64)
+#define PRIdS "lld"
+#elif defined(_WIN32)
+#define PRIdS "d"
+#else
+#define PRIdS "zd"
+#endif
+#endif  // PRIdS
+
+#if !defined(PRIuS)
+#if defined(_WIN64)
+#define PRIuS "llu"
+#elif defined(_WIN32)
+#define PRIuS "u"
+#else
+#define PRIuS "zu"
+#endif
+#endif  // PRIuS
+
+#endif  // LIB_JXL_BASE_PRINTF_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
new file mode 100644
index 0000000000..a38d9b82b7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
@@ -0,0 +1,540 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/profiler.h"
+
+#if JXL_PROFILER_ENABLED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>  // sort
+#include <atomic>
+#include <cinttypes>  // PRIu64
+#include <hwy/cache_control.h>
+#include <limits>
+#include <new>
+
+// Optionally use SIMD in StreamCacheLine if available.
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace profiler {
+namespace HWY_NAMESPACE {
+
+// Overwrites `to` without loading it into cache (read-for-ownership).
+// Copies 64 bytes from/to naturally aligned addresses.
+void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
+#if HWY_TARGET == HWY_SCALAR
+  hwy::CopyBytes<64>(from, to);  // scalar fallback: plain 64-byte copy
+#else
+  const HWY_CAPPED(uint64_t, 2) d;  // vectors of at most two uint64_t lanes
+  HWY_FENCE;
+  const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
+  const auto v0 = Load(d, from64 + 0);
+  const auto v1 = Load(d, from64 + 2);
+  const auto v2 = Load(d, from64 + 4);
+  const auto v3 = Load(d, from64 + 6);
+  // Fences prevent the compiler from reordering loads/stores, which may
+  // interfere with write-combining.
+  HWY_FENCE;
+  uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
+  Stream(v0, d, to64 + 0);
+  Stream(v1, d, to64 + 2);
+  Stream(v2, d, to64 + 4);
+  Stream(v3, d, to64 + 6);
+  HWY_FENCE;
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace profiler
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace profiler {
+
+HWY_EXPORT(StreamCacheLine);
+
+namespace {
+
+// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that
+// enters at least one zone. Once this buffer is full, the thread will analyze
+// packets (two per zone), which introduces observer overhead.
+#ifndef PROFILER_THREAD_STORAGE
+#define PROFILER_THREAD_STORAGE 32ULL
+#endif
+
+#define PROFILER_PRINT_OVERHEAD 0
+
+// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
+constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
+constexpr size_t kMaxZones = 256;  // Total number of zones.
+
+// Stack of active (entered but not exited) zones. POD, uninitialized.
+// Used to deduct child duration from the parent's self time.
+struct ActiveZone {
+  const char* name;          // copied from the zone's entry packet
+  uint64_t entry_timestamp;  // timestamp of the entry packet
+  uint64_t child_total;      // accumulated duration (+ overhead) of children
+};
+
+// Totals for all Zones with the same name. POD, must be zero-initialized.
+struct ZoneTotals {
+  uint64_t total_duration;  // sum of self durations over all calls
+  const char* name;         // zone name; compared by pointer in UpdateOrAdd
+  uint64_t num_calls;       // number of completed zone invocations
+};
+
+template <typename T>  // Returns minuend - subtrahend, or 0 if that is < 0.
+inline T ClampedSubtract(const T minuend, const T subtrahend) {
+  if (subtrahend > minuend) {  // would wrap around for unsigned T
+    return 0;
+  }
+  return minuend - subtrahend;
+}
+
+}  // namespace
+
+// Per-thread call graph (stack) and ZoneTotals for each zone.
+class Results {
+ public:
+  Results() {
+    // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
+    memset(zones_, 0, sizeof(zones_));
+  }
+
+  // Used for computing overhead when this thread encounters its first Zone.
+  // This has no observable effect apart from increasing "analyze_elapsed_".
+  uint64_t ZoneDuration(const Packet* packets) {
+    HWY_ASSERT(depth_ == 0);
+    HWY_ASSERT(num_zones_ == 0);
+    AnalyzePackets(packets, 2);  // one entry packet plus one exit packet
+    const uint64_t duration = zones_[0].total_duration;
+    zones_[0].num_calls = 0;
+    zones_[0].total_duration = 0;
+    HWY_ASSERT(depth_ == 0);
+    num_zones_ = 0;  // forget the zone again (its name pointer stays behind)
+    return duration;
+  }
+
+  void SetSelfOverhead(const uint64_t self_overhead) {
+    self_overhead_ = self_overhead;
+  }
+
+  void SetChildOverhead(const uint64_t child_overhead) {
+    child_overhead_ = child_overhead;
+  }
+
+  // Draw all required information from the packets, which can be discarded
+  // afterwards. Called whenever this thread's storage is full.
+  void AnalyzePackets(const Packet* HWY_RESTRICT packets,
+                      const size_t num_packets) {
+    // Ensures prior weakly-ordered streaming stores are globally visible.
+    hwy::FlushStream();
+
+    const uint64_t t0 = TicksBefore();
+
+    for (size_t i = 0; i < num_packets; ++i) {
+      const uint64_t timestamp = packets[i].timestamp;
+      // Entering a zone
+      if (packets[i].name != nullptr) {
+        HWY_ASSERT(depth_ < kMaxDepth);
+        zone_stack_[depth_].name = packets[i].name;
+        zone_stack_[depth_].entry_timestamp = timestamp;
+        zone_stack_[depth_].child_total = 0;
+        ++depth_;
+        continue;
+      }
+      // Otherwise this is an exit packet: close the innermost active zone.
+      HWY_ASSERT(depth_ != 0);
+      const ActiveZone& active = zone_stack_[depth_ - 1];
+      const uint64_t duration = timestamp - active.entry_timestamp;
+      const uint64_t self_duration = ClampedSubtract(
+          duration, self_overhead_ + child_overhead_ + active.child_total);
+
+      UpdateOrAdd(active.name, 1, self_duration);
+      --depth_;
+
+      // "Deduct" the nested time from its parent's self_duration.
+      if (depth_ != 0) {
+        zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
+      }
+    }
+
+    const uint64_t t1 = TicksAfter();
+    analyze_elapsed_ += t1 - t0;
+  }
+
+  // Incorporates results from another thread. Call after all threads have
+  // exited any zones.
+  void Assimilate(const Results& other) {
+    const uint64_t t0 = TicksBefore();
+    HWY_ASSERT(depth_ == 0);
+    HWY_ASSERT(other.depth_ == 0);
+
+    for (size_t i = 0; i < other.num_zones_; ++i) {
+      const ZoneTotals& zone = other.zones_[i];
+      UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
+    }
+    const uint64_t t1 = TicksAfter();
+    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+  }
+
+  // Single-threaded.
+  void Print() {
+    const uint64_t t0 = TicksBefore();
+    MergeDuplicates();
+
+    // Sort by decreasing total (self) cost.
+    std::sort(zones_, zones_ + num_zones_,
+              [](const ZoneTotals& r1, const ZoneTotals& r2) {
+                return r1.total_duration > r2.total_duration;
+              });
+
+    uint64_t total_visible_duration = 0;
+    for (size_t i = 0; i < num_zones_; ++i) {
+      const ZoneTotals& r = zones_[i];
+      if (r.name[0] != '@') {  // zones whose name starts with '@' are hidden
+        total_visible_duration += r.total_duration;
+        printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
+               r.num_calls, r.total_duration / r.num_calls, r.total_duration);
+      }
+    }
+
+    const uint64_t t1 = TicksAfter();
+    analyze_elapsed_ += t1 - t0;
+    printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
+    printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
+  }
+
+  // Single-threaded. Clears all results as if no zones had been recorded.
+  void Reset() {
+    analyze_elapsed_ = 0;
+    HWY_ASSERT(depth_ == 0);
+    num_zones_ = 0;
+    memset(zone_stack_, 0, sizeof(zone_stack_));
+    memset(zones_, 0, sizeof(zones_));
+  }
+
+ private:
+  // Updates ZoneTotals of the same name, or inserts a new one if this thread
+  // has not yet seen that name. Uses a self-organizing list data structure,
+  // which avoids dynamic memory allocations and is faster than unordered_map.
+  void UpdateOrAdd(const char* name, const uint64_t num_calls,
+                   const uint64_t duration) {
+    // Special case for first zone: (maybe) update, without swapping.
+    if (zones_[0].name == name) {  // pointer comparison, not strcmp
+      zones_[0].total_duration += duration;
+      zones_[0].num_calls += num_calls;
+      return;
+    }
+
+    // Look for a zone with the same name.
+    for (size_t i = 1; i < num_zones_; ++i) {
+      if (zones_[i].name == name) {
+        zones_[i].total_duration += duration;
+        zones_[i].num_calls += num_calls;
+        // Swap with predecessor (more conservative than move to front,
+        // but at least as successful).
+        std::swap(zones_[i - 1], zones_[i]);
+        return;
+      }
+    }
+
+    // Not found; create a new ZoneTotals.
+    HWY_ASSERT(num_zones_ < kMaxZones);
+    ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
+    zone->name = name;
+    zone->num_calls = num_calls;
+    zone->total_duration = duration;
+    ++num_zones_;
+  }
+
+  // Each instantiation of a function template seems to get its own copy of
+  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
+  // acceptable because we only expect a few dozen zones.
+  void MergeDuplicates() {
+    for (size_t i = 0; i < num_zones_; ++i) {
+      // Add any subsequent duplicates to num_calls and total_duration.
+      for (size_t j = i + 1; j < num_zones_;) {
+        if (!strcmp(zones_[i].name, zones_[j].name)) {
+          zones_[i].num_calls += zones_[j].num_calls;
+          zones_[i].total_duration += zones_[j].total_duration;
+          // Fill hole with last item.
+          zones_[j] = zones_[--num_zones_];
+        } else {  // Name differed, try next ZoneTotals.
+          ++j;
+        }
+      }
+    }
+  }
+
+  uint64_t analyze_elapsed_ = 0;  // ticks spent in AnalyzePackets/Print etc.
+  uint64_t self_overhead_ = 0;    // deducted from every zone's duration
+  uint64_t child_overhead_ = 0;   // additionally charged per nested zone
+
+  size_t depth_ = 0;      // Number of active zones <= kMaxDepth.
+  size_t num_zones_ = 0;  // Number of unique zones <= kMaxZones.
+
+  // After other members to avoid large pointer offsets.
+  alignas(64) ActiveZone zone_stack_[kMaxDepth];  // Last = newest
+  alignas(64) ZoneTotals zones_[kMaxZones];       // Self-organizing list
+};
+
+ThreadSpecific::ThreadSpecific()
+    : max_packets_(PROFILER_THREAD_STORAGE << 16),  // MiB * 2^20 / 16B packets
+      packets_(hwy::AllocateAligned<Packet>(max_packets_)),
+      num_packets_(0),
+      results_(hwy::MakeUniqueAligned<Results>()) {}
+
+ThreadSpecific::~ThreadSpecific() {}  // defined here, where Results is complete
+
+void ThreadSpecific::FlushBuffer() {
+  if (num_packets_ + kBufferCapacity > max_packets_) {  // storage is full
+    results_->AnalyzePackets(packets_.get(), num_packets_);
+    num_packets_ = 0;
+  }
+  // This buffering halves observer overhead and decreases the overall
+  // runtime by about 3%.
+  HWY_DYNAMIC_DISPATCH(StreamCacheLine)
+  (buffer_, packets_.get() + num_packets_);  // copies 64 bytes into storage
+  num_packets_ += kBufferCapacity;  // presumably 64 bytes of packets - confirm
+  buffer_size_ = 0;
+}
+
+void ThreadSpecific::AnalyzeRemainingPackets() {
+  // Storage full => empty it.
+  if (num_packets_ + buffer_size_ > max_packets_) {
+    results_->AnalyzePackets(packets_.get(), num_packets_);
+    num_packets_ = 0;
+  }
+
+  // Move buffer to storage
+  memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
+  num_packets_ += buffer_size_;
+  buffer_size_ = 0;
+  // Analyze everything that is now in storage and leave it empty.
+  results_->AnalyzePackets(packets_.get(), num_packets_);
+  num_packets_ = 0;
+}
+
+namespace {
+
+class HalfSampleMode {  // Estimates the mode of an ascending array of values.
+ public:
+  // Returns mode. "sorted" must be in ascending order.
+  template <typename T>
+  T operator()(const T* const HWY_RESTRICT sorted,
+               const size_t num_values) const {
+    int64_t center = num_values / 2;
+    int64_t width = num_values;
+
+    // Zoom in on modal intervals of decreasing width. Stop before we reach
+    // width=1, i.e. single values, for which there is no "slope".
+    while (width > 2) {
+      // Round up so we can still reach the outer edges of odd widths.
+      width = (width + 1) / 2;
+
+      center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
+    }
+
+    return sorted[center];  // mode := middle value in modal interval.
+  }
+
+ private:
+  // Returns center of the densest region [c-radius, c+radius].
+  template <typename T>
+  static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
+      const T* HWY_RESTRICT sorted, const int64_t total_values,
+      const int64_t center, const int64_t width) {
+    const int64_t radius = (width + 1) / 2;
+    // Slope = value range of the interval divided by its number of elements.
+    auto compute_slope = [radius, total_values, sorted](
+                             int64_t c, int64_t* actual_center = nullptr) {
+      // For symmetry, check 2*radius+1 values, i.e. [min, max].
+      const int64_t min = std::max(c - radius, int64_t(0));
+      const int64_t max = std::min(c + radius, total_values - 1);
+      HWY_ASSERT(min < max);
+      HWY_ASSERT(sorted[min] <=
+                 sorted[max] + std::numeric_limits<float>::epsilon());
+      const float dx = max - min + 1;
+      const float slope = (sorted[max] - sorted[min]) / dx;
+
+      if (actual_center != nullptr) {
+        // c may be out of bounds, so return center of the clamped bounds.
+        *actual_center = (min + max + 1) / 2;
+      }
+      return slope;
+    };
+
+    // First find min_slope for all centers.
+    float min_slope = std::numeric_limits<float>::max();
+    for (int64_t c = center - radius; c <= center + radius; ++c) {
+      min_slope = std::min(min_slope, compute_slope(c));
+    }
+
+    // Candidates := centers with slope ~= min_slope.
+    std::vector<int64_t> candidates;
+    for (int64_t c = center - radius; c <= center + radius; ++c) {
+      int64_t actual_center;
+      const float slope = compute_slope(c, &actual_center);
+      if (slope <= min_slope * 1.001f) {  // within 0.1% of the minimum
+        candidates.push_back(actual_center);
+      }
+    }
+
+    // Keep the median.
+    HWY_ASSERT(!candidates.empty());
+    if (candidates.size() == 1) return candidates[0];
+    std::nth_element(candidates.begin(),
+                     candidates.begin() + candidates.size() / 2,
+                     candidates.end());
+    return candidates[candidates.size() / 2];
+  }
+};
+
+}  // namespace
+
+void ThreadSpecific::ComputeOverhead() {
+  // Delay after capturing timestamps before/after the actual zone runs. Even
+  // with frequency throttling disabled, this has a multimodal distribution,
+  // including 32, 34, 48, 52, 59, 62.
+  uint64_t self_overhead;
+  {
+    const size_t kNumSamples = 32;
+    uint32_t samples[kNumSamples];
+    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+      const size_t kNumDurations = 1024;
+      uint32_t durations[kNumDurations];
+
+      for (size_t idx_duration = 0; idx_duration < kNumDurations;
+           ++idx_duration) {
+        {  //
+          PROFILER_ZONE("Dummy Zone (never shown)");
+        }
+        const uint64_t duration = results_->ZoneDuration(buffer_);
+        buffer_size_ = 0;  // discard the two packets of the dummy zone
+        durations[idx_duration] = static_cast<uint32_t>(duration);
+        HWY_ASSERT(num_packets_ == 0);
+      }
+      std::sort(durations, durations + kNumDurations);
+      samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+    }
+    // Median.
+    std::sort(samples, samples + kNumSamples);
+    self_overhead = samples[kNumSamples / 2];
+#if PROFILER_PRINT_OVERHEAD
+    printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead));
+#endif
+    results_->SetSelfOverhead(self_overhead);
+  }
+
+  // Delay before capturing start timestamp / after end timestamp.
+  const size_t kNumSamples = 32;
+  uint32_t samples[kNumSamples];
+  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+    const size_t kNumDurations = 16;
+    uint32_t durations[kNumDurations];
+    for (size_t idx_duration = 0; idx_duration < kNumDurations;
+         ++idx_duration) {
+      const size_t kReps = 10000;
+      // Analysis time should not be included => must fit within buffer.
+      HWY_ASSERT(kReps * 2 < max_packets_);
+      hwy::FlushStream();
+      const uint64_t t0 = TicksBefore();
+      for (size_t i = 0; i < kReps; ++i) {
+        PROFILER_ZONE("Dummy");
+      }
+      hwy::FlushStream();
+      const uint64_t t1 = TicksAfter();
+      HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
+      buffer_size_ = 0;  // discard the dummy packets without analyzing them
+      num_packets_ = 0;
+      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;  // rounded
+      durations[idx_duration] =
+          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
+    }
+    std::sort(durations, durations + kNumDurations);
+    samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+  }
+  std::sort(samples, samples + kNumSamples);
+  const uint64_t child_overhead = samples[9 * kNumSamples / 10];  // ~90th pct.
+#if PROFILER_PRINT_OVERHEAD
+  printf("Child overhead: %" PRIu64 "\n",
+         static_cast<uint64_t>(child_overhead));
+#endif
+  results_->SetChildOverhead(child_overhead);
+}
+
+namespace {
+
+// Could be a static member of Zone, but that would expose <atomic> in header.
+std::atomic<ThreadSpecific*>& GetHead() {  // head of the ThreadSpecific list
+  static std::atomic<ThreadSpecific*> head_{nullptr};  // Owning
+  return head_;
+}
+
+}  // namespace
+
+// Thread-safe. Creates a ThreadSpecific and pushes it onto the global list.
+ThreadSpecific* Zone::InitThreadSpecific() {
+  ThreadSpecific* thread_specific =
+      hwy::MakeUniqueAligned<ThreadSpecific>().release();  // list takes over
+
+  // Insert into unordered list
+  std::atomic<ThreadSpecific*>& head = GetHead();
+  ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
+  thread_specific->SetNext(old_head);
+  while (!head.compare_exchange_weak(old_head, thread_specific,
+                                     std::memory_order_release,
+                                     std::memory_order_relaxed)) {
+    thread_specific->SetNext(old_head);  // old_head was refreshed by the CAS
+    // TODO(janwas): pause
+  }
+
+  // ComputeOverhead also creates a Zone, so this needs to be set before that
+  // to prevent infinite recursion.
+  GetThreadSpecific() = thread_specific;
+
+  thread_specific->ComputeOverhead();
+  return thread_specific;
+}
+
+// Single-threaded. Merges every thread's results into the head and prints it.
+/*static*/ void Zone::PrintResults() {
+  ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
+  ThreadSpecific* p = head;
+  while (p) {
+    p->AnalyzeRemainingPackets();  // drain buffered and stored packets first
+
+    // Combine all threads into a single Result.
+    if (p != head) {
+      head->GetResults().Assimilate(p->GetResults());
+      p->GetResults().Reset();
+    }
+
+    p = p->GetNext();
+  }
+
+  if (head != nullptr) {  // nullptr if no ThreadSpecific was ever registered
+    head->GetResults().Print();
+    head->GetResults().Reset();
+  }
+}
+
+}  // namespace profiler
+}  // namespace jxl
+
+#endif  // HWY_ONCE
+#endif  // JXL_PROFILER_ENABLED
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.h b/third_party/jpeg-xl/lib/jxl/base/profiler.h
new file mode 100644
index 0000000000..4c0efa4b3a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.h
@@ -0,0 +1,170 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PROFILER_H_
+#define LIB_JXL_BASE_PROFILER_H_
+
+// High precision, low overhead time measurements. Returns exact call counts and
+// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
+//
+// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which
+// defines JXL_PROFILER_ENABLED.
+//
+// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or
+// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
+// print call counts and average durations [CPU cycles] to stdout, sorted in
+// descending order of total duration.
+
+// If zero, this file has no effect and no measurements will be recorded.
+#ifndef JXL_PROFILER_ENABLED
+#define JXL_PROFILER_ENABLED 0
+#endif
+#if JXL_PROFILER_ENABLED
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/aligned_allocator.h>
+#include <hwy/base.h>
+
+#include "lib/jxl/base/tsc_timer.h"
+
+#if HWY_COMPILER_MSVC
+#define PROFILER_PUBLIC
+#else
+#define PROFILER_PUBLIC __attribute__((visibility("default")))
+#endif
+
+namespace jxl {
+namespace profiler {
+
+// Represents zone entry/exit events. POD.
+#pragma pack(push, 1)
+struct Packet {
+  // Computing a hash or string table is likely too expensive, and offsets
+  // from other libraries' string literals can be too large to combine them and
+  // a full-resolution timestamp into 64 bits.
+  uint64_t timestamp;
+  const char* name;  // nullptr for exit packets
+#if UINTPTR_MAX <= 0xFFFFFFFFu
+  uint32_t padding;
+#endif
+};
+#pragma pack(pop)
+static_assert(sizeof(Packet) == 16, "Wrong Packet size");
+
+class Results;  // pImpl
+
+// Per-thread packet storage, dynamically allocated and aligned.
+class ThreadSpecific {
+  static constexpr size_t kBufferCapacity = 64 / sizeof(Packet);
+
+ public:
+  PROFILER_PUBLIC explicit ThreadSpecific();
+  PROFILER_PUBLIC ~ThreadSpecific();
+
+  // Depends on Zone => defined out of line.
+  PROFILER_PUBLIC void ComputeOverhead();
+
+  HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); }
+  HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); }
+
+  PROFILER_PUBLIC void AnalyzeRemainingPackets();
+
+  // Accessors instead of public member for well-defined data layout.
+  void SetNext(ThreadSpecific* next) { next_ = next; }
+  ThreadSpecific* GetNext() const { return next_; }
+
+  Results& GetResults() { return *results_; }
+
+ private:
+  PROFILER_PUBLIC void FlushBuffer();
+
+  // Write packet to buffer/storage, emptying them as needed.
+  void Write(const char* name, const uint64_t timestamp) {
+    if (buffer_size_ == kBufferCapacity) {  // Full
+      FlushBuffer();
+    }
+    buffer_[buffer_size_].name = name;
+    buffer_[buffer_size_].timestamp = timestamp;
+    ++buffer_size_;
+  }
+
+  // Write-combining buffer to avoid cache pollution. Must be the first
+  // non-static member to ensure cache-line alignment.
+  Packet buffer_[kBufferCapacity];
+  size_t buffer_size_ = 0;
+
+  // Contiguous storage for zone enter/exit packets.
+  const size_t max_packets_;
+  hwy::AlignedFreeUniquePtr<Packet[]> packets_;
+  size_t num_packets_;
+
+  // Linked list of all threads.
+  ThreadSpecific* next_ = nullptr;  // Owned, never released.
+
+  hwy::AlignedUniquePtr<Results> results_;
+};
+
+// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also
+// responsible for initializing ThreadSpecific.
+class Zone {
+ public:
+  HWY_NOINLINE explicit Zone(const char* name) {
+    HWY_FENCE;
+    ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific();
+    if (HWY_UNLIKELY(thread_specific == nullptr)) {
+      thread_specific = InitThreadSpecific();
+    }
+
+    thread_specific->WriteEntry(name);
+  }
+
+  HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); }
+
+  // Call exactly once after all threads have exited all zones.
+  PROFILER_PUBLIC static void PrintResults();
+
+ private:
+  // Returns reference to the thread's ThreadSpecific pointer (initially null).
+  // Function-local static avoids needing a separate definition.
+  static ThreadSpecific*& GetThreadSpecific() {
+    static thread_local ThreadSpecific* thread_specific;
+    return thread_specific;
+  }
+
+  // Non time-critical.
+  PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific();
+};
+
+// Creates a zone starting from here until the end of the current scope.
+// Timestamps will be recorded when entering and exiting the zone.
+// To ensure the name pointer remains valid, we require it to be a string
+// literal (by merging with ""). We also compare strings by address.
+#define PROFILER_ZONE(name)                  \
+  HWY_FENCE;                                 \
+  const ::jxl::profiler::Zone zone("" name); \
+  HWY_FENCE
+
+// Creates a zone for an entire function (when placed at its beginning).
+// Shorter/more convenient than ZONE.
+#define PROFILER_FUNC                         \
+  HWY_FENCE;                                  \
+  const ::jxl::profiler::Zone zone(__func__); \
+  HWY_FENCE
+
+#define PROFILER_PRINT_RESULTS ::jxl::profiler::Zone::PrintResults
+
+}  // namespace profiler
+}  // namespace jxl
+
+#else  // !JXL_PROFILER_ENABLED
+#define PROFILER_ZONE(name)
+#define PROFILER_FUNC
+#define PROFILER_PRINT_RESULTS()
+#endif
+
+#endif  // LIB_JXL_BASE_PROFILER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/random.cc b/third_party/jpeg-xl/lib/jxl/base/random.cc
new file mode 100644
index 0000000000..c99f88921c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/random.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/random.h"
+
+#include <cmath>
+
+namespace jxl {
+
+Rng::GeometricDistribution::GeometricDistribution(float p)
+    : inv_log_1mp(1.0 / std::log(1 - p)) {}  // Cached 1 / ln(1 - p).
+
+uint32_t Rng::Geometric(const GeometricDistribution& dist) {
+  float f = UniformF(0, 1);  // In [0, 1).
+  float log = std::log(1 - f) * dist.inv_log_1mp;  // ln(1 - f) / ln(1 - p)
+  return static_cast<uint32_t>(log);  // Conversion drops the fraction.
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/random.h b/third_party/jpeg-xl/lib/jxl/base/random.h
new file mode 100644
index 0000000000..663b88c95d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/random.h
@@ -0,0 +1,95 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_RANDOM_
+#define LIB_JXL_BASE_RANDOM_
+
+// Random number generator + distributions.
+// We don't use <random> because the implementation (and thus results) differs
+// between libstdc++ and libc++.
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+struct Rng {
+  explicit Rng(size_t seed)
+      : s{static_cast<uint64_t>(0x94D049BB133111EBull),
+          static_cast<uint64_t>(0xBF58476D1CE4E5B9ull) + seed} {}
+
+  // Xorshift128+ adapted from xorshift128+-inl.h
+  uint64_t operator()() {
+    uint64_t s1 = s[0];
+    const uint64_t s0 = s[1];
+    const uint64_t bits = s1 + s0;  // b, c
+    s[0] = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s[1] = s1;
+    return bits;
+  }
+
+  // Uniformly distributed int64_t in [begin, end), under the assumption that
+  // `end-begin` is significantly smaller than 1<<64, otherwise there is some
+  // bias.
+  int64_t UniformI(int64_t begin, int64_t end) {
+    JXL_DASSERT(end > begin);
+    return static_cast<int64_t>((*this)() %
+                                static_cast<uint64_t>(end - begin)) +
+           begin;
+  }
+
+  // Same as UniformI, but for uint64_t.
+  uint64_t UniformU(uint64_t begin, uint64_t end) {
+    JXL_DASSERT(end > begin);
+    return (*this)() % (end - begin) + begin;
+  }
+
+  // Uniformly distributed float in [begin, end) range. Note: only 23 bits of
+  // randomness.
+  float UniformF(float begin, float end) {
+    float f;
+    // Bits of a random [1, 2) float.
+    uint32_t u = ((*this)() >> (64 - 23)) | 0x3F800000;
+    static_assert(sizeof(f) == sizeof(u),
+                  "Float and U32 must have the same size");
+    memcpy(&f, &u, sizeof(f));
+    // Note: (end-begin) * f + (2*begin-end) may fail to return a number >=
+    // begin.
+    return (end - begin) * (f - 1.0f) + begin;
+  }
+
+  // Bernoulli trial
+  bool Bernoulli(float p) { return UniformF(0, 1) < p; }
+
+  // State for geometric distributions.
+  struct GeometricDistribution {
+    explicit GeometricDistribution(float p);
+
+   private:
+    float inv_log_1mp;
+    friend struct Rng;
+  };
+
+  uint32_t Geometric(const GeometricDistribution& dist);
+
+  template <typename T>
+  void Shuffle(T* t, size_t n) {
+    for (size_t i = 0; i + 1 < n; i++) {
+      size_t a = UniformU(i, n);
+      std::swap(t[a], t[i]);
+    }
+  }
+
+ private:
+  uint64_t s[2];
+};
+
+}  // namespace jxl
+#endif  // LIB_JXL_BASE_RANDOM_
diff --git a/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
new file mode 100644
index 0000000000..315f3bd003
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+#define LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#ifdef ADDRESS_SANITIZER
+#define JXL_ADDRESS_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define JXL_ADDRESS_SANITIZER 1
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+
+#ifdef THREAD_SANITIZER
+#define JXL_THREAD_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define JXL_THREAD_SANITIZER 1
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
+#endif  // LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/scope_guard.h b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h
new file mode 100644
index 0000000000..a18a44cb79
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SCOPE_GUARD_H_
+#define LIB_JXL_BASE_SCOPE_GUARD_H_
+
+#include <utility>
+
+namespace jxl {
+
+template <typename Callback>
+class ScopeGuard {  // Invokes a callback at scope exit unless disarmed.
+ public:
+  // Discourage unnecessary moves / copies.
+  ScopeGuard(const ScopeGuard &) = delete;
+  ScopeGuard &operator=(const ScopeGuard &) = delete;
+  ScopeGuard &operator=(ScopeGuard &&) = delete;
+
+  // Pre-C++17 does not guarantee RVO -> require move constructor.
+  ScopeGuard(ScopeGuard &&other)
+      : callback_(std::move(other.callback_)), armed_(other.armed_) {
+    other.armed_ = false;  // Only the new guard may still run the callback.
+  }
+
+  template <typename CallbackParam>
+  explicit ScopeGuard(CallbackParam &&callback)
+      : callback_(std::forward<CallbackParam>(callback)), armed_(true) {}
+
+  // Runs the callback unless Disarm() was called or *this was moved from.
+  ~ScopeGuard() { if (armed_) callback_(); }
+
+  void Disarm() { armed_ = false; }
+
+ private:
+  Callback callback_;
+  bool armed_;
+};
+
+template <typename Callback>
+ScopeGuard<Callback> MakeScopeGuard(Callback &&callback) {
+  return ScopeGuard<Callback>{std::forward<Callback>(callback)};
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_SCOPE_GUARD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/span.h b/third_party/jpeg-xl/lib/jxl/base/span.h
new file mode 100644
index 0000000000..41c3623a4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/span.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SPAN_H_
+#define LIB_JXL_BASE_SPAN_H_
+
+// Span (array view) is a non-owning container that provides cheap "cut"
+// operations and could be used as "ArrayLike" data source for PaddedBytes.
+
+#include <stddef.h>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+template <typename T>
+class Span {
+ public:
+  constexpr Span() noexcept : Span(nullptr, 0) {}
+
+  constexpr Span(T* array, size_t length) noexcept
+      : ptr_(array), len_(length) {}
+
+  template <size_t N>
+  explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {}
+
+  template <typename ArrayLike>
+  explicit constexpr Span(const ArrayLike& other) noexcept
+      : Span(reinterpret_cast<T*>(other.data()), other.size()) {
+    static_assert(sizeof(*other.data()) == sizeof(T),
+                  "Incompatible type of source.");
+  }
+
+  constexpr T* data() const noexcept { return ptr_; }
+
+  constexpr size_t size() const noexcept { return len_; }
+
+  constexpr bool empty() const noexcept { return len_ == 0; }
+
+  constexpr T& operator[](size_t i) const noexcept {
+    // MSVC 2015 accepts this as constexpr, but not ptr_[i]
+    return *(data() + i);
+  }
+
+  void remove_prefix(size_t n) noexcept {
+    JXL_ASSERT(size() >= n);
+    ptr_ += n;
+    len_ -= n;
+  }
+
+ private:
+  T* ptr_;
+  size_t len_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_SPAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/status.h b/third_party/jpeg-xl/lib/jxl/base/status.h
new file mode 100644
index 0000000000..f40be0c434
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/status.h
@@ -0,0 +1,326 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_STATUS_H_
+#define LIB_JXL_BASE_STATUS_H_
+
+// Error handling: Status return type + helper macros.
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/sanitizer_definitions.h"
+
+#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER
+#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
+#endif                                        // defined(*_SANITIZER)
+
+namespace jxl {
+
+// Uncomment to abort when JXL_FAILURE or JXL_STATUS with a fatal error is
+// reached:
+// #define JXL_CRASH_ON_ERROR
+
+#ifndef JXL_ENABLE_ASSERT
+#define JXL_ENABLE_ASSERT 1
+#endif
+
+#ifndef JXL_ENABLE_CHECK
+#define JXL_ENABLE_CHECK 1
+#endif
+
+// Pass -DJXL_DEBUG_ON_ERROR at compile time to print debug messages when a
+// function returns JXL_FAILURE or calls JXL_NOTIFY_ERROR. Note that this is
+// irrelevant if you also pass -DJXL_CRASH_ON_ERROR.
+#if defined(JXL_DEBUG_ON_ERROR) || defined(JXL_CRASH_ON_ERROR)
+#undef JXL_DEBUG_ON_ERROR
+#define JXL_DEBUG_ON_ERROR 1
+#else  // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR
+#ifdef NDEBUG
+#define JXL_DEBUG_ON_ERROR 0
+#else  // NDEBUG
+#define JXL_DEBUG_ON_ERROR 1
+#endif  // NDEBUG
+#endif  // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR
+
+// Pass -DJXL_DEBUG_ON_ALL_ERROR at compile time to print debug messages on
+// all error (fatal and non-fatal) status. This implies JXL_DEBUG_ON_ERROR.
+#if defined(JXL_DEBUG_ON_ALL_ERROR)
+#undef JXL_DEBUG_ON_ALL_ERROR
+#define JXL_DEBUG_ON_ALL_ERROR 1
+// JXL_DEBUG_ON_ALL_ERROR implies JXL_DEBUG_ON_ERROR too.
+#undef JXL_DEBUG_ON_ERROR
+#define JXL_DEBUG_ON_ERROR 1
+#else  // JXL_DEBUG_ON_ALL_ERROR
+#define JXL_DEBUG_ON_ALL_ERROR 0
+#endif  // JXL_DEBUG_ON_ALL_ERROR
+
+// The Verbose level for the library
+#ifndef JXL_DEBUG_V_LEVEL
+#define JXL_DEBUG_V_LEVEL 0
+#endif  // JXL_DEBUG_V_LEVEL
+
+// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT,
+// JXL_CHECK and JXL_ABORT.
+#ifndef JXL_DEBUG_ON_ABORT
+#define JXL_DEBUG_ON_ABORT 1
+#endif  // JXL_DEBUG_ON_ABORT
+
+// Print a debug message on standard error. You should use the JXL_DEBUG macro
+// instead of calling Debug directly. This function returns false, so it can be
+// used as a return value in JXL_FAILURE.
+JXL_FORMAT(1, 2)
+inline JXL_NOINLINE bool Debug(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+  return false;
+}
+
+// Print a debug message on standard error if "enabled" is true. "enabled" is
+// normally a macro that evaluates to 0 or 1 at compile time, so the Debug
+// function is never called and optimized out in release builds. Note that the
+// arguments are compiled but not evaluated when enabled is false. The format
+// string must be an explicit string literal in the call, for example:
+//   JXL_DEBUG(JXL_DEBUG_MYMODULE, "my module message: %d", some_var);
+// Add a header at the top of your module's .cc or .h file (depending on whether
+// you have JXL_DEBUG calls from the .h as well) like this:
+//   #ifndef JXL_DEBUG_MYMODULE
+//   #define JXL_DEBUG_MYMODULE 0
+//   #endif  // JXL_DEBUG_MYMODULE
+#define JXL_DEBUG_TMP(format, ...) \
+  ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define JXL_DEBUG(enabled, format, ...)     \
+  do {                                      \
+    if (enabled) {                          \
+      JXL_DEBUG_TMP(format, ##__VA_ARGS__); \
+    }                                       \
+  } while (0)
+
+// JXL_DEBUG version that prints the debug message if the global verbose level
+// defined at compile time by JXL_DEBUG_V_LEVEL is greater or equal than the
+// passed level.
+#define JXL_DEBUG_V(level, format, ...) \
+  JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__)
+
+// Warnings (via JXL_WARNING) are enabled if JXL_DEBUG_WARNING is defined, and
+// otherwise only in builds where NDEBUG is not defined.
+#ifdef JXL_DEBUG_WARNING
+#undef JXL_DEBUG_WARNING
+#define JXL_DEBUG_WARNING 1
+#else  // JXL_DEBUG_WARNING
+#ifdef NDEBUG
+#define JXL_DEBUG_WARNING 0
+#else  // NDEBUG
+#define JXL_DEBUG_WARNING 1
+#endif  // NDEBUG
+#endif  // JXL_DEBUG_WARNING
+#define JXL_WARNING(format, ...) \
+  JXL_DEBUG(JXL_DEBUG_WARNING, format, ##__VA_ARGS__)
+
+// Exits the program after printing a stack trace when possible. Never returns.
+JXL_NORETURN inline JXL_NOINLINE bool Abort() {
+#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER
+  // If compiled with any sanitizer print a stack trace. This call doesn't crash
+  // the program, instead the trap below will crash it also allowing gdb to
+  // break there.
+  __sanitizer_print_stack_trace();
+#endif  // *_SANITIZER
+
+#if JXL_COMPILER_MSVC
+  __debugbreak();
+  abort();
+#else
+  __builtin_trap();
+#endif
+}
+
+// Exits the program after printing file/line plus a formatted string.
+#define JXL_ABORT(format, ...)                                              \
+  ((JXL_DEBUG_ON_ABORT) && ::jxl::Debug(("%s:%d: JXL_ABORT: " format "\n"), \
+                                        __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort())
+
+// Does not guarantee running the code, use only for debug mode checks.
+#if JXL_ENABLE_ASSERT
+#define JXL_ASSERT(condition)                                      \
+  do {                                                             \
+    if (!(condition)) {                                            \
+      JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_ASSERT: %s", #condition); \
+      ::jxl::Abort();                                              \
+    }                                                              \
+  } while (0)
+#else
+#define JXL_ASSERT(condition) \
+  do {                        \
+  } while (0)
+#endif
+
+// Define JXL_IS_DEBUG_BUILD that denotes asan, msan and other debug builds,
+// but not opt or release.
+#ifndef JXL_IS_DEBUG_BUILD
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) ||         \
+    defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) || \
+    defined(__clang_analyzer__)
+#define JXL_IS_DEBUG_BUILD 1
+#else
+#define JXL_IS_DEBUG_BUILD 0
+#endif
+#endif  //  JXL_IS_DEBUG_BUILD
+
+// Same as above, but only runs in debug builds (builds where NDEBUG is not
+// defined). This is useful for slower asserts that we want to run more rarely
+// than usual. These will run on asan, msan and other debug builds, but not in
+// opt or release.
+#if JXL_IS_DEBUG_BUILD
+#define JXL_DASSERT(condition)                                      \
+  do {                                                              \
+    if (!(condition)) {                                             \
+      JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_DASSERT: %s", #condition); \
+      ::jxl::Abort();                                               \
+    }                                                               \
+  } while (0)
+#else
+#define JXL_DASSERT(condition) \
+  do {                         \
+  } while (0)
+#endif
+
+// Always runs the condition, so can be used for non-debug calls.
+#if JXL_ENABLE_CHECK
+#define JXL_CHECK(condition)                                      \
+  do {                                                            \
+    if (!(condition)) {                                           \
+      JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_CHECK: %s", #condition); \
+      ::jxl::Abort();                                             \
+    }                                                             \
+  } while (0)
+#else
+#define JXL_CHECK(condition) \
+  do {                       \
+    (void)(condition);       \
+  } while (0)
+#endif
+
+// A jxl::Status value from a StatusCode or Status which prints a debug message
+// when enabled.
+#define JXL_STATUS(status, format, ...)                                        \
+  ::jxl::StatusMessage(::jxl::Status(status), "%s:%d: " format "\n", __FILE__, \
+                       __LINE__, ##__VA_ARGS__)
+
+// Notify of an error but discard the resulting Status value. This is only
+// useful for debug builds or when building with JXL_CRASH_ON_ERROR.
+#define JXL_NOTIFY_ERROR(format, ...)                                      \
+  (void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_ERROR: " format, \
+                   ##__VA_ARGS__)
+
+// An error Status with a message. The JXL_STATUS() macro will return a Status
+// object with a kGenericError code, but the comma operator helps with
+// clang-tidy inference and potentially with optimizations.
+#define JXL_FAILURE(format, ...)                                              \
+  ((void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_FAILURE: " format, \
+                    ##__VA_ARGS__),                                           \
+   ::jxl::Status(::jxl::StatusCode::kGenericError))
+
+// Always evaluates the status exactly once, so can be used for non-debug calls.
+// Returns from the current context if the passed Status expression is an error
+// (fatal or non-fatal). The return value is the passed Status.
+#define JXL_RETURN_IF_ERROR(status)                                       \
+  do {                                                                    \
+    ::jxl::Status jxl_return_if_error_status = (status);                  \
+    if (!jxl_return_if_error_status) {                                    \
+      (void)::jxl::StatusMessage(                                         \
+          jxl_return_if_error_status,                                     \
+          "%s:%d: JXL_RETURN_IF_ERROR code=%d: %s\n", __FILE__, __LINE__, \
+          static_cast<int>(jxl_return_if_error_status.code()), #status);  \
+      return jxl_return_if_error_status;                                  \
+    }                                                                     \
+  } while (0)
+
+// As above, but without calling StatusMessage. Intended for bundles (see
+// fields.h), which have numerous call sites (-> relevant for code size) and do
+// not want to generate excessive messages when decoding partial headers.
+#define JXL_QUIET_RETURN_IF_ERROR(status)                \
+  do {                                                   \
+    ::jxl::Status jxl_return_if_error_status = (status); \
+    if (!jxl_return_if_error_status) {                   \
+      return jxl_return_if_error_status;                 \
+    }                                                    \
+  } while (0)
+
+enum class StatusCode : int32_t {
+  // Non-fatal errors (negative values).
+  kNotEnoughBytes = -1,
+
+  // The only non-error status code.
+  kOk = 0,
+
+  // Fatal-errors (positive values)
+  kGenericError = 1,
+};
+
+// Drop-in replacement for bool that raises compiler warnings if not used
+// after being returned from a function. Example:
+// Status LoadFile(...) { return true; } is more compact than
+// bool JXL_MUST_USE_RESULT LoadFile(...) { return true; }
+// In case of error, the status can carry an extra error code in its value which
+// is split between fatal and non-fatal error codes.
+class JXL_MUST_USE_RESULT Status {
+ public:
+  // We want implicit constructor from bool to allow returning "true" or "false"
+  // on a function when using Status. "true" means kOk while "false" means a
+  // generic fatal error.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr Status(bool ok)
+      : code_(ok ? StatusCode::kOk : StatusCode::kGenericError) {}
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr Status(StatusCode code) : code_(code) {}
+
+  // We also want implicit cast to bool to check for return values of functions.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr operator bool() const { return code_ == StatusCode::kOk; }
+
+  constexpr StatusCode code() const { return code_; }
+
+  // Returns whether the status code is a fatal error.
+  constexpr bool IsFatalError() const {
+    return static_cast<int32_t>(code_) > 0;
+  }
+
+ private:
+  StatusCode code_;
+};
+
+// Helper function that returns `status` unchanged, after printing the debug
+// message and/or aborting when the build configuration asks for it.
+inline JXL_FORMAT(2, 3) Status
+    StatusMessage(const Status status, const char* format, ...) {
+  // This block will be optimized out when JXL_DEBUG_ON_ERROR and
+  // JXL_DEBUG_ON_ALL_ERROR are both disabled.
+  if ((JXL_DEBUG_ON_ERROR && status.IsFatalError()) ||
+      (JXL_DEBUG_ON_ALL_ERROR && !status)) {
+    va_list args;
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+  }
+#ifdef JXL_CRASH_ON_ERROR
+  // JXL_CRASH_ON_ERROR means to Abort() only on fatal errors.
+  if (status.IsFatalError()) {
+    Abort();
+  }
+#endif  // JXL_CRASH_ON_ERROR
+  return status;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_STATUS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
new file mode 100644
index 0000000000..74d51f72d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
@@ -0,0 +1,172 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_TSC_TIMER_H_
+#define LIB_JXL_BASE_TSC_TIMER_H_
+
+// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
+// ensure exactly the desired regions are measured.
+
+#include <stdint.h>
+#include <time.h>  // clock_gettime
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif  // WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#ifndef NOGDI
+#define NOGDI
+#endif  // NOGDI
+#include <windows.h>
+// Undef macros to avoid collisions
+#undef LoadFence
+#endif
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <OS.h>
+#endif
+
+#include <ctime>
+#include <hwy/base.h>
+#include <hwy/cache_control.h>  // LoadFence
+
+namespace jxl {
+namespace profiler {
+
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
+
+// TicksBefore/After return absolute timestamps and must be placed immediately
+// before and after the region to measure. We provide separate Before/After
+// functions because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to avoid
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE.
+//
+// Using Before+Before leads to higher variance and overhead than After+After.
+// However, After+After includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Before+After
+// is faster than Before+Before and more consistent than After+After because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
+
+// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
+// divide by InvariantTicksPerSecond.
+static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksBefore() {
+  Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  hwy::LoadFence();
+  HWY_FENCE;
+  t = __rdtsc();
+  hwy::LoadFence();
+  HWY_FENCE;
+#elif HWY_ARCH_X86_64
+  asm volatile(
+      "lfence\n\t"
+      "rdtsc\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rdx", "memory", "cc");
+#elif HWY_ARCH_RVV
+  asm volatile("rdcycle %0" : "=r"(t));
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER counter;
+  (void)QueryPerformanceCounter(&counter);
+  t = counter.QuadPart;
+#elif defined(__APPLE__)
+  t = mach_absolute_time();
+#elif defined(__HAIKU__)
+  t = system_time_nsecs();  // since boot
+#else  // POSIX
+  timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
+#endif
+  return t;
+}
+
+static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksAfter() {
+  Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  HWY_FENCE;
+  unsigned aux;
+  t = __rdtscp(&aux);
+  hwy::LoadFence();
+  HWY_FENCE;
+#elif HWY_ARCH_X86_64
+  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+  asm volatile(
+      "rdtscp\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rcx", "rdx", "memory", "cc");
+#else
+  t = TicksBefore();  // no difference on other platforms.
+#endif
+  return t;
+}
+
+}  // namespace profiler
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_TSC_TIMER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc b/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc
new file mode 100644
index 0000000000..24cc9b64e8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc
@@ -0,0 +1,262 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(BitReaderTest, ExtendsWithZeroes) {
+  for (size_t size = 4; size < 32; ++size) {
+    std::vector<uint8_t> data(size, 0xff);
+
+    for (size_t n_bytes = 0; n_bytes < size; n_bytes++) {
+      BitReader br(Span<const uint8_t>(data.data(), n_bytes));
+      // Read all the declared bits; the buffer is all 0xff so each one is 1.
+      for (size_t i = 0; i < n_bytes * kBitsPerByte; i++) {
+        ASSERT_EQ(br.ReadBits(1), 1u) << "n_bytes=" << n_bytes << " i=" << i;
+      }
+
+      // PEEK more than the declared size - all will be zero. Cannot consume.
+      for (size_t i = 0; i < BitReader::kMaxBitsPerCall; i++) {
+        ASSERT_EQ(br.PeekBits(i), 0u)
+            << "size=" << size << " n_bytes=" << n_bytes << " i=" << i;
+      }
+
+      EXPECT_TRUE(br.Close());
+    }
+  }
+}
+
+struct Symbol {
+  uint32_t num_bits;  // bit width used for both Write and ReadBits
+  uint32_t value;     // value that was written and is expected back
+};
+
+// Reading back what BitWriter produced yields the same values, in order.
+TEST(BitReaderTest, TestRoundTrip) {
+  test::ThreadPoolForTests pool(8);
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, 1000, ThreadPool::NoInit,
+      [](const uint32_t task, size_t /* thread */) {
+        constexpr size_t kMaxBits = 8000;
+        BitWriter writer;
+        BitWriter::Allotment allotment(&writer, kMaxBits);
+        // Remember what was written so the reader's output can be checked.
+        std::vector<Symbol> symbols;
+        symbols.reserve(1000);
+        // Each task derives its own seed from the task index.
+        Rng rng(55537 + 129 * task);
+
+        for (;;) {
+          const uint32_t num_bits = rng.UniformU(1, 33);
+          if (writer.BitsWritten() + num_bits > kMaxBits) break;
+          const uint32_t value = rng.UniformU(0, 1ULL << num_bits);
+          symbols.push_back({num_bits, value});
+          writer.Write(num_bits, value);
+        }
+        // Read everything back in the order it was written.
+        writer.ZeroPadToByte();
+        allotment.ReclaimAndCharge(&writer, 0, nullptr);
+        BitReader reader(writer.GetSpan());
+        for (const Symbol& s : symbols) {
+          EXPECT_EQ(s.value, reader.ReadBits(s.num_bits));
+        }
+        EXPECT_TRUE(reader.Close());
+      },
+      "TestTBitReaderRoundTrip"));
+}
+
+// SkipBits is the same as reading that many bits.
+TEST(BitReaderTest, TestSkip) {
+  test::ThreadPoolForTests pool(8);
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, 96, ThreadPool::NoInit,
+      [](const uint32_t task, size_t /* thread */) {
+        constexpr size_t kSize = 100;
+
+        for (size_t skip = 0; skip < 128; ++skip) {
+          BitWriter writer;
+          BitWriter::Allotment allotment(&writer, kSize * kBitsPerByte);
+          // Start with "task" 1-bits.
+          for (size_t i = 0; i < task; ++i) {
+            writer.Write(1, 1);
+          }
+
+          // Write 0-bits that we will skip over
+          for (size_t i = 0; i < skip; ++i) {
+            writer.Write(1, 0);
+          }
+
+          // Write terminator bits '101'
+          writer.Write(3, 5);
+          EXPECT_EQ(task + skip + 3, writer.BitsWritten());
+          writer.ZeroPadToByte();
+          AuxOut aux_out;
+          allotment.ReclaimAndCharge(&writer, 0, &aux_out);
+          EXPECT_LT(aux_out.layers[0].total_bits, kSize * kBitsPerByte);
+          // Two readers over the same bytes: one skips, the other reads.
+          BitReader reader1(writer.GetSpan());
+          BitReader reader2(writer.GetSpan());
+          // Verify initial 1-bits
+          for (size_t i = 0; i < task; ++i) {
+            EXPECT_EQ(1u, reader1.ReadBits(1));
+            EXPECT_EQ(1u, reader2.ReadBits(1));
+          }
+
+          // SkipBits or manually read "skip" bits
+          reader1.SkipBits(skip);
+          for (size_t i = 0; i < skip; ++i) {
+            EXPECT_EQ(0u, reader2.ReadBits(1))
+                << " skip=" << skip << " i=" << i;
+          }
+          EXPECT_EQ(reader1.TotalBitsConsumed(), reader2.TotalBitsConsumed());
+
+          // Ensure both readers see the terminator bits.
+          EXPECT_EQ(5u, reader1.ReadBits(3));
+          EXPECT_EQ(5u, reader2.ReadBits(3));
+
+          EXPECT_TRUE(reader1.Close());
+          EXPECT_TRUE(reader2.Close());
+        }
+      },
+      "TestSkip"));
+}
+
+// Verifies byte order and different groupings of bits.
+TEST(BitReaderTest, TestOrder) {
+  constexpr size_t kMaxBits = 16;
+
+  // u(1) - bits written into LSBs of first byte
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    for (size_t i = 0; i < 5; ++i) {
+      writer.Write(1, 1);
+    }
+    for (size_t i = 0; i < 5; ++i) {
+      writer.Write(1, 0);
+    }
+    for (size_t i = 0; i < 6; ++i) {
+      writer.Write(1, 1);
+    }
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0x1Fu, reader.ReadFixedBits<8>());  // bits 0-4 set
+    EXPECT_EQ(0xFCu, reader.ReadFixedBits<8>());  // bits 10-15 set
+    EXPECT_TRUE(reader.Close());
+  }
+
+  // u(8) - get bytes in the same order
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    writer.Write(8, 0xF8);
+    writer.Write(8, 0x3F);
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>());
+    EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>());
+    EXPECT_TRUE(reader.Close());
+  }
+
+  // u(16) - little-endian bytes
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    writer.Write(16, 0xF83F);
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>());  // low byte comes first
+    EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>());
+    EXPECT_TRUE(reader.Close());
+  }
+
+  // Non-byte-aligned, mixed sizes
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    writer.Write(1, 1);
+    writer.Write(3, 6);
+    writer.Write(8, 0xDB);
+    writer.Write(4, 8);
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0xBDu, reader.ReadFixedBits<8>());  // 1 | 6<<1 | (0xDB&0xF)<<4
+    EXPECT_EQ(0x8Du, reader.ReadFixedBits<8>());  // 0xDB>>4 | 8<<4
+    EXPECT_TRUE(reader.Close());
+  }
+}
+
+TEST(BitReaderTest, TotalCountersTest) {
+  uint8_t buf[8] = {1, 2, 3, 4};  // remaining four bytes are zero
+  BitReader reader(Span<const uint8_t>(buf, sizeof(buf)));
+  // The consumed-bits counter starts at 0 and grows by each read's width.
+  EXPECT_EQ(sizeof(buf), reader.TotalBytes());
+  EXPECT_EQ(0u, reader.TotalBitsConsumed());
+  reader.ReadFixedBits<1>();
+  EXPECT_EQ(1u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<10>();
+  EXPECT_EQ(11u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<4>();
+  EXPECT_EQ(15u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<1>();
+  EXPECT_EQ(16u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<16>();
+  EXPECT_EQ(32u, reader.TotalBitsConsumed());
+
+  EXPECT_TRUE(reader.Close());
+}
+
+TEST(BitReaderTest, MoveTest) {
+  uint8_t buf[8] = {1, 2, 3, 4};
+  BitReader reader2;
+  {
+    BitReader reader1(Span<const uint8_t>(buf, sizeof(buf)));
+    // Consume the first two bytes through reader1 before moving it.
+    EXPECT_EQ(0u, reader1.TotalBitsConsumed());
+    reader1.ReadFixedBits<16>();
+    EXPECT_EQ(16u, reader1.TotalBitsConsumed());
+
+    reader2 = std::move(reader1);
+    // From this point reader1 is invalid, but can continue to access reader2
+    // and we don't need to call Close() on reader1.
+  }
+  // reader2 must continue at the position reader1 had reached.
+  EXPECT_EQ(16u, reader2.TotalBitsConsumed());
+  EXPECT_EQ(3U, reader2.ReadFixedBits<8>());  // buf[2]
+  EXPECT_EQ(24u, reader2.TotalBitsConsumed());
+
+  EXPECT_TRUE(reader2.Close());
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/bits_test.cc b/third_party/jpeg-xl/lib/jxl/bits_test.cc
new file mode 100644
index 0000000000..bd7aa548c8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/bits_test.cc
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/bits.h"
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(BitsTest, TestNumZeroBits) {
+  // Zero input is well-defined: the result is the full bit width of the type.
+  EXPECT_EQ(32u, Num0BitsAboveMS1Bit(0u));
+  EXPECT_EQ(64u, Num0BitsAboveMS1Bit(0ull));
+  EXPECT_EQ(32u, Num0BitsBelowLS1Bit(0u));
+  EXPECT_EQ(64u, Num0BitsBelowLS1Bit(0ull));
+  // Lowest bits set: many zeros above, none or one below.
+  EXPECT_EQ(31u, Num0BitsAboveMS1Bit(1u));
+  EXPECT_EQ(30u, Num0BitsAboveMS1Bit(2u));
+  EXPECT_EQ(63u, Num0BitsAboveMS1Bit(1ull));
+  EXPECT_EQ(62u, Num0BitsAboveMS1Bit(2ull));
+
+  EXPECT_EQ(0u, Num0BitsBelowLS1Bit(1u));
+  EXPECT_EQ(0u, Num0BitsBelowLS1Bit(1ull));
+  EXPECT_EQ(1u, Num0BitsBelowLS1Bit(2u));
+  EXPECT_EQ(1u, Num0BitsBelowLS1Bit(2ull));
+  // Only the top bit set: no zeros above, width - 1 zeros below.
+  EXPECT_EQ(0u, Num0BitsAboveMS1Bit(0x80000000u));
+  EXPECT_EQ(0u, Num0BitsAboveMS1Bit(0x8000000000000000ull));
+  EXPECT_EQ(31u, Num0BitsBelowLS1Bit(0x80000000u));
+  EXPECT_EQ(63u, Num0BitsBelowLS1Bit(0x8000000000000000ull));
+}
+
+TEST(BitsTest, TestFloorLog2) {
+  // Expected results for inputs 1..7; entry [i - 1] belongs to input i.
+  const size_t expected[7] = {0, 1, 1, 2, 2, 2, 2};
+  for (uint32_t i = 1; i <= 7; ++i) {
+    EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(i)) << " " << i;
+    EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(uint64_t(i))) << " " << i;
+  }
+  // Values just below, at and just above a power of two.
+  EXPECT_EQ(11u, FloorLog2Nonzero(0x00000fffu));  // 4095
+  EXPECT_EQ(12u, FloorLog2Nonzero(0x00001000u));  // 4096
+  EXPECT_EQ(12u, FloorLog2Nonzero(0x00001001u));  // 4097
+  // Top of the 32-bit range, as 32-bit arguments.
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000000u));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000001u));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0xFFFFFFFFu));
+  // The same values passed as 64-bit arguments.
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000000ull));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000001ull));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0xFFFFFFFFull));
+  // Top of the 64-bit range.
+  EXPECT_EQ(63u, FloorLog2Nonzero(0x8000000000000000ull));
+  EXPECT_EQ(63u, FloorLog2Nonzero(0x8000000000000001ull));
+  EXPECT_EQ(63u, FloorLog2Nonzero(0xFFFFFFFFFFFFFFFFull));
+}
+
+TEST(BitsTest, TestCeilLog2) {
+  // Expected results for inputs 1..7; entry [i - 1] belongs to input i.
+  const size_t expected[7] = {0, 1, 2, 2, 3, 3, 3};
+  for (uint32_t i = 1; i <= 7; ++i) {
+    EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(i)) << " " << i;
+    EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(uint64_t(i))) << " " << i;
+  }
+  // Values just below, at and just above a power of two.
+  EXPECT_EQ(12u, CeilLog2Nonzero(0x00000fffu));  // 4095
+  EXPECT_EQ(12u, CeilLog2Nonzero(0x00001000u));  // 4096
+  EXPECT_EQ(13u, CeilLog2Nonzero(0x00001001u));  // 4097
+  // Top of the 32-bit range, as 32-bit arguments.
+  EXPECT_EQ(31u, CeilLog2Nonzero(0x80000000u));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0x80000001u));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0xFFFFFFFFu));
+  // The same values passed as 64-bit arguments.
+  EXPECT_EQ(31u, CeilLog2Nonzero(0x80000000ull));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0x80000001ull));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0xFFFFFFFFull));
+  // Top of the 64-bit range.
+  EXPECT_EQ(63u, CeilLog2Nonzero(0x8000000000000000ull));
+  EXPECT_EQ(64u, CeilLog2Nonzero(0x8000000000000001ull));
+  EXPECT_EQ(64u, CeilLog2Nonzero(0xFFFFFFFFFFFFFFFFull));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/blending.cc b/third_party/jpeg-xl/lib/jxl/blending.cc
new file mode 100644
index 0000000000..ab37fdabb5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/blending.cc
@@ -0,0 +1,152 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/blending.h"
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+// Returns true if the frame described by dec_state requires a blending step.
+bool NeedsBlending(PassesDecoderState* dec_state) {
+  const PassesSharedState& shared = *dec_state->shared;
+  const auto& header = shared.frame_header;
+  // Frame types other than regular / skip-progressive never report blending.
+  if (header.frame_type != FrameType::kRegularFrame &&
+      header.frame_type != FrameType::kSkipProgressive) {
+    return false;
+  }
+  // A frame with a custom size or origin always needs blending.
+  if (header.custom_size_or_origin) return true;
+  // Otherwise blending is needed as soon as one channel is not kReplace.
+  if (header.blending_info.mode != BlendMode::kReplace) return true;
+  for (const auto& ec_info : header.extra_channel_blending_info) {
+    if (ec_info.mode != BlendMode::kReplace) return true;
+  }
+  // Replace the full frame: nothing to do.
+  return false;
+}
+
+void PerformBlending(const float* const* bg, const float* const* fg,
+                     float* const* out, size_t x0, size_t xsize,
+                     const PatchBlending& color_blending,
+                     const PatchBlending* ec_blending,
+                     const std::vector<ExtraChannelInfo>& extra_channel_info) {
+  bool has_alpha = false;  // true if any extra channel is of type kAlpha
+  size_t num_ec = extra_channel_info.size();
+  for (size_t i = 0; i < num_ec; i++) {
+    if (extra_channel_info[i].type == jxl::ExtraChannel::kAlpha) {
+      has_alpha = true;
+      break;
+    }
+  }
+  ImageF tmp(xsize, 3 + num_ec);  // scratch: rows 0-2 color, 3+i extra channel
+  // Blend extra channels first so that we use the pre-blending alpha.
+  for (size_t i = 0; i < num_ec; i++) {
+    if (ec_blending[i].mode == PatchBlendMode::kAdd) {
+      for (size_t x = 0; x < xsize; x++) {
+        tmp.Row(3 + i)[x] = bg[3 + i][x + x0] + fg[3 + i][x + x0];
+      }
+    } else if (ec_blending[i].mode == PatchBlendMode::kBlendAbove) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+      PerformAlphaBlending(bg[3 + i] + x0, bg[3 + alpha] + x0, fg[3 + i] + x0,
+                           fg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                           is_premultiplied, ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kBlendBelow) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+      PerformAlphaBlending(fg[3 + i] + x0, fg[3 + alpha] + x0, bg[3 + i] + x0,
+                           bg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                           is_premultiplied, ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddAbove) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      PerformAlphaWeightedAdd(bg[3 + i] + x0, fg[3 + i] + x0,
+                              fg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                              ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddBelow) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      PerformAlphaWeightedAdd(fg[3 + i] + x0, bg[3 + i] + x0,
+                              bg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                              ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kMul) {
+      PerformMulBlending(bg[3 + i] + x0, fg[3 + i] + x0, tmp.Row(3 + i), xsize,
+                         ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kReplace) {
+      if (xsize) memcpy(tmp.Row(3 + i), fg[3 + i] + x0, xsize * sizeof(**fg));
+    } else if (ec_blending[i].mode == PatchBlendMode::kNone) {
+      if (xsize) memcpy(tmp.Row(3 + i), bg[3 + i] + x0, xsize * sizeof(**fg));
+    } else {
+      JXL_ABORT("Unreachable");
+    }
+  }
+  size_t alpha = color_blending.alpha_channel;
+  // Now blend the three color channels according to color_blending.mode.
+  if (color_blending.mode == PatchBlendMode::kAdd ||
+      (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove &&
+       !has_alpha) ||
+      (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow &&
+       !has_alpha)) {
+    for (int p = 0; p < 3; p++) {
+      float* row = tmp.Row(p);  // named `row` to avoid shadowing param `out`
+      for (size_t x = 0; x < xsize; x++) {
+        row[x] = bg[p][x + x0] + fg[p][x + x0];
+      }
+    }
+  } else if (color_blending.mode == PatchBlendMode::kBlendAbove
+             // blend without alpha is just replace
+             && has_alpha) {
+    bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+    PerformAlphaBlending(
+        {bg[0] + x0, bg[1] + x0, bg[2] + x0, bg[3 + alpha] + x0},
+        {fg[0] + x0, fg[1] + x0, fg[2] + x0, fg[3 + alpha] + x0},
+        {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize,
+        is_premultiplied, color_blending.clamp);
+  } else if (color_blending.mode == PatchBlendMode::kBlendBelow
+             // blend without alpha is just replace
+             && has_alpha) {
+    bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+    PerformAlphaBlending(
+        {fg[0] + x0, fg[1] + x0, fg[2] + x0, fg[3 + alpha] + x0},
+        {bg[0] + x0, bg[1] + x0, bg[2] + x0, bg[3 + alpha] + x0},
+        {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize,
+        is_premultiplied, color_blending.clamp);
+  } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove) {
+    JXL_DASSERT(has_alpha);
+    for (size_t c = 0; c < 3; c++) {
+      PerformAlphaWeightedAdd(bg[c] + x0, fg[c] + x0, fg[3 + alpha] + x0,
+                              tmp.Row(c), xsize, color_blending.clamp);
+    }
+  } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow) {
+    JXL_DASSERT(has_alpha);
+    for (size_t c = 0; c < 3; c++) {
+      PerformAlphaWeightedAdd(fg[c] + x0, bg[c] + x0, bg[3 + alpha] + x0,
+                              tmp.Row(c), xsize, color_blending.clamp);
+    }
+  } else if (color_blending.mode == PatchBlendMode::kMul) {
+    for (int p = 0; p < 3; p++) {
+      PerformMulBlending(bg[p] + x0, fg[p] + x0, tmp.Row(p), xsize,
+                         color_blending.clamp);
+    }
+  } else if (color_blending.mode == PatchBlendMode::kReplace ||
+             color_blending.mode == PatchBlendMode::kBlendAbove ||
+             color_blending.mode == PatchBlendMode::kBlendBelow) {  // kReplace
+    for (size_t p = 0; p < 3; p++) {
+      if (xsize) memcpy(tmp.Row(p), fg[p] + x0, xsize * sizeof(**fg));
+    }
+  } else if (color_blending.mode == PatchBlendMode::kNone) {
+    for (size_t p = 0; p < 3; p++) {
+      if (xsize) memcpy(tmp.Row(p), bg[p] + x0, xsize * sizeof(**fg));
+    }
+  } else {
+    JXL_ABORT("Unreachable");
+  }
+  for (size_t i = 0; i < 3 + num_ec; i++) {  // copy the scratch rows to out
+    if (xsize != 0) memcpy(out[i] + x0, tmp.Row(i), xsize * sizeof(**out));
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/blending.h b/third_party/jpeg-xl/lib/jxl/blending.h
new file mode 100644
index 0000000000..7eab7d50cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/blending.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BLENDING_H_
+#define LIB_JXL_BLENDING_H_
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+bool NeedsBlending(PassesDecoderState* dec_state);
+
+void PerformBlending(const float* const* bg, const float* const* fg,
+                     float* const* out, size_t x0, size_t xsize,
+                     const PatchBlending& color_blending,
+                     const PatchBlending* ec_blending,
+                     const std::vector<ExtraChannelInfo>& extra_channel_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BLENDING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/blending_test.cc b/third_party/jpeg-xl/lib/jxl/blending_test.cc
new file mode 100644
index 0000000000..ff4c46c529
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/blending_test.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::SizeIs;
+
+TEST(BlendingTest, Crops) {
+  const PaddedBytes compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+  CodecInOut decoded;
+  ASSERT_TRUE(test::DecodeFile({}, Span<const uint8_t>(compressed), &decoded));
+  ASSERT_THAT(decoded.frames, SizeIs(4));
+  // Compare every decoded frame with its own reference PNG.
+  int i = 0;  // frame index, selects the reference file name
+  for (const ImageBundle& ib : decoded.frames) {
+    std::ostringstream filename;
+    filename << "jxl/blending/cropped_traffic_light_frame-" << i << ".png";
+    const PaddedBytes compressed_frame =
+        jxl::test::ReadTestData(filename.str());
+    CodecInOut frame;
+    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(compressed_frame), &frame));
+    JXL_EXPECT_OK(SamePixels(ib.color(), *frame.Main().color(), _));
+    ++i;
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc b/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc
new file mode 100644
index 0000000000..c4cba3a31a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/box_content_decoder.h"
+
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+JxlBoxContentDecoder::JxlBoxContentDecoder() : brotli_dec(nullptr) {}
+
+JxlBoxContentDecoder::~JxlBoxContentDecoder() {
+  if (brotli_dec) {  // release the Brotli decoder instance, if any
+    BrotliDecoderDestroyInstance(brotli_dec);
+  }
+}
+
+void JxlBoxContentDecoder::StartBox(bool brob_decode, bool box_until_eof,
+                                    size_t contents_size) {
+  if (brotli_dec) {  // drop decoder state left over from a previous box
+    BrotliDecoderDestroyInstance(brotli_dec);
+    brotli_dec = nullptr;
+  }
+  header_done_ = false;
+  brob_decode_ = brob_decode;
+  box_until_eof_ = box_until_eof;
+  remaining_ = box_until_eof ? 0 : contents_size;  // unused for EOF boxes
+  pos_ = 0;  // nothing of the new box has been consumed yet
+}
+
+JxlDecoderStatus JxlBoxContentDecoder::Process(const uint8_t* next_in,
+                                               size_t avail_in, size_t box_pos,
+                                               uint8_t** next_out,
+                                               size_t* avail_out) {
+  next_in += pos_ - box_pos;   // advance to the first byte not yet consumed
+  avail_in -= pos_ - box_pos;  // NOTE(review): assumes avail_in covers this
+  // Brob boxes are Brotli-decoded; every other box is copied through as is.
+  if (brob_decode_) {
+    if (!header_done_) {  // first 4 content bytes are skipped, not decoded
+      if (avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
+      if (!box_until_eof_) {
+        if (remaining_ < 4) return JXL_DEC_ERROR;
+        remaining_ -= 4;
+      }
+      next_in += 4;
+      avail_in -= 4;
+      pos_ += 4;
+      header_done_ = true;
+    }
+    // The Brotli instance is created on first use and kept across calls.
+    if (!brotli_dec) {
+      brotli_dec = BrotliDecoderCreateInstance(nullptr, nullptr, nullptr);
+    }
+    // Remember the cursors so consumed / produced byte counts can be derived.
+    const uint8_t* next_in_before = next_in;
+    uint8_t* next_out_before = *next_out;
+    msan::MemoryIsInitialized(next_in, avail_in);
+    BrotliDecoderResult res = BrotliDecoderDecompressStream(
+        brotli_dec, &avail_in, &next_in, avail_out, next_out, nullptr);
+    size_t consumed = next_in - next_in_before;
+    size_t produced = *next_out - next_out_before;
+    if (res == BROTLI_DECODER_RESULT_ERROR) {
+      return JXL_DEC_ERROR;
+    }
+    msan::UnpoisonMemory(next_out_before, produced);
+    pos_ += consumed;
+    if (!box_until_eof_) remaining_ -= consumed;
+    if (res == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) {
+      return JXL_DEC_NEED_MORE_INPUT;
+    }
+    if (res == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
+      return JXL_DEC_BOX_NEED_MORE_OUTPUT;
+    }
+    if (res == BROTLI_DECODER_RESULT_SUCCESS) {
+      return JXL_DEC_SUCCESS;
+    }
+    // unknown Brotli result
+    return JXL_DEC_ERROR;
+  } else {
+    // remaining box bytes as seen from dec->file_pos
+    size_t can_read = avail_in;
+    if (!box_until_eof_) can_read = std::min<size_t>(can_read, remaining_);
+    size_t to_write = std::min<size_t>(can_read, *avail_out);
+    memcpy(*next_out, next_in, to_write);
+    // Advance the caller's output cursor and the internal bookkeeping.
+    *next_out += to_write;
+    *avail_out -= to_write;
+    if (!box_until_eof_) remaining_ -= to_write;
+    pos_ += to_write;
+    // Output ran out before the readable input did.
+    if (to_write < can_read) return JXL_DEC_BOX_NEED_MORE_OUTPUT;
+    // A sized box still has bytes that were not part of this input.
+    if (!box_until_eof_ && remaining_ > 0) return JXL_DEC_NEED_MORE_INPUT;
+
+    return JXL_DEC_SUCCESS;
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/box_content_decoder.h b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h
new file mode 100644
index 0000000000..6153360a8e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BOX_CONTENT_DECODER_H_
+#define LIB_JXL_BOX_CONTENT_DECODER_H_
+
+#include <brotli/decode.h>
+#include <jxl/decode.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <vector>
+
+namespace jxl {
+
+/** Outputs the contents of a box in a streaming fashion, either directly, or
+ * optionally decoding with Brotli, in case of a brob box. The input must be
+ * the contents of a box, excluding the box header.
+ */
+class JxlBoxContentDecoder {
+ public:
+  JxlBoxContentDecoder();
+  ~JxlBoxContentDecoder();
+
+  void StartBox(bool brob_decode, bool box_until_eof, size_t contents_size);
+
+  // Outputs decoded bytes from the box, decoding with brotli if needed.
+  // box_pos is the position in the box content which next_in points to.
+  // Returns success, whether more input or output bytes are needed, or error.
+  JxlDecoderStatus Process(const uint8_t* next_in, size_t avail_in,
+                           size_t box_pos, uint8_t** next_out,
+                           size_t* avail_out);
+
+ private:
+  BrotliDecoderState* brotli_dec = nullptr;  // owned; null until first used
+
+  bool header_done_ = false;    // brob only: leading 4 bytes already skipped
+  bool brob_decode_ = false;    // Brotli-decode the content instead of copying
+  bool box_until_eof_ = false;  // box has no known size; remaining_ is unused
+  size_t remaining_ = 0;        // content bytes still to consume (sized boxes)
+  size_t pos_ = 0;              // content offset of the next unconsumed byte
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BOX_CONTENT_DECODER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
new file mode 100644
index 0000000000..a412becd0d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
@@ -0,0 +1,1988 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
+//
+// The physical architecture of butteraugli is based on the following naming
+// convention:
+//   * Opsin - dynamics of the photosensitive chemicals in the retina
+//             with their immediate electrical processing
+//   * Xyb - hybrid opponent/trichromatic color space
+//     x is roughly red-subtract-green.
+//     y is yellow.
+//     b is blue.
+//     Xyb values are computed from Opsin mixing, not directly from rgb.
+//   * Mask - for visual masking
+//   * Hf - color modeling for spatially high-frequency features
+//   * Lf - color modeling for spatially low-frequency features
+//   * Diffmap - to cluster and build an image of error between the images
+//   * Blur - to hold the smoothing code
+
+#include "lib/jxl/butteraugli/butteraugli.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <new>
+#include <vector>
+
+#if JXL_PROFILER_ENABLED
+#include <chrono>
+#endif  // JXL_PROFILER_ENABLED
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image_ops.h"
+
+#ifndef JXL_BUTTERAUGLI_ONCE
+#define JXL_BUTTERAUGLI_ONCE
+
+namespace jxl {
+
+std::vector<float> ComputeKernel(float sigma) {
+  const float m = 2.25;  // Radius per |sigma|; accuracy increases with m.
+  const double scaler = -1.0 / (2.0 * sigma * sigma);  // exponent factor
+  const int diff = std::max<int>(1, m * std::fabs(sigma));  // radius, >= 1
+  std::vector<float> kernel(2 * diff + 1);  // odd length, center at [diff]
+  for (int i = -diff; i <= diff; ++i) {
+    kernel[i + diff] = std::exp(scaler * i * i);  // unnormalized Gaussian tap
+  }
+  return kernel;  // not normalized; users in this file divide by the tap sum
+}
+
+void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
+                          const size_t x, float* BUTTERAUGLI_RESTRICT row_out) {
+  const size_t offset = kernel.size() / 2;  // index of the center tap
+  int minx = x < offset ? 0 : x - offset;  // window clamped to the image
+  int maxx = std::min<int>(in.xsize() - 1, x + offset);
+  float weight = 0.0f;  // sum of the taps that fall inside the image
+  for (int j = minx; j <= maxx; ++j) {
+    weight += kernel[j - x + offset];
+  }
+  float scale = 1.0f / weight;  // renormalizes the truncated kernel
+  for (size_t y = 0; y < in.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
+    float sum = 0.0f;
+    for (int j = minx; j <= maxx; ++j) {
+      sum += row_in[j] * kernel[j - x + offset];
+    }
+    row_out[y] = sum * scale;  // one output per input row (transposed)
+  }
+}
+
+// Horizontal convolution of `in` with `kernel`, written transposed to `out`.
+void ConvolutionWithTranspose(const ImageF& in,
+                              const std::vector<float>& kernel,
+                              ImageF* BUTTERAUGLI_RESTRICT out) {
+  PROFILER_FUNC;
+  JXL_CHECK(out->xsize() == in.ysize());
+  JXL_CHECK(out->ysize() == in.xsize());
+  const size_t len = kernel.size();
+  const size_t offset = len / 2;
+  float weight_no_border = 0.0f;
+  for (size_t j = 0; j < len; ++j) {
+    weight_no_border += kernel[j];
+  }
+  const float scale_no_border = 1.0f / weight_no_border;
+  const size_t border1 = std::min(in.xsize(), offset);  // end of left border
+  const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0;
+  std::vector<float> scaled_kernel(len / 2 + 1);  // first half plus center
+  for (size_t i = 0; i <= len / 2; ++i) {
+    scaled_kernel[i] = kernel[i] * scale_no_border;
+  }
+
+  // middle: columns [border1, border2) see the whole kernel; unrolled by len.
+  switch (len) {
+    case 7: {
+      PROFILER_ZONE("conv7");
+      const float sk0 = scaled_kernel[0];
+      const float sk1 = scaled_kernel[1];
+      const float sk2 = scaled_kernel[2];
+      const float sk3 = scaled_kernel[3];
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          const float sum0 = (row_in[0] + row_in[6]) * sk0;
+          const float sum1 = (row_in[1] + row_in[5]) * sk1;
+          const float sum2 = (row_in[2] + row_in[4]) * sk2;
+          const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2;
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum;
+        }
+      }
+    } break;
+    case 13: {
+      PROFILER_ZONE("conv13");
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0];
+          float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1];
+          float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2];
+          float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3];
+          sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4];
+          sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5];
+          const float sum = (row_in[6]) * scaled_kernel[6];
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
+        }
+      }
+      break;
+    }
+    case 15: {
+      PROFILER_ZONE("conv15");
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0];
+          float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1];
+          float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2];
+          float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3];
+          sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4];
+          sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5];
+          sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6];
+          const float sum = (row_in[7]) * scaled_kernel[7];
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
+        }
+      }
+      break;
+    }
+    case 33: {
+      PROFILER_ZONE("conv33");
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0];
+          float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1];
+          float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2];
+          float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3];
+          sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4];
+          sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5];
+          sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6];
+          sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7];
+          sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8];
+          sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9];
+          sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10];
+          sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11];
+          sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12];
+          sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13];
+          sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14];
+          sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15];
+          const float sum = (row_in[16]) * scaled_kernel[16];
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
+        }
+      }
+      break;
+    }
+    default:
+      printf("Warning: Unexpected kernel size! %" PRIuS "\n", len);
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
+        for (size_t x = border1; x < border2; ++x) {
+          const int d = x - offset;  // leftmost input column of the window
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          float sum = 0.0f;
+          size_t j;
+          for (j = 0; j <= len / 2; ++j) {
+            sum += row_in[d + j] * scaled_kernel[j];
+          }
+          for (; j < len; ++j) {  // second half reuses the mirrored taps
+            sum += row_in[d + j] * scaled_kernel[len - 1 - j];
+          }
+          row_out[y] = sum;
+        }
+      }
+  }
+  // left border: truncated kernel, renormalized by ConvolveBorderColumn
+  for (size_t x = 0; x < border1; ++x) {
+    ConvolveBorderColumn(in, kernel, x, out->Row(x));
+  }
+
+  // right border
+  for (size_t x = border2; x < in.xsize(); ++x) {
+    ConvolveBorderColumn(in, kernel, x, out->Row(x));
+  }
+}
+
+// A blur somewhat similar to a 2D Gaussian blur.
+// See: https://en.wikipedia.org/wiki/Gaussian_blur
+//
+// This is a bottleneck because the sigma can be quite large (>7). The kernel
+// is ComputeKernel(sigma), i.e. truncated at 2.25 * sigma. Two paths exist:
+//  - a 5-tap kernel with `in` distinct from `out` is handled by Separable5;
+//  - otherwise two ConvolutionWithTranspose passes run via the transposed
+//    temporary from `temp`; borders are renormalized, not zero-padded.
+// NOTE(review): no gauss_blur-based path exists here even though gauss_blur.h
+// is included by this file, and `params` is not read by this function.
+void Blur(const ImageF& in, float sigma, const ButteraugliParams& params,
+          BlurTemp* temp, ImageF* out) {
+  std::vector<float> kernel = ComputeKernel(sigma);
+  // Separable5 does an in-place convolution, so this fast path is not safe if
+  // in aliases out.
+  if (kernel.size() == 5 && &in != out) {
+    float sum_weights = 0.0f;
+    for (const float w : kernel) {
+      sum_weights += w;
+    }
+    const float scale = 1.0f / sum_weights;
+    const float w0 = kernel[2] * scale;  // center tap
+    const float w1 = kernel[1] * scale;  // taps at distance 1
+    const float w2 = kernel[0] * scale;  // taps at distance 2
+    const WeightsSeparable5 weights = {
+        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+    };
+    Separable5(in, Rect(in), weights, /*pool=*/nullptr, out);
+    return;
+  }
+
+  ImageF* JXL_RESTRICT temp_t = temp->GetTransposed(in);
+  ConvolutionWithTranspose(in, kernel, temp_t);    // rows of in -> temp_t
+  ConvolutionWithTranspose(*temp_t, kernel, out);  // transposes back into out
+}
+
+// Tag types: PaddedMaltaUnit<Tag> selects a MaltaUnit overload via Tag().
+struct MaltaTagLF {};
+struct MaltaTag {};
+
+}  // namespace jxl
+
+#endif  // JXL_BUTTERAUGLI_ONCE
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::MulSub;
+using hwy::HWY_NAMESPACE::Neg;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+template <class D, class V>
+HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) {
+  static const double kMul = 0.724216145665;  // slope beyond +/-kMaxVal
+  const V mul = Set(d, kMul);
+  const V maxval = Set(d, kMaxVal);
+  // Soft clamp: lanes >= maxval or < -maxval are replaced with if_pos/if_neg.
+  const V if_pos = MulAdd(Sub(v, maxval), mul, maxval);
+  const V if_neg = MulSub(Add(v, maxval), mul, maxval);
+  const V pos_or_v = IfThenElse(Ge(v, maxval), if_pos, v);
+  return IfThenElse(Lt(v, Neg(maxval)), if_neg, pos_or_v);
+}
+
+// Removes the range around zero: x-kw if x>kw, x+kw if x<-kw, otherwise 0.
+template <class D, class V>
+HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) {
+  const auto w = Set(d, kw);
+  return IfThenElse(Gt(x, w), Sub(x, w),
+                    IfThenElseZero(Lt(x, Neg(w)), Add(x, w)));
+}
+
+// Amplifies the range around zero: x+kw if x>kw, x-kw if x<-kw, otherwise 2x.
+template <class D, class V>
+HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) {
+  const auto w = Set(d, kw);
+  return IfThenElse(Gt(x, w), Add(x, w),
+                    IfThenElse(Lt(x, Neg(w)), Sub(x, w), Add(x, x)));
+}
+
+// XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space.
+// Vals space can be converted to L2-norm space (Euclidean and normalized)
+// through visual masking.
+template <class D, class V>
+HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
+                                 const V& b_arg, V* HWY_RESTRICT valx,
+                                 V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
+  static const double xmul_scalar = 33.832837186260;  // gain for x
+  static const double ymul_scalar = 14.458268100570;  // gain for y
+  static const double bmul_scalar = 49.87984651440;   // gain for corrected b
+  static const double y_to_b_mul_scalar = -0.362267051518;
+  const V xmul = Set(d, xmul_scalar);
+  const V ymul = Set(d, ymul_scalar);
+  const V bmul = Set(d, bmul_scalar);
+  const V y_to_b_mul = Set(d, y_to_b_mul_scalar);
+  const V b = MulAdd(y_to_b_mul, y, b_arg);  // b_arg + y_to_b_mul * y
+  *valb = Mul(b, bmul);
+  *valx = Mul(x, xmul);
+  *valy = Mul(y, ymul);
+}
+
+void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw,
+                  ImageF* HWY_RESTRICT out) {
+  JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out));
+  const size_t xsize = in_x.xsize();
+  const size_t ysize = in_x.ysize();
+
+  const HWY_FULL(float) d;
+  static const double s = 0.653020556257;  // scaler = s + (1-s)*yw/(vy^2+yw)
+  const auto sv = Set(d, s);
+  const auto one_minus_s = Set(d, 1.0 - s);
+  const auto ywv = Set(d, yw);
+
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* HWY_RESTRICT row_x = in_x.ConstRow(y);
+    const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
+    float* HWY_RESTRICT row_out = out->Row(y);
+
+    // Whole vectors only; assumes rows are padded to a multiple of Lanes(d).
+    for (size_t x = 0; x < xsize; x += Lanes(d)) {
+      const auto vx = Load(d, row_x + x);
+      const auto vy = Load(d, row_y + x);
+      const auto scaler =
+          MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv);
+      Store(Mul(scaler, vx), d, row_out + x);  // out = in_x * scaler
+    }
+  }
+}
+
+static void SeparateFrequencies(size_t xsize, size_t ysize,
+                                const ButteraugliParams& params,
+                                BlurTemp* blur_temp, const Image3F& xyb,
+                                PsychoImage& ps) {
+  PROFILER_FUNC;  // Splits xyb into the ps.lf / ps.mf / ps.hf / ps.uhf bands.
+  const HWY_FULL(float) d;
+
+  // Extract lf ...
+  static const double kSigmaLf = 7.15593339443;
+  static const double kSigmaHf = 3.22489901262;
+  static const double kSigmaUhf = 1.56416327805;
+  // mf is allocated once below, together with lf, at the size of xyb.
+  ps.hf[0] = ImageF(xsize, ysize);
+  ps.hf[1] = ImageF(xsize, ysize);
+  ps.lf = Image3F(xyb.xsize(), xyb.ysize());
+  ps.mf = Image3F(xyb.xsize(), xyb.ysize());
+  for (int i = 0; i < 3; ++i) {
+    Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i));
+
+    // ... and keep everything else in mf.
+    for (size_t y = 0; y < ysize; ++y) {
+      const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y);
+      const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y);
+      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
+      for (size_t x = 0; x < xsize; x += Lanes(d)) {
+        const auto mf = Sub(Load(d, row_xyb + x), Load(d, row_lf + x));
+        Store(mf, d, row_mf + x);
+      }
+    }
+    if (i == 2) {
+      // The b plane gets no hf/uhf split: blur mf in place and stop.
+      Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
+      break;
+    }
+    // Divide mf into mf and hf.
+    for (size_t y = 0; y < ysize; ++y) {
+      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
+      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
+      for (size_t x = 0; x < xsize; x += Lanes(d)) {
+        Store(Load(d, row_mf + x), d, row_hf + x);
+      }
+    }
+    Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
+    static const double kRemoveMfRange = 0.29;
+    static const double kAddMfRange = 0.1;
+    if (i == 0) {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto mf = Load(d, row_mf + x);
+          auto hf = Sub(Load(d, row_hf + x), mf);  // unblurred minus blurred
+          mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf);
+          Store(mf, d, row_mf + x);
+          Store(hf, d, row_hf + x);
+        }
+      }
+    } else {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto mf = Load(d, row_mf + x);
+          auto hf = Sub(Load(d, row_hf + x), mf);
+
+          mf = AmplifyRangeAroundZero(d, kAddMfRange, mf);
+          Store(mf, d, row_mf + x);
+          Store(hf, d, row_hf + x);
+        }
+      }
+    }
+  }
+
+  // Temporarily used as output of SuppressXByY
+  ps.uhf[0] = ImageF(xsize, ysize);
+  ps.uhf[1] = ImageF(xsize, ysize);
+
+  // Suppress red-green by intensity change in the high freq channels.
+  static const double suppress = 46.0;
+  SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]);
+  // hf is the SuppressXByY output, uhf will be written below.
+  ps.hf[0].Swap(ps.uhf[0]);
+
+  for (int i = 0; i < 2; ++i) {
+    // Divide hf into hf and uhf.
+    for (size_t y = 0; y < ysize; ++y) {
+      float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y);
+      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_uhf[x] = row_hf[x];
+      }
+    }
+    Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]);
+    static const double kRemoveHfRange = 1.5;
+    static const double kAddHfRange = 0.132;
+    static const double kRemoveUhfRange = 0.04;
+    static const double kMaxclampHf = 28.4691806922;
+    static const double kMaxclampUhf = 5.19175294647;
+    static const double kMulYHf = 2.155;
+    static const double kMulYUhf = 2.69313763794;
+    if (i == 0) {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto hf = Load(d, row_hf + x);
+          auto uhf = Sub(Load(d, row_uhf + x), hf);
+          hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf);
+          uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf);
+          Store(hf, d, row_hf + x);
+          Store(uhf, d, row_uhf + x);
+        }
+      }
+    } else {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto hf = Load(d, row_hf + x);
+          hf = MaximumClamp(d, hf, kMaxclampHf);
+
+          auto uhf = Sub(Load(d, row_uhf + x), hf);  // uses the clamped hf
+          uhf = MaximumClamp(d, uhf, kMaxclampUhf);
+          uhf = Mul(uhf, Set(d, kMulYUhf));
+          Store(uhf, d, row_uhf + x);
+
+          hf = Mul(hf, Set(d, kMulYHf));
+          hf = AmplifyRangeAroundZero(d, kAddHfRange, hf);
+          Store(hf, d, row_hf + x);
+        }
+      }
+    }
+  }
+  // Modify range around zero code only concerns the high frequency
+  // planes and only the X and Y channels.
+  // Convert low freq xyb to vals space so that we can do a simple squared sum
+  // diff on the low frequencies later.
+  for (size_t y = 0; y < ysize; ++y) {
+    float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y);
+    float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y);
+    float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y);
+    for (size_t x = 0; x < xsize; x += Lanes(d)) {
+      auto valx = Undefined(d);
+      auto valy = Undefined(d);
+      auto valb = Undefined(d);
+      XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
+                       Load(d, row_b + x), &valx, &valy, &valb);
+      Store(valx, d, row_x + x);
+      Store(valy, d, row_y + x);
+      Store(valb, d, row_b + x);
+    }
+  }
+}
+
+namespace {
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) {
+  return Add(Add(a, b), Add(c, d));  // (a + b) + (c + d)
+}
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e) {
+  return Sum(a, b, c, Add(d, e));  // (a + b) + (c + (d + e))
+}
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g) {
+  return Sum(a, b, c, Sum(d, e, f, g));  // 3 terms plus a 4-term subtree
+}
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g, V h, V i) {
+  return Add(Add(Sum(a, b, c, d), Sum(e, f, g, h)), i);  // 4 + 4, then i
+}
+}  // namespace
+
+template <class D>
+Vec<D> MaltaUnit(MaltaTagLF /*tag*/, const D df,
+                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
+  const intptr_t xs3 = 3 * xs;  // xs: row stride of d, in floats
+
+  const auto center = LoadU(df, d);
+
+  // x grows, y constant (taps at -4, -2, 0, +2, +4)
+  const auto sum_yconst = Sum(LoadU(df, d - 4), LoadU(df, d - 2), center,
+                              LoadU(df, d + 2), LoadU(df, d + 4));
+  // Will return this, sum of the squared sums of all line kernels
+  auto retval = Mul(sum_yconst, sum_yconst);
+  {
+    // y grows, x constant
+    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs - xs), center,
+                   LoadU(df, d + xs + xs), LoadU(df, d + xs3 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // both grow
+    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), center,
+                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows, x shrinks
+    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), center,
+                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows -4 to 4, x shrinks 1 -> -1
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs - xs + 1), center,
+            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    //  y grows -4 to 4, x grows -1 -> 1
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs - xs - 1), center,
+            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y grows -1 to 1
+    auto sum = Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 2 - xs), center,
+                   LoadU(df, d + 2 + xs), LoadU(df, d + 4 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y shrinks 1 to -1
+    auto sum = Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 2 + xs), center,
+                   LoadU(df, d + 2 - xs), LoadU(df, d + 4 - xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1__*______
+       2___*_____
+       3_________
+       4____0____
+       5_________
+       6_____*___
+       7______*__
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), center,
+                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1______*__
+       2_____*___
+       3_________
+       4____0____
+       5_________
+       6___*_____
+       7__*______
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), center,
+                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_*_______
+       3__*______
+       4____0____
+       5______*__
+       6_______*_
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), center,
+                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_______*_
+       3______*__
+       4____0____
+       5__*______
+       6_*_______
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), center,
+                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2________*
+       3______*__
+       4____0____
+       5__*______
+       6*________
+       7_________
+       8_________ */
+
+    auto sum = Sum(LoadU(df, d + xs + xs - 4), LoadU(df, d + xs - 2), center,
+                   LoadU(df, d - xs + 2), LoadU(df, d - xs - xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2*________
+       3__*______
+       4____0____
+       5______*__
+       6________*
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs - 4), LoadU(df, d - xs - 2), center,
+                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0__*______
+       1_________
+       2___*_____
+       3_________
+       4____0____
+       5_________
+       6_____*___
+       7_________
+       8______*__ */
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs - 2), LoadU(df, d - xs - xs - 1), center,
+            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0______*__
+       1_________
+       2_____*___
+       3_________
+       4____0____
+       5_________
+       6___*_____
+       7_________
+       8__*______ */
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs + 2), LoadU(df, d - xs - xs + 1), center,
+            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  return retval;
+}
+
+template <class D>
+Vec<D> MaltaUnit(MaltaTag /*tag*/, const D df,
+                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
+  const intptr_t xs3 = 3 * xs;
+
+  const auto center = LoadU(df, d);
+
+  // x grows, y constant
+  const auto sum_yconst =
+      Sum(LoadU(df, d - 4), LoadU(df, d - 3), LoadU(df, d - 2),
+          LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2),
+          LoadU(df, d + 3), LoadU(df, d + 4));
+  // Will return this, sum of all line kernels
+  auto retval = Mul(sum_yconst, sum_yconst);
+
+  {
+    // y grows, x constant
+    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs3),
+                   LoadU(df, d - xs - xs), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs),
+                   LoadU(df, d + xs3), LoadU(df, d + xs3 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // both grow
+    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2),
+                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
+                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows, x shrinks
+    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2),
+                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
+                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows -4 to 4, x shrinks 1 -> -1
+    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
+                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
+                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    //  y grows -4 to 4, x grows -1 -> 1
+    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
+                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
+                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y grows -1 to 1
+    auto sum =
+        Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 3 - xs), LoadU(df, d - 2 - xs),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 + xs),
+            LoadU(df, d + 3 + xs), LoadU(df, d + 4 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y shrinks 1 to -1
+    auto sum =
+        Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 3 + xs), LoadU(df, d - 2 + xs),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 - xs),
+            LoadU(df, d + 3 - xs), LoadU(df, d + 4 - xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1__*______
+       2___*_____
+       3___*_____
+       4____0____
+       5_____*___
+       6_____*___
+       7______*__
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1),
+                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
+                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1______*__
+       2_____*___
+       3_____*___
+       4____0____
+       5___*_____
+       6___*_____
+       7__*______
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1),
+                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
+                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_*_______
+       3__**_____
+       4____0____
+       5_____**__
+       6_______*_
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2),
+                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
+                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_______*_
+       3_____**__
+       4____0____
+       5__**_____
+       6_*_______
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2),
+                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
+                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_________
+       3______***
+       4___*0*___
+       5***______
+       6_________
+       7_________
+       8_________ */
+
+    auto sum =
+        Sum(LoadU(df, d + xs - 4), LoadU(df, d + xs - 3), LoadU(df, d + xs - 2),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d - xs + 2),
+            LoadU(df, d - xs + 3), LoadU(df, d - xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_________
+       3***______
+       4___*0*___
+       5______***
+       6_________
+       7_________
+       8_________ */
+    auto sum =
+        Sum(LoadU(df, d - xs - 4), LoadU(df, d - xs - 3), LoadU(df, d - xs - 2),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + xs + 2),
+            LoadU(df, d + xs + 3), LoadU(df, d + xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0___*_____
+       1___*_____
+       2___*_____
+       3____*____
+       4____0____
+       5____*____
+       6_____*___
+       7_____*___
+       8_____*___ */
+    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
+                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
+                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_____*___
+       1_____*___
+       2_____*___
+       3____*____
+       4____0____
+       5____*____
+       6___*_____
+       7___*_____
+       8___*_____ */
+    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
+                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
+                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  return retval;
+}
+
+// Returns MaltaUnit at (x0, y0) of the packed image "diffs". Pixels at least 4
+// samples away from every border are read in place; all others are evaluated
+// on a zero-padded 9x9 copy of their neighborhood (row stride 12).
+template <class Tag>
+static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
+                                                const size_t x0,
+                                                const size_t y0) {
+  const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0;
+  const HWY_CAPPED(float, 1) df;
+  // Compare "x0 + 4 < xsize" rather than "x0 < xsize - 4": the unsigned
+  // subtraction wraps around for sizes below 4 and would select the fast path.
+  if (x0 >= 4 && y0 >= 4 && x0 + 4 < diffs.xsize() && y0 + 4 < diffs.ysize()) {
+    return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
+  }
+
+  PROFILER_ZONE("Padded Malta");
+  float borderimage[12 * 9];  // round up to 4
+  for (int dy = 0; dy < 9; ++dy) {
+    int y = y0 + dy - 4;
+    float* border_row = borderimage + dy * 12;
+    if (y < 0 || static_cast<size_t>(y) >= diffs.ysize()) {
+      // Source row lies outside the image.
+      std::fill(border_row, border_row + 12, 0.0f);
+      continue;
+    }
+
+    const float* row_diffs = diffs.ConstRow(y);
+    for (int dx = 0; dx < 9; ++dx) {
+      int x = x0 + dx - 4;
+      const bool inside = x >= 0 && static_cast<size_t>(x) < diffs.xsize();
+      border_row[dx] = inside ? row_diffs[x] : 0.0f;
+    }
+    // Zero the three columns that pad each row from 9 to 12 floats.
+    std::fill(border_row + 9, border_row + 12, 0.0f);
+  }
+  return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12));
+}
+
+// Stores the normalized lum0 - lum1 difference in `diffs`, then adds the
+// MaltaUnit response of every pixel of `diffs` to plane `c` of block_diff_ac.
+// w_0gt1 scales the symmetric term, w_0lt1 the one-sided correction terms.
+template <class Tag>
+static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
+                          const double w_0gt1, const double w_0lt1,
+                          const double norm1, const double len,
+                          const double mulli, ImageF* HWY_RESTRICT diffs,
+                          Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
+  const size_t xsize_ = lum0.xsize();
+  const size_t ysize_ = lum0.ysize();
+
+  const float kWeight0 = 0.5;
+  const float kWeight1 = 0.33;
+
+  const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1);
+  const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1);
+  const float norm2_0gt1 = w_pre0gt1 * norm1;
+  const float norm2_0lt1 = w_pre0lt1 * norm1;
+
+  for (size_t y = 0; y < ysize_; ++y) {
+    const float* HWY_RESTRICT row0 = lum0.ConstRow(y);
+    const float* HWY_RESTRICT row1 = lum1.ConstRow(y);
+    float* HWY_RESTRICT row_diffs = diffs->Row(y);
+    for (size_t x = 0; x < xsize_; ++x) {
+      const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x]));
+      const float diff = row0[x] - row1[x];
+      const float scaler = norm2_0gt1 / (static_cast<float>(norm1) + absval);
+      // Primary symmetric quadratic objective.
+      row_diffs[x] = scaler * diff;
+
+      const float scaler2 = norm2_0lt1 / (static_cast<float>(norm1) + absval);
+      const double fabs0 = std::fabs(row0[x]);
+      // Secondary half-open quadratic objectives.
+      const double too_small = 0.55 * fabs0;
+      const double too_big = 1.05 * fabs0;
+      if (row0[x] < 0) {
+        if (row1[x] > -too_small) {
+          double impact = scaler2 * (row1[x] + too_small);
+          row_diffs[x] -= impact;
+        } else if (row1[x] < -too_big) {
+          double impact = scaler2 * (-row1[x] - too_big);
+          row_diffs[x] += impact;
+        }
+      } else {
+        if (row1[x] < too_small) {
+          double impact = scaler2 * (too_small - row1[x]);
+          row_diffs[x] += impact;
+        } else if (row1[x] > too_big) {
+          double impact = scaler2 * (row1[x] - too_big);
+          row_diffs[x] -= impact;
+        }
+      }
+    }
+  }
+
+  size_t y0 = 0;
+  // Top rows (also bounded by ysize_, so tiny images stay in range).
+  for (; y0 < 4 && y0 < ysize_; ++y0) {
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    for (size_t x0 = 0; x0 < xsize_; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+  }
+
+  const HWY_FULL(float) df;
+  // Scalar prefix per row; clamped so it never runs past the row's width.
+  const size_t aligned_x = std::min(xsize_, std::max(size_t(4), Lanes(df)));
+  const intptr_t stride = diffs->PixelsPerRow();
+  // Middle. "y0 + 4 < ysize_" avoids the wrap-around of "ysize_ - 4".
+  for (; y0 + 4 < ysize_; ++y0) {
+    const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    size_t x0 = 0;
+    for (; x0 < aligned_x; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+    for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) {
+      auto diff = Load(df, row_diff + x0);
+      diff = Add(diff, MaltaUnit(Tag(), df, row_in + x0, stride));
+      Store(diff, df, row_diff + x0);
+    }
+
+    for (; x0 < xsize_; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+  }
+
+  // Bottom
+  for (; y0 < ysize_; ++y0) {
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    for (size_t x0 = 0; x0 < xsize_; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+  }
+}
+
+// HWY_EXPORT needs non-template functions; these bind MaltaTag / MaltaTagLF.
+void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                  const double w_0lt1, const double norm1, const double len,
+                  const double mulli, ImageF* HWY_RESTRICT diffs,
+                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
+                diffs, block_diff_ac, c);
+}
+
+void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                    const double w_0lt1, const double norm1, const double len,
+                    const double mulli, ImageF* HWY_RESTRICT diffs,
+                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
+                diffs, block_diff_ac, c);
+}
+
+// out = sqrt(mul * |xyb| + bias) - sqrt(bias), where bias = mul * bias_arg.
+void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
+  PROFILER_FUNC;
+  // The bias makes sqrt behave more linearly.
+  const float bias = mul * bias_arg;
+  const float sqrt_bias = sqrt(bias);
+  const size_t width = xyb.xsize();
+  for (size_t row = 0; row < xyb.ysize(); ++row) {
+    const float* BUTTERAUGLI_RESTRICT src = xyb.Row(row);
+    float* BUTTERAUGLI_RESTRICT dst = out->Row(row);
+    for (size_t col = 0; col < width; ++col) {
+      dst[col] = sqrt(mul * std::abs(src[col]) + bias) - sqrt_bias;
+    }
+  }
+}
+
+// Precomputed value of std::log(80.0) / std::log(255.0).
+constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
+static const float kInternalGoodQualityThreshold =
+    17.83f * kIntensityTargetNormalizationHack;
+static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
+
+// Keeps a running triple of smallest values: inserts v into
+// (min0, min1, min2), treated as ascending, when v < min2.
+// Larger entries shift up by one slot and the old min2 is dropped.
+void StoreMin3(const float v, float& min0, float& min1, float& min2) {
+  if (!(v < min2)) return;
+  const bool below_min0 = v < min0;
+  if (!below_min0 && !(v < min1)) {
+    min2 = v;  // Only the largest tracked value is replaced.
+    return;
+  }
+  min2 = min1;  // v lands in slot 0 or 1, so min1 moves up.
+  min1 = below_min0 ? min0 : v;
+  if (below_min0) min0 = v;
+}
+
+// Look for smooth areas near the area of degradation.
+// If the areas are generally smooth, don't do masking.
+//
+// For every pixel, StoreMin3 is seeded with the pixel value (and twice that
+// value) and fed the up to eight in-bounds neighbors at distance kStep; the
+// output is 0.45 * min0 + 0.3 * min1 + 0.25 * min2.
+// `from` and `to` are expected to have the same dimensions; only the size of
+// `from` is consulted.
+void FuzzyErosion(const ImageF& from, ImageF* to) {
+  const size_t xsize = from.xsize();
+  const size_t ysize = from.ysize();
+  static const size_t kStep = 3;
+  for (size_t y = 0; y < ysize; ++y) {
+    // Bounds are tested with additions: "ysize - kStep" is unsigned and would
+    // wrap around for images smaller than kStep, enabling invalid reads.
+    const bool has_up = y >= kStep;
+    const bool has_down = y + kStep < ysize;
+    const float* row = from.Row(y);
+    const float* row_up = has_up ? from.Row(y - kStep) : nullptr;
+    const float* row_down = has_down ? from.Row(y + kStep) : nullptr;
+    float* row_out = to->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      float min0 = row[x];
+      float min1 = 2 * min0;
+      float min2 = min1;
+      // Left column of neighbors.
+      if (x >= kStep) {
+        const size_t xl = x - kStep;
+        StoreMin3(row[xl], min0, min1, min2);
+        if (has_up) StoreMin3(row_up[xl], min0, min1, min2);
+        if (has_down) StoreMin3(row_down[xl], min0, min1, min2);
+      }
+      // Right column of neighbors.
+      if (x + kStep < xsize) {
+        const size_t xr = x + kStep;
+        StoreMin3(row[xr], min0, min1, min2);
+        if (has_up) StoreMin3(row_up[xr], min0, min1, min2);
+        if (has_down) StoreMin3(row_down[xr], min0, min1, min2);
+      }
+      // Directly above and below.
+      if (has_up) StoreMin3(row_up[x], min0, min1, min2);
+      if (has_down) StoreMin3(row_down[x], min0, min1, min2);
+      // Weighted blend of the three tracked minima.
+      row_out[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2);
+    }
+  }
+}
+
+// Compute values of local frequency and dc masking based on the activity
+// in the two images. diff_ac may be null.
+void Mask(const ImageF& mask0, const ImageF& mask1,
+          const ButteraugliParams& params, BlurTemp* blur_temp,
+          ImageF* BUTTERAUGLI_RESTRICT mask,
+          ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
+  // Only X and Y components are involved in masking. B's influence
+  // is considered less important in the high frequency area, and we
+  // don't model masking from lower frequency signals.
+  PROFILER_FUNC;
+  const size_t xsize = mask0.xsize();
+  const size_t ysize = mask0.ysize();
+  *mask = ImageF(xsize, ysize);
+  static const float kMul = 6.19424080439;
+  static const float kBias = 12.61050594197;
+  static const float kRadius = 2.7;
+  ImageF diff0(xsize, ysize);
+  ImageF diff1(xsize, ysize);
+  ImageF blurred0(xsize, ysize);
+  ImageF blurred1(xsize, ysize);
+  DiffPrecompute(mask0, kMul, kBias, &diff0);
+  DiffPrecompute(mask1, kMul, kBias, &diff1);
+  Blur(diff0, kRadius, params, blur_temp, &blurred0);
+  // The mask is the eroded blur of the first image. The second blur is not
+  // eroded: only its plain values feed the diff_ac term below.
+  FuzzyErosion(blurred0, mask);
+  Blur(diff1, kRadius, params, blur_temp, &blurred1);
+  if (diff_ac == nullptr) return;
+  // Add the scaled squared difference of the two blurred activity images.
+  static const float kMaskToErrorMul = 10.0;
+  for (size_t y = 0; y < ysize; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      const float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x];
+      diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff;
+    }
+  }
+}
+
+// Builds one activity image per PsychoImage from its X and Y HF/UHF bands and
+// passes both to Mask(). `diff_ac` may be null; `temp` is not used.
+void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
+                     const size_t xsize, const size_t ysize,
+                     const ButteraugliParams& params, Image3F* temp,
+                     BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask,
+                     ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
+  // Band weights: X (UHF + HF), Y UHF, Y HF.
+  static const float kMulX = 2.5f;
+  static const float kMulYUhf = 0.4f;
+  static const float kMulYHf = 0.4f;
+  ImageF activity0(xsize, ysize);
+  ImageF activity1(xsize, ysize);
+  // Silly and unoptimized approach here. TODO(jyrki): rework this.
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* BUTTERAUGLI_RESTRICT x_hf0 = pi0.hf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT x_hf1 = pi1.hf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT x_uhf0 = pi0.uhf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT x_uhf1 = pi1.uhf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT y_hf0 = pi0.hf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT y_hf1 = pi1.hf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT y_uhf0 = pi0.uhf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT y_uhf1 = pi1.uhf[1].Row(y);
+    float* BUTTERAUGLI_RESTRICT out0 = activity0.Row(y);
+    float* BUTTERAUGLI_RESTRICT out1 = activity1.Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      const float xdiff0 = (x_uhf0[x] + x_hf0[x]) * kMulX;
+      const float xdiff1 = (x_uhf1[x] + x_hf1[x]) * kMulX;
+      const float ydiff0 = y_uhf0[x] * kMulYUhf + y_hf0[x] * kMulYHf;
+      const float ydiff1 = y_uhf1[x] * kMulYUhf + y_hf1[x] * kMulYHf;
+      const float sum_sq0 = xdiff0 * xdiff0 + ydiff0 * ydiff0;
+      const float sum_sq1 = xdiff1 * xdiff1 + ydiff1 * ydiff1;
+      out0[x] = sqrt(sum_sq0);
+      out1[x] = sqrt(sum_sq1);
+    }
+  }
+  Mask(activity0, activity1, params, blur_temp, mask, diff_ac);
+}
+
+// Returns the square of kGlobalScale * (1 + mul / (scaler * delta + offset)).
+double MaskY(double delta) {
+  static const double offset = 0.829591754942;
+  static const double scaler = 0.451936922203;
+  static const double mul = 2.5485944793;
+  const double gain = kGlobalScale * (1.0 + mul / (scaler * delta + offset));
+  return gain * gain;
+}
+
+// Returns the square of kGlobalScale * (1 + mul / (scaler * delta + offset)).
+double MaskDcY(double delta) {
+  static const double offset = 0.20025578522;
+  static const double scaler = 3.87449418804;
+  static const double mul = 0.505054525019;
+  const double gain = kGlobalScale * (1.0 + mul / (scaler * delta + offset));
+  return gain * gain;
+}
+
+// Returns color[0] * mask + color[1] * mask + color[2] * mask.
+inline float MaskColor(const float color[3], const float mask) {
+  return color[0] * mask + color[1] * mask + color[2] * mask;
+}
+
+// Diffmap := sqrt of sum{diff images multiplied by X and Y/B masks}.
+// Plane 0 of both block_diff inputs is additionally scaled by xmul.
+void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
+                              const Image3F& block_diff_ac, float xmul,
+                              ImageF* result) {
+  PROFILER_FUNC;
+  JXL_CHECK(SameSize(mask, *result));
+  const size_t width = mask.xsize();
+  const size_t height = mask.ysize();
+  for (size_t y = 0; y < height; ++y) {
+    const float* row_mask = mask.Row(y);
+    float* BUTTERAUGLI_RESTRICT row_out = result->Row(y);
+    for (size_t x = 0; x < width; ++x) {
+      const float activity = row_mask[x];
+      const float ac_mask = MaskY(activity);
+      const float dc_mask = MaskDcY(activity);
+      float dc[3] = {block_diff_dc.PlaneRow(0, y)[x] * xmul,
+                     block_diff_dc.PlaneRow(1, y)[x],
+                     block_diff_dc.PlaneRow(2, y)[x]};
+      float ac[3] = {block_diff_ac.PlaneRow(0, y)[x] * xmul,
+                     block_diff_ac.PlaneRow(1, y)[x],
+                     block_diff_ac.PlaneRow(2, y)[x]};
+      row_out[x] = sqrt(MaskColor(dc, dc_mask) + MaskColor(ac, ac_mask));
+    }
+  }
+}
+
+// Adds the weighted L2 difference w * (i0 - i1)^2 to plane `c` of diffmap.
+// Rows are processed in whole vectors of Lanes(d) floats.
+static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
+                   Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+  if (w == 0) return;
+
+  const HWY_FULL(float) d;
+  const size_t step = Lanes(d);
+  const size_t xsize = i0.xsize();
+  const size_t ysize = i0.ysize();
+  const auto vw = Set(d, w);
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* BUTTERAUGLI_RESTRICT in0 = i0.ConstRow(y);
+    const float* BUTTERAUGLI_RESTRICT in1 = i1.ConstRow(y);
+    float* BUTTERAUGLI_RESTRICT acc = diffmap->PlaneRow(c, y);
+    for (size_t x = 0; x < xsize; x += step) {
+      const auto delta = Sub(Load(d, in0 + x), Load(d, in1 + x));
+      Store(MulAdd(Mul(delta, delta), vw, Load(d, acc + x)), d, acc + x);
+    }
+  }
+}
+
+// Initializes plane `c` of diffmap to w * (i0 - i1)^2. A zero weight stores
+// zeros (instead of returning early), so the plane is always fully written.
+static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
+                      Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+  const HWY_FULL(float) d;
+  const auto weight = Set(d, w);
+  for (size_t y = 0; y < i0.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
+    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
+      if (w == 0) {  // 0 * non-finite would be NaN, so store zeros directly.
+        Store(Zero(d), d, row_diff + x);
+        continue;
+      }
+      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
+      Store(Mul(Mul(diff, diff), weight), d, row_diff + x);
+    }
+  }
+}
+
+// Adds an asymmetric L2 difference to plane `c` of diffmap.
+// i0 is the original image.
+// i1 is the deformed copy.
+// w_0gt1 weights the squared difference; w_0lt1 weights the squared distance
+// of i1 from the interval between 0.4 * i0 and i0.
+static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
+                             float w_0lt1,
+                             Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+  if (w_0gt1 == 0 && w_0lt1 == 0) return;
+
+  const HWY_FULL(float) d;
+  const auto vw_0gt1 = Set(d, w_0gt1 * 0.8);
+  const auto vw_0lt1 = Set(d, w_0lt1 * 0.8);
+  const auto small_fraction = Set(d, 0.4);
+  const auto zero = Zero(d);
+
+  for (size_t y = 0; y < i0.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT in0 = i0.Row(y);
+    const float* BUTTERAUGLI_RESTRICT in1 = i1.Row(y);
+    float* BUTTERAUGLI_RESTRICT acc = diffmap->PlaneRow(c, y);
+
+    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
+      const auto val0 = Load(d, in0 + x);
+      const auto val1 = Load(d, in1 + x);
+
+      // Primary symmetric quadratic objective.
+      const auto diff = Sub(val0, val1);
+      const auto primary = MulAdd(Mul(diff, diff), vw_0gt1, Load(d, acc + x));
+
+      // Secondary half-open quadratic objectives.
+      const auto too_big = Abs(val0);
+      const auto too_small = Mul(small_fraction, too_big);
+      const auto excess_neg = IfThenElse(
+          Gt(val1, Neg(too_small)), Add(val1, too_small),
+          IfThenElseZero(Lt(val1, Neg(too_big)), Sub(Neg(val1), too_big)));
+      const auto excess_pos =
+          IfThenElse(Lt(val1, too_small), Sub(too_small, val1),
+                     IfThenElseZero(Gt(val1, too_big), Sub(val1, too_big)));
+      const auto excess = IfThenElse(Lt(val0, zero), excess_neg, excess_pos);
+      Store(MulAdd(vw_0lt1, Mul(excess, excess), primary), d, acc + x);
+    }
+  }
+}
+
+// A simple HDR compatible gamma function:
+// kRetMul * FastLog2f(clamped v + 9.971...) + kRetAdd.
+template <class DF, class V>
+V Gamma(const DF df, V v) {
+  // Negative inputs should be rare, but may lead to a NaN in log, which is
+  // undesirable. Since negative photons don't exist, clamp them away first.
+  v = ZeroIfNegative(v);
+  const auto kBias = Set(df, 9.9710635769299145);
+  const auto log2_biased = FastLog2f(df, Add(v, kBias));
+
+  // ln(2) constant folded in because we want std::log but have FastLog2f.
+  // We could fold this into a custom Log2 polynomial, but there would be
+  // relatively little gain.
+  const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f);
+  const auto kRetAdd = Set(df, -23.16046239805755);
+  return MulAdd(kRetMul, log2_biased, kRetAdd);
+}
+
+// Mixes the three input channels into three outputs with an affine transform:
+//   out = w0 * in0 + w1 * in1 + w2 * in2 + bias   (one weight set per output).
+// When Clamp is true, every output is additionally limited to be >= its bias.
+// Results are written through out0, out1 and out2.
+template <bool Clamp, class DF, class V>
+BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
+                                        const V& in2, V* JXL_RESTRICT out0,
+                                        V* JXL_RESTRICT out1,
+                                        V* JXL_RESTRICT out2) {
+  // https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
+
+  // out0: weights of in0, in1, in2, then the bias.
+  const V mix0 = Set(df, 0.29956550340058319);
+  const V mix1 = Set(df, 0.63373087833825936);
+  const V mix2 = Set(df, 0.077705617820981968);
+  const V mix3 = Set(df, 1.7557483643287353);
+
+  // out1: weights of in0, in1, in2, then the bias.
+  const V mix4 = Set(df, 0.22158691104574774);
+  const V mix5 = Set(df, 0.69391388044116142);
+  const V mix6 = Set(df, 0.0987313588422);
+  const V mix7 = Set(df, 1.7557483643287353);
+
+  // out2: weights of in0, in1, in2, then the bias.
+  const V mix8 = Set(df, 0.02);
+  const V mix9 = Set(df, 0.02);
+  const V mix10 = Set(df, 0.20480129041026129);
+  const V mix11 = Set(df, 12.226454707163354);
+
+  // Nested MulAdd: the in2 term and the bias are combined first, then the
+  // in1 term, then the in0 term.
+  *out0 = MulAdd(mix0, in0, MulAdd(mix1, in1, MulAdd(mix2, in2, mix3)));
+  *out1 = MulAdd(mix4, in0, MulAdd(mix5, in1, MulAdd(mix6, in2, mix7)));
+  *out2 = MulAdd(mix8, in0, MulAdd(mix9, in1, MulAdd(mix10, in2, mix11)));
+
+  if (Clamp) {
+    // The bias is the value an output takes when all inputs are zero.
+    *out0 = Max(*out0, mix3);
+    *out1 = Max(*out1, mix7);
+    *out2 = Max(*out2, mix11);
+  }
+}
+
+// Returns the xyb image of rgb; `blurred` is only scratch space for the blur.
+Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
+                           Image3F* blurred, BlurTemp* blur_temp) {
+  PROFILER_FUNC;
+  Image3F xyb(rgb.xsize(), rgb.ysize());
+  const double kSigma = 1.2;
+  Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0));
+  Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1));
+  Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2));
+  const HWY_FULL(float) df;
+  const auto intensity_target_multiplier = Set(df, params.intensity_target);
+  for (size_t y = 0; y < rgb.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y);
+    const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y);
+    const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y);
+    const float* BUTTERAUGLI_RESTRICT row_blurred_r =
+        blurred->ConstPlaneRow(0, y);
+    const float* BUTTERAUGLI_RESTRICT row_blurred_g =
+        blurred->ConstPlaneRow(1, y);
+    const float* BUTTERAUGLI_RESTRICT row_blurred_b =
+        blurred->ConstPlaneRow(2, y);
+    float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y);
+    float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y);
+    float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y);
+    const auto min = Set(df, 1e-4f);  // Floor for divisor and sensitivity.
+    for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
+      auto sensitivity0 = Undefined(df);
+      auto sensitivity1 = Undefined(df);
+      auto sensitivity2 = Undefined(df);
+      {
+        // Calculate sensitivity based on the smoothed image gamma derivative.
+        auto pre_mixed0 = Undefined(df);
+        auto pre_mixed1 = Undefined(df);
+        auto pre_mixed2 = Undefined(df);
+        OpsinAbsorbance<true>(
+            df, Mul(Load(df, row_blurred_r + x), intensity_target_multiplier),
+            Mul(Load(df, row_blurred_g + x), intensity_target_multiplier),
+            Mul(Load(df, row_blurred_b + x), intensity_target_multiplier),
+            &pre_mixed0, &pre_mixed1, &pre_mixed2);
+        pre_mixed0 = Max(pre_mixed0, min);
+        pre_mixed1 = Max(pre_mixed1, min);
+        pre_mixed2 = Max(pre_mixed2, min);
+        sensitivity0 = Div(Gamma(df, pre_mixed0), pre_mixed0);
+        sensitivity1 = Div(Gamma(df, pre_mixed1), pre_mixed1);
+        sensitivity2 = Div(Gamma(df, pre_mixed2), pre_mixed2);
+        sensitivity0 = Max(sensitivity0, min);
+        sensitivity1 = Max(sensitivity1, min);
+        sensitivity2 = Max(sensitivity2, min);
+      }
+      auto cur_mixed0 = Undefined(df);
+      auto cur_mixed1 = Undefined(df);
+      auto cur_mixed2 = Undefined(df);
+      OpsinAbsorbance<false>(
+          df, Mul(Load(df, row_r + x), intensity_target_multiplier),
+          Mul(Load(df, row_g + x), intensity_target_multiplier),
+          Mul(Load(df, row_b + x), intensity_target_multiplier), &cur_mixed0,
+          &cur_mixed1, &cur_mixed2);
+      cur_mixed0 = Mul(cur_mixed0, sensitivity0);
+      cur_mixed1 = Mul(cur_mixed1, sensitivity1);
+      cur_mixed2 = Mul(cur_mixed2, sensitivity2);
+      // This is a kludge. The negative values should be zeroed away before
+      // blurring. Ideally there would be no negative values in the first place.
+      const auto min01 = Set(df, 1.7557483643287353f);
+      const auto min2 = Set(df, 12.226454707163354f);
+      cur_mixed0 = Max(cur_mixed0, min01);
+      cur_mixed1 = Max(cur_mixed1, min01);
+      cur_mixed2 = Max(cur_mixed2, min2);
+      // X is the difference and Y the sum of the first two mixed channels.
+      Store(Sub(cur_mixed0, cur_mixed1), df, row_out_x + x);
+      Store(Add(cur_mixed0, cur_mixed1), df, row_out_y + x);
+      Store(cur_mixed2, df, row_out_b + x);
+    }
+  }
+  return xyb;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(SeparateFrequencies);       // Local function.
+HWY_EXPORT(MaskPsychoImage);           // Local function.
+HWY_EXPORT(L2DiffAsymmetric);          // Local function.
+HWY_EXPORT(L2Diff);                    // Local function.
+HWY_EXPORT(SetL2Diff);                 // Local function.
+HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
+HWY_EXPORT(MaltaDiffMap);              // Local function.
+HWY_EXPORT(MaltaDiffMapLF);            // Local function.
+HWY_EXPORT(OpsinDynamicsImage);        // Local function.
+
+#if BUTTERAUGLI_ENABLE_CHECKS
+
+// True iff x is NaN: its magnitude bits exceed the +infinity bit pattern.
+static inline bool IsNan(const float x) {
+  uint32_t bits;
+  memcpy(&bits, &x, sizeof(bits));
+  return (bits & 0x7FFFFFFFu) > 0x7F800000u;
+}
+
+// True iff x is NaN: its magnitude bits exceed the +infinity bit pattern.
+static inline bool IsNan(const double x) {
+  uint64_t bits;
+  memcpy(&bits, &x, sizeof(bits));
+  return (bits & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL;
+}
+
+// Scans `image` for NaNs; on the first hit prints its position and exit(1)s.
+static inline void CheckImage(const ImageF& image, const char* name) {
+  PROFILER_FUNC;
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      if (!IsNan(row[x])) continue;
+      printf("NAN: Image %s @ %" PRIuS ",%" PRIuS " (of %" PRIuS ",%" PRIuS
+             ")\n",
+             name, x, y, image.xsize(), image.ysize());
+      exit(1);
+    }
+  }
+}
+
+#define CHECK_NAN(x, str) /* print line + str and abort if x is NaN */ \
+  do {                                   \
+    if (IsNan(x)) {                      \
+      printf("%d: %s\n", __LINE__, str); \
+      abort();                           \
+    }                                    \
+  } while (0)
+
+#define CHECK_IMAGE(image, name) CheckImage(image, name)  // exits on NaN
+
+#else  // BUTTERAUGLI_ENABLE_CHECKS
+
+#define CHECK_NAN(x, str)
+#define CHECK_IMAGE(image, name)
+
+#endif  // BUTTERAUGLI_ENABLE_CHECKS
+
+// Calculate a 2x2 subsampled image for purposes of recursive butteraugli at
+// multiresolution. Each output pixel sums a quarter of every input pixel in
+// its 2x2 cell; cells cut by an odd right or bottom edge are doubled.
+static Image3F SubSample2x(const Image3F& in) {
+  const size_t xs = (in.xsize() + 1) / 2;
+  const size_t ys = (in.ysize() + 1) / 2;
+  Image3F retval(xs, ys);
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ys; ++y) {
+      float* row_out = retval.PlaneRow(c, y);
+      for (size_t x = 0; x < xs; ++x) row_out[x] = 0;
+    }
+  }
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < in.ysize(); ++y) {
+      const float* row_in = in.PlaneRow(c, y);
+      float* row_out = retval.PlaneRow(c, y / 2);
+      for (size_t x = 0; x < in.xsize(); ++x) {
+        row_out[x / 2] += 0.25f * row_in[x];
+      }
+    }
+    if ((in.xsize() & 1) != 0) {
+      const size_t last_column = retval.xsize() - 1;
+      for (size_t y = 0; y < retval.ysize(); ++y) {
+        retval.PlaneRow(c, y)[last_column] *= 2.0f;
+      }
+    }
+    if ((in.ysize() & 1) != 0) {
+      float* last_row = retval.PlaneRow(c, retval.ysize() - 1);
+      for (size_t x = 0; x < retval.xsize(); ++x) last_row[x] *= 2.0f;
+    }
+  }
+  return retval;
+}
+
+// Supersample src by 2x and add it to dest.
+static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) {
+  // The more averaged images have fewer errors; account for that by scaling.
+  static const double kHeuristicMixingValue = 0.3;
+  const double keep = 1.0 - kHeuristicMixingValue * w;
+  for (size_t y = 0; y < dest.ysize(); ++y) {
+    for (size_t x = 0; x < dest.xsize(); ++x) {
+      dest.Row(y)[x] *= keep;
+      dest.Row(y)[x] += w * src.Row(y / 2)[x / 2];
+    }
+  }
+}
+
+Image3F* ButteraugliComparator::Temp() const {
+  bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel);
+  JXL_ASSERT(!was_in_use);  // temp_ must not be handed out twice.
+  (void)was_in_use;
+  return &temp_;  // Callers in this file pair this with ReleaseTemp().
+}
+
+void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); }
+
+ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0,
+                                             const ButteraugliParams& params)
+    : xsize_(rgb0.xsize()),
+      ysize_(rgb0.ysize()),
+      params_(params),
+      temp_(xsize_, ysize_) {
+  if (xsize_ < 8 || ysize_ < 8) {
+    return;  // Too small: pi0_ and sub_ keep their default state.
+  }
+  // Convert rgb0 with OpsinDynamicsImage and split the result into pi0_.
+  Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(),
+                                                          &blur_temp_);
+  ReleaseTemp();
+  HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
+  (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_);
+
+  // Awful recursive construction of samples of different resolution.
+  // This is an after-thought and possibly somewhat parallel in
+  // functionality with the PsychoImage multi-resolution approach.
+  sub_.reset(new ButteraugliComparator(SubSample2x(rgb0), params));
+}
+
+void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
+  HWY_DYNAMIC_DISPATCH(MaskPsychoImage)  // pi0_ is both inputs, diff_ac null.
+  (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr);
+  ReleaseTemp();
+}
+
+// Writes the difference map between the image given at construction and rgb1
+// to `result`, mixing in the 2x subsampled comparison when it is large enough.
+void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const {
+  PROFILER_FUNC;
+  if (xsize_ < 8 || ysize_ < 8) {
+    ZeroFillImage(&result);
+    return;
+  }
+  const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
+      rgb1, params_, Temp(), &blur_temp_);
+  ReleaseTemp();
+  DiffmapOpsinDynamicsImage(xyb1, result);
+  // The half-resolution pass is skipped when there is no sub-comparator or it
+  // is smaller than 8 pixels in either dimension.
+  if (!sub_ || sub_->xsize_ < 8 || sub_->ysize_ < 8) return;
+  const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
+      SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_);
+  sub_->ReleaseTemp();
+  ImageF subresult;
+  sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult);
+  AddSupersampled2x(subresult, 0.5, result);
+}
+
+void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
+                                                      ImageF& result) const {
+  PROFILER_FUNC;
+  if (xsize_ < 8 || ysize_ < 8) {
+    ZeroFillImage(&result);  // Too small to compare: report no difference.
+    return;
+  }
+  PsychoImage pi1;
+  HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
+  (xsize_, ysize_, params_, &blur_temp_, xyb1, pi1);
+  result = ImageF(xsize_, ysize_);  // Replaces whatever the caller passed in.
+  DiffmapPsychoImage(pi1, result);
+}
+
+namespace {
+// Runs the dispatched MaltaDiffMap kernel with len = 3.75 and a fixed mulli.
+void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                  const double w_0lt1, const double norm1,
+                  ImageF* HWY_RESTRICT diffs,
+                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  PROFILER_FUNC;
+  const double kLen = 3.75;
+  const double kMulli = 0.39905817637;
+  HWY_DYNAMIC_DISPATCH(MaltaDiffMap)
+  (lum0, lum1, w_0gt1, w_0lt1, norm1, kLen, kMulli, diffs, block_diff_ac, c);
+}
+
+// Runs the dispatched MaltaDiffMapLF kernel with len = 3.75 and a fixed mulli.
+void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                    const double w_0lt1, const double norm1,
+                    ImageF* HWY_RESTRICT diffs,
+                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  PROFILER_FUNC;
+  const double kLen = 3.75;
+  const double kMulli = 0.611612573796;
+  HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)
+  (lum0, lum1, w_0gt1, w_0lt1, norm1, kLen, kMulli, diffs, block_diff_ac, c);
+}
+}  // namespace
+
+void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
+                                               ImageF& diffmap) const {
+  PROFILER_FUNC;
+  if (xsize_ < 8 || ysize_ < 8) {  // Under 8 px in either dimension: all zeros.
+    ZeroFillImage(&diffmap);
+    return;
+  }
+
+  const float hf_asymmetry_ = params_.hf_asymmetry;  // Local copies of the
+  const float xmul_ = params_.xmul;                  // tuning parameters.
+
+  ImageF diffs(xsize_, ysize_);  // Scratch plane handed to every Malta pass.
+  Image3F block_diff_ac(xsize_, ysize_);
+  ZeroFillImage(&block_diff_ac);  // Zeroed before the passes below write to it.
+  static const double wUhfMalta = 1.10039032555;  // UHF band, index 1.
+  static const double norm1Uhf = 71.7800275169;
+  MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
+               wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1);
+
+  static const double wUhfMaltaX = 173.5;  // UHF band, index 0.
+  static const double norm1UhfX = 5.0;
+  MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
+               wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac,
+               0);
+
+  static const double wHfMalta = 18.7237414387;  // HF band, index 1; sqrt of the asymmetry.
+  static const double norm1Hf = 4498534.45232;
+  MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
+                 wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs,
+                 &block_diff_ac, 1);
+
+  static const double wHfMaltaX = 6923.99476109;  // HF band, index 0.
+  static const double norm1HfX = 8051.15833247;
+  MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_),
+                 wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs,
+                 &block_diff_ac, 0);
+
+  static const double wMfMalta = 37.0819870399;  // MF plane 1; same weight both ways.
+  static const double norm1Mf = 130262059.556;
+  MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf,
+                 &diffs, &block_diff_ac, 1);
+
+  static const double wMfMaltaX = 8246.75321353;  // MF plane 0; same weight both ways.
+  static const double norm1MfX = 1009002.70582;
+  MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX,
+                 norm1MfX, &diffs, &block_diff_ac, 0);
+
+  static const double wmul[9] = {  // Indexed below as [c] for HF, [3 + c] for MF, [6 + c] for LF.
+      400.0,         1.50815703118,  0,
+      2150.0,        10.6195433239,  16.2176043152,
+      29.2353797994, 0.844626970982, 0.703646627719,
+  };
+  Image3F block_diff_dc(xsize_, ysize_);  // NOTE(review): not zero-filled; presumably SetL2Diff overwrites it.
+  for (size_t c = 0; c < 3; ++c) {
+    if (c < 2) {  // No blue channel error accumulated at HF.
+      HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
+      (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
+       &block_diff_ac, c);
+    }
+    HWY_DYNAMIC_DISPATCH(L2Diff)
+    (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c);
+    HWY_DYNAMIC_DISPATCH(SetL2Diff)
+    (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c);
+  }
+
+  ImageF mask;
+  HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
+  (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask,
+   &block_diff_ac.Plane(1));
+  ReleaseTemp();  // Pairs with the Temp() call in the argument list above.
+
+  HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)
+  (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap);
+}
+
+double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
+                                   const ButteraugliParams* params) {
+  PROFILER_FUNC;
+  float worst = 0.0f;  // Running maximum, never below 0; `params` is not read.
+  for (size_t row = 0; row < diffmap.ysize(); ++row) {
+    const float* BUTTERAUGLI_RESTRICT values = diffmap.ConstRow(row);
+    for (size_t col = 0; col < diffmap.xsize(); ++col) {
+      if (worst < values[col]) worst = values[col];
+    }
+  }
+  return worst;
+}
+
+bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
+                        double hf_asymmetry, double xmul, ImageF& diffmap) {
+  ButteraugliParams bundled;  // Fields not set below keep their defaults.
+  bundled.xmul = xmul;
+  bundled.hf_asymmetry = hf_asymmetry;
+  return ButteraugliDiffmap(rgb0, rgb1, bundled, diffmap);
+}
+
+bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
+                        const ButteraugliParams& params, ImageF& diffmap) {
+  PROFILER_FUNC;
+  const size_t xsize = rgb0.xsize();
+  const size_t ysize = rgb0.ysize();
+  if (xsize < 1 || ysize < 1) {
+    return JXL_FAILURE("Zero-sized image");
+  }
+  if (!SameSize(rgb0, rgb1)) {
+    return JXL_FAILURE("Size mismatch");
+  }
+  static const int kMax = 8;  // Minimum width/height handled directly.
+  if (xsize < kMax || ysize < kMax) {
+    // Butteraugli values for small (where xsize or ysize is smaller
+    // than 8 pixels) images are non-sensical, but most likely it is
+    // less disruptive to try to compute something than just give up.
+    // Temporarily extend the borders of the image to fit 8 x 8 size.
+    size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;  // Padding before column 0.
+    size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;  // Padding before row 0.
+    size_t xscaled = std::max<size_t>(kMax, xsize);
+    size_t yscaled = std::max<size_t>(kMax, ysize);
+    Image3F scaled0(xscaled, yscaled);
+    Image3F scaled1(xscaled, yscaled);
+    for (int i = 0; i < 3; ++i) {
+      for (size_t y = 0; y < yscaled; ++y) {
+        for (size_t x = 0; x < xscaled; ++x) {
+          size_t x2 =  // Source column, clamped to [0, xsize - 1].
+              std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
+          size_t y2 =  // Source row, clamped to [0, ysize - 1].
+              std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
+          scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
+          scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
+        }
+      }
+    }
+    ImageF diffmap_scaled;
+    const bool ok =  // Recurses once; the padded images are at least 8 x 8.
+        ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
+    diffmap = ImageF(xsize, ysize);
+    for (size_t y = 0; y < ysize; ++y) {  // Copy back the unpadded region only.
+      for (size_t x = 0; x < xsize; ++x) {
+        diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
+      }
+    }
+    return ok;
+  }
+  ButteraugliComparator butteraugli(rgb0, params);
+  butteraugli.Diffmap(rgb1, diffmap);
+  return true;
+}
+
+bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
+                          float hf_asymmetry, float xmul, ImageF& diffmap,
+                          double& diffvalue) {
+  ButteraugliParams bundled;  // Fields not set below keep their defaults.
+  bundled.xmul = xmul;
+  bundled.hf_asymmetry = hf_asymmetry;
+  return ButteraugliInterface(rgb0, rgb1, bundled, diffmap, diffvalue);
+}
+
+bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
+                          const ButteraugliParams& params, ImageF& diffmap,
+                          double& diffvalue) {
+#if JXL_PROFILER_ENABLED
+  auto trace_start = std::chrono::steady_clock::now();
+#endif
+  if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
+    return false;  // diffvalue is left untouched on failure.
+  }
+#if JXL_PROFILER_ENABLED
+  auto trace_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> elapsed = trace_end - trace_start;
+  const size_t mp = rgb0.xsize() * rgb0.ysize();  // Pixel count; scaled by 1E-6 below.
+  printf("diff MP/s %f\n", mp / elapsed.count() * 1E-6);
+#endif
+  diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);  // Maximum of the diffmap.
+  return true;
+}
+
+double ButteraugliFuzzyClass(double score) {
+  static const double kWidthUp = 4.8;    // Slope used when score >= 1.
+  static const double kWidthDown = 4.8;  // Slope used when score < 1.
+  static const double kPeak = 2.0;
+  static const double kSplit = 0.7777;
+  const double offset = score - 1.0;
+  if (!(score < 1.0)) {
+    // Result lies in [0 .. kSplit].
+    double upper = kPeak / (1.0 + exp(offset * kWidthUp));
+    upper *= kSplit;
+    return upper;
+  }
+  // Result lies in [kSplit .. 2.0].
+  double lower = kPeak / (1.0 + exp(offset * kWidthDown));
+  lower -= 1.0;           // from [1 .. 2] to [0 .. 1]
+  lower *= 2.0 - kSplit;  // from [0 .. 1] to [0 .. 2.0 - kSplit]
+  lower += kSplit;        // from [0 .. 2.0 - kSplit] to [kSplit .. 2.0]
+  return lower;
+}
+
+// #define PRINT_OUT_NORMALIZATION
+
+double ButteraugliFuzzyInverse(double seek) {
+  // Step search for the score whose fuzzy class equals `seek`: start at 0 and
+  // move by a step that halves each round (1, 0.5, 0.25, ... down to 1e-10).
+  // ButteraugliFuzzyClass falls as the score rises, so a class below the
+  // target means the current score is too high and has to be lowered.
+  double score = 0;
+  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
+  for (double step = 1.0; step >= 1e-10; step *= 0.5) {
+    const bool too_high = ButteraugliFuzzyClass(score) < seek;
+    score += too_high ? -step : step;
+  }
+#ifdef PRINT_OUT_NORMALIZATION
+  if (seek == 1.0) {
+    fprintf(stderr, "Fuzzy inverse %g\n", score);
+  }
+#endif
+  return score;
+}
+
+#ifdef PRINT_OUT_NORMALIZATION
+static double print_out_normalization = ButteraugliFuzzyInverse(1.0);
+#endif
+
+namespace {
+
+void ScoreToRgb(double score, double good_threshold, double bad_threshold,
+                float rgb[3]) {
+  static const double heatmap[12][3] = {  // Read-only table; no per-call rebuild.
+      {0, 0, 0},       {0, 0, 1},
+      {0, 1, 1},       {0, 1, 0},  // Good level
+      {1, 1, 0},       {1, 0, 0},  // Bad level
+      {1, 0, 1},       {0.5, 0.5, 1.0},
+      {1.0, 0.5, 0.5},  // Pastel colors for the very bad quality range.
+      {1.0, 1.0, 0.5}, {1, 1, 1},
+      {1, 1, 1},  // Last color repeated to have a solid range of white.
+  };
+  if (score < good_threshold) {  // Below "good": rescaled into [0, 0.3).
+    score = (score / good_threshold) * 0.3;
+  } else if (score < bad_threshold) {  // Between the thresholds: [0.3, 0.45).
+    score = 0.3 +
+            (score - good_threshold) / (bad_threshold - good_threshold) * 0.15;
+  } else {  // At or above "bad": 0.45 and up, clamped further below.
+    score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5;
+  }
+  static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]);
+  score = std::min<double>(std::max<double>(score * (kTableSize - 1), 0.0),
+                           kTableSize - 2);  // Keeps ix + 1 inside the table.
+  int ix = static_cast<int>(score);
+  ix = std::min(std::max(0, ix), kTableSize - 2);  // Handle NaN
+  double mix = score - ix;  // Blend weight of the next table entry.
+  for (int i = 0; i < 3; ++i) {
+    double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i];
+    rgb[i] = pow(v, 0.5);  // Square root of the blended channel value.
+  }
+}
+
+}  // namespace
+
+Image3F CreateHeatMapImage(const ImageF& distmap, double good_threshold,
+                           double bad_threshold) {
+  const size_t width = distmap.xsize();
+  Image3F colored(width, distmap.ysize());  // Same dimensions as the input.
+  for (size_t row = 0; row < distmap.ysize(); ++row) {
+    const float* BUTTERAUGLI_RESTRICT scores = distmap.ConstRow(row);
+    float* BUTTERAUGLI_RESTRICT plane0 = colored.PlaneRow(0, row);
+    float* BUTTERAUGLI_RESTRICT plane1 = colored.PlaneRow(1, row);
+    float* BUTTERAUGLI_RESTRICT plane2 = colored.PlaneRow(2, row);
+    for (size_t col = 0; col < width; ++col) {
+      float color[3];  // Filled by ScoreToRgb for this pixel.
+      ScoreToRgb(scores[col], good_threshold, bad_threshold, color);
+      plane0[col] = color[0];
+      plane1[col] = color[1];
+      plane2[col] = color[2];
+    }
+  }
+  return colored;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h
new file mode 100644
index 0000000000..652b9528c4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h
@@ -0,0 +1,209 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
+
+#ifndef LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
+#define LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <atomic>
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+#define BUTTERAUGLI_ENABLE_CHECKS 0
+#define BUTTERAUGLI_RESTRICT JXL_RESTRICT
+
+// This is the main interface to butteraugli image similarity
+// analysis function.
+
+namespace jxl {
+
+struct ButteraugliParams {
+  // Multiplier for penalizing new HF artifacts more than blurring away
+  // features. 1.0=neutral.
+  float hf_asymmetry = 1.0f;
+
+  // Multiplier for the psychovisual difference in the X channel.
+  float xmul = 1.0f;
+
+  // Number of nits that correspond to 1.0f input values.
+  float intensity_target = 80.0f;
+};
+
+// ButteraugliInterface defines the public interface for butteraugli.
+//
+// It calculates the difference between rgb0 and rgb1.
+//
+// rgb0 and rgb1 contain the images. rgb0[c][px] and rgb1[c][px] contain
+// the red image for c == 0, green for c == 1, blue for c == 2. Location index
+// px is calculated as y * xsize + x.
+//
+// Value of pixels of images rgb0 and rgb1 need to be represented as raw
+// intensity. Most image formats store gamma corrected intensity in pixel
+// values. This gamma correction has to be removed, by applying the following
+// function to values in the 0-1 range:
+// butteraugli_val = pow(input_val, gamma);
+// A typical value of gamma is 2.2. It is usually stored in the image header.
+// Take care not to confuse that value with its inverse. The gamma value should
+// be always greater than one.
+// Butteraugli does not work as intended if the caller does not remove the
+// gamma correction.
+//
+// hf_asymmetry is a multiplier for penalizing new HF artifacts more than
+// blurring away features (1.0 -> neutral).
+//
+// diffmap will contain an image of the size xsize * ysize, containing
+// localized differences for values px (indexed with the px the same as rgb0
+// and rgb1). diffvalue will give a global score of similarity.
+//
+// A diffvalue smaller than kButteraugliGood indicates that images can be
+// observed as the same image.
+// diffvalue larger than kButteraugliBad indicates that a difference between
+// the images can be observed.
+// A diffvalue between kButteraugliGood and kButteraugliBad indicates that
+// a subtle difference can be observed between the images.
+//
+// Returns true on success.
+bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1,
+                          const ButteraugliParams &params, ImageF &diffmap,
+                          double &diffvalue);
+
+// Deprecated (calls the previous function)
+bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1,
+                          float hf_asymmetry, float xmul, ImageF &diffmap,
+                          double &diffvalue);
+
+// Converts the butteraugli score into fuzzy class values that are continuous
+// at the class boundary. The class boundary location is based on human
+// raters, but the slope is arbitrary. Particularly, it does not reflect
+// the expectation value of probabilities of the human raters. It is just
+// expected that a smoother class boundary will allow for higher-level
+// optimization algorithms to work faster.
+//
+// Returns 2.0 for a perfect match, and 1.0 for 'ok', 0.0 for bad. Because the
+// scoring is fuzzy, a butteraugli score of 0.96 would return a class of
+// around 1.9.
+double ButteraugliFuzzyClass(double score);
+
+// Input values should be in range 0 (bad) to 2 (good). Use
+// kButteraugliNormalization as normalization.
+double ButteraugliFuzzyInverse(double seek);
+
+// Implementation details, don't use anything below or your code will
+// break in the future.
+
+#ifdef _MSC_VER
+#define BUTTERAUGLI_INLINE __forceinline
+#else
+#define BUTTERAUGLI_INLINE inline
+#endif
+
+#ifdef __clang__
+// Early versions of Clang did not support __builtin_assume_aligned.
+#define BUTTERAUGLI_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif defined(__GNUC__)
+#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 1
+#else
+#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 0
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* aligned = (float*)BUTTERAUGLI_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if BUTTERAUGLI_HAS_ASSUME_ALIGNED
+#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) \
+  __builtin_assume_aligned((ptr), (align))
+#else
+#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) (ptr)
+#endif  // BUTTERAUGLI_HAS_ASSUME_ALIGNED
+
+struct PsychoImage {
+  ImageF uhf[2];  // XY
+  ImageF hf[2];   // XY
+  Image3F mf;     // XYB
+  Image3F lf;     // XYB
+};
+
+// Blur needs a transposed image.
+// Hold it here and only allocate on demand to reduce memory usage.
+struct BlurTemp {
+  ImageF *GetTransposed(const ImageF &in) {
+    if (transposed_temp.xsize() == 0) {  // Allocate only while still empty.
+      transposed_temp = ImageF(in.ysize(), in.xsize());  // Dimensions swapped.
+    }
+    return &transposed_temp;  // Once allocated, reused whatever the size of `in`.
+  }
+
+  ImageF transposed_temp;
+};
+
+class ButteraugliComparator {
+ public:
+  // Butteraugli is calibrated at xmul = 1.0. We add a multiplier here so that
+  // we can test the hypothesis that a higher weighting of the X channel would
+  // improve results at higher Butteraugli values.
+  ButteraugliComparator(const Image3F &rgb0, const ButteraugliParams &params);
+  virtual ~ButteraugliComparator() = default;
+
+  // Computes the butteraugli map between the original image given in the
+  // constructor and the distorted image given here.
+  void Diffmap(const Image3F &rgb1, ImageF &result) const;
+
+  // Same as above, but OpsinDynamicsImage() was already applied.
+  void DiffmapOpsinDynamicsImage(const Image3F &xyb1, ImageF &result) const;
+
+  // Same as above, but the frequency decomposition was already applied.
+  void DiffmapPsychoImage(const PsychoImage &pi1, ImageF &diffmap) const;
+
+  void Mask(ImageF *BUTTERAUGLI_RESTRICT mask) const;
+
+ private:
+  Image3F *Temp() const;
+  void ReleaseTemp() const;
+
+  const size_t xsize_;
+  const size_t ysize_;
+  ButteraugliParams params_;
+  PsychoImage pi0_;  // Frequency bands of the original image.
+
+  // Shared temporary image storage to reduce the number of allocations;
+  // obtained via Temp(), must call ReleaseTemp when no longer needed.
+  mutable Image3F temp_;
+  mutable std::atomic_flag temp_in_use_ = ATOMIC_FLAG_INIT;
+
+  mutable BlurTemp blur_temp_;  // Mutable so const Diffmap* methods can use it.
+  std::unique_ptr<ButteraugliComparator> sub_;
+};
+
+// Deprecated.
+bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1,
+                        double hf_asymmetry, double xmul, ImageF &diffmap);
+
+bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1,
+                        const ButteraugliParams &params, ImageF &diffmap);
+
+double ButteraugliScoreFromDiffmap(const ImageF &diffmap,
+                                   const ButteraugliParams *params = nullptr);
+
+// Generate rgb-representation of the distance between two images.
+Image3F CreateHeatMapImage(const ImageF &distmap, double good_threshold,
+                           double bad_threshold);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc b/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc
new file mode 100644
index 0000000000..3fdec09725
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc
@@ -0,0 +1,103 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/butteraugli.h>
+#include <jxl/butteraugli_cxx.h>
+
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+TEST(ButteraugliTest, Lossless) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(  // Same buffer on both sides.
+      api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(),
+      &pixel_format, pixels.data(), pixels.size()));
+  EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));  // Identical input: distance 0.
+}
+
+TEST(ButteraugliTest, Distmap) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(  // Same buffer on both sides.
+      api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(),
+      &pixel_format, pixels.data(), pixels.size()));
+  EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
+  const float* distmap;
+  uint32_t row_stride;  // Used below as an element (float) stride, not bytes.
+  JxlButteraugliResultGetDistmap(result.get(), &distmap, &row_stride);
+  for (uint32_t y = 0; y < ysize; y++) {
+    for (uint32_t x = 0; x < xsize; x++) {
+      EXPECT_EQ(0.0, distmap[y * row_stride + x]);  // Every pixel must be exactly 0.
+    }
+  }
+}
+
+TEST(ButteraugliTest, Distorted) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> orig_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  std::vector<uint8_t> dist_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  dist_pixels[0] += 128;  // Alter a single byte of the otherwise identical copy.
+
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(
+      api.get(), xsize, ysize, &pixel_format, orig_pixels.data(),
+      orig_pixels.size(), &pixel_format, dist_pixels.data(),
+      dist_pixels.size()));
+  EXPECT_NE(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));  // Change must be detected.
+}
+
+TEST(ButteraugliTest, Api) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> orig_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  std::vector<uint8_t> dist_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  dist_pixels[0] += 128;  // Alter a single byte of the otherwise identical copy.
+
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliApiSetHFAsymmetry(api.get(), 1.0f);  // Run 0: asymmetry 1.0, target 250.
+  JxlButteraugliApiSetIntensityTarget(api.get(), 250.0f);
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(
+      api.get(), xsize, ysize, &pixel_format, orig_pixels.data(),
+      orig_pixels.size(), &pixel_format, dist_pixels.data(),
+      dist_pixels.size()));
+  double distance0 = JxlButteraugliResultGetDistance(result.get(), 8.0);
+
+  JxlButteraugliApiSetHFAsymmetry(api.get(), 2.0f);  // Run 1: only the asymmetry changes.
+  result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format,
+                                     orig_pixels.data(), orig_pixels.size(),
+                                     &pixel_format, dist_pixels.data(),
+                                     dist_pixels.size()));
+  double distance1 = JxlButteraugliResultGetDistance(result.get(), 8.0);
+
+  EXPECT_NE(distance0, distance1);  // The asymmetry setter must affect the score.
+
+  JxlButteraugliApiSetIntensityTarget(api.get(), 80.0f);  // Run 2: only the target changes.
+  result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format,
+                                     orig_pixels.data(), orig_pixels.size(),
+                                     &pixel_format, dist_pixels.data(),
+                                     dist_pixels.size()));
+  double distance2 = JxlButteraugliResultGetDistance(result.get(), 8.0);
+
+  EXPECT_NE(distance1, distance2);  // The intensity setter must affect the score.
+}
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc b/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc
new file mode 100644
index 0000000000..c5a1a8e506
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc
@@ -0,0 +1,203 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/butteraugli.h>
+#include <jxl/parallel_runner.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atomic>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/memory_manager_internal.h"
+
+namespace {
+
+void SetMetadataFromPixelFormat(const JxlPixelFormat* pixel_format,
+                                jxl::ImageMetadata* metadata) {
+  uint32_t alpha_depth = 0;  // Alpha bit depth, applied only if alpha exists.
+  switch (pixel_format->data_type) {
+    case JXL_TYPE_UINT8:
+      metadata->SetUintSamples(8);
+      alpha_depth = 8;
+      break;
+    case JXL_TYPE_UINT16:
+      metadata->SetUintSamples(16);
+      alpha_depth = 16;
+      break;
+    case JXL_TYPE_FLOAT16:
+      metadata->SetFloat16Samples();
+      alpha_depth = 16;
+      break;
+    case JXL_TYPE_FLOAT:
+      metadata->SetFloat32Samples();
+      alpha_depth = 16;
+      break;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+  const auto channels = pixel_format->num_channels;
+  const bool has_alpha = (channels == 2 || channels == 4);
+  if (has_alpha) metadata->SetAlphaBits(alpha_depth);
+}
+
+}  // namespace
+
+struct JxlButteraugliResultStruct {
+  JxlMemoryManager memory_manager;
+
+  jxl::ImageF distmap;
+  jxl::ButteraugliParams params;
+};
+
+struct JxlButteraugliApiStruct {
+  // Multiplier for penalizing new HF artifacts more than blurring away
+  // features. 1.0=neutral.
+  float hf_asymmetry = 1.0f;
+
+  // Multiplier for the psychovisual difference in the X channel.
+  float xmul = 1.0f;
+
+  // Number of nits that correspond to 1.0f input values.
+  float intensity_target = jxl::kDefaultIntensityTarget;
+
+  JxlCmsInterface cms;
+  JxlMemoryManager memory_manager;
+  std::unique_ptr<jxl::ThreadPool> thread_pool{nullptr};
+};
+
+JxlButteraugliApi* JxlButteraugliApiCreate(
+    const JxlMemoryManager* memory_manager) {
+  // Turn the caller-supplied manager into a local copy we can allocate with.
+  JxlMemoryManager resolved;
+  if (!jxl::MemoryManagerInit(&resolved, memory_manager)) {
+    return nullptr;
+  }
+  void* storage = jxl::MemoryManagerAlloc(&resolved, sizeof(JxlButteraugliApi));
+  if (storage == nullptr) return nullptr;
+  // The object lives in manager-provided storage, hence placement new.
+  auto* api = new (storage) JxlButteraugliApi();
+  api->memory_manager = resolved;
+  api->cms = jxl::GetJxlCms();
+  return api;
+}
+
+void JxlButteraugliApiSetParallelRunner(JxlButteraugliApi* api,
+                                        JxlParallelRunner parallel_runner,
+                                        void* parallel_runner_opaque) {
+  api->thread_pool = jxl::make_unique<jxl::ThreadPool>(parallel_runner,  // Replaces any earlier pool.
+                                                       parallel_runner_opaque);
+}
+
+void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api, float v) {
+  api->hf_asymmetry = v;  // Stored only; read by later JxlButteraugliCompute calls.
+}
+
+void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api, float v) {
+  api->intensity_target = v;  // Stored only; read by later JxlButteraugliCompute calls.
+}
+
+void JxlButteraugliApiDestroy(JxlButteraugliApi* api) {
+  if (api == nullptr) return;  // Destroying a null handle does nothing.
+  // Copy the manager out first: it is stored inside the object being freed.
+  JxlMemoryManager manager = api->memory_manager;
+  // A custom free function is used, so the destructor is run by hand.
+  api->~JxlButteraugliApi();
+  jxl::MemoryManagerFree(&manager, api);
+}
+
+JxlButteraugliResult* JxlButteraugliCompute(
+    const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize,
+    const JxlPixelFormat* pixel_format_orig, const void* buffer_orig,
+    size_t size_orig, const JxlPixelFormat* pixel_format_dist,
+    const void* buffer_dist, size_t size_dist) {
+  jxl::ImageMetadata orig_metadata;  // Must outlive orig_ib, which holds its address.
+  SetMetadataFromPixelFormat(pixel_format_orig, &orig_metadata);
+  jxl::ImageBundle orig_ib(&orig_metadata);
+  jxl::ColorEncoding c_current;  // Reused for the distorted image further down.
+  if (pixel_format_orig->data_type == JXL_TYPE_FLOAT) {  // Float input: linear sRGB.
+    c_current =
+        jxl::ColorEncoding::LinearSRGB(pixel_format_orig->num_channels < 3);
+  } else {  // Every other sample type: sRGB.
+    c_current = jxl::ColorEncoding::SRGB(pixel_format_orig->num_channels < 3);
+  }
+  if (!jxl::BufferToImageBundle(*pixel_format_orig, xsize, ysize, buffer_orig,
+                                size_orig, api->thread_pool.get(), c_current,
+                                &orig_ib)) {
+    return nullptr;  // Original buffer could not be converted.
+  }
+
+  jxl::ImageMetadata dist_metadata;  // Same steps again for the distorted image.
+  SetMetadataFromPixelFormat(pixel_format_dist, &dist_metadata);
+  jxl::ImageBundle dist_ib(&dist_metadata);
+  if (pixel_format_dist->data_type == JXL_TYPE_FLOAT) {
+    c_current =
+        jxl::ColorEncoding::LinearSRGB(pixel_format_dist->num_channels < 3);
+  } else {
+    c_current = jxl::ColorEncoding::SRGB(pixel_format_dist->num_channels < 3);
+  }
+  if (!jxl::BufferToImageBundle(*pixel_format_dist, xsize, ysize, buffer_dist,
+                                size_dist, api->thread_pool.get(), c_current,
+                                &dist_ib)) {
+    return nullptr;  // Distorted buffer could not be converted.
+  }
+
+  void* alloc = jxl::MemoryManagerAlloc(&api->memory_manager,
+                                        sizeof(JxlButteraugliResult));
+  if (!alloc) return nullptr;
+  // Placement new constructor on allocated memory
+  JxlButteraugliResult* result = new (alloc) JxlButteraugliResult();
+  result->memory_manager = api->memory_manager;  // Needed later by JxlButteraugliResultDestroy.
+  result->params.hf_asymmetry = api->hf_asymmetry;
+  result->params.xmul = api->xmul;
+  result->params.intensity_target = api->intensity_target;
+  jxl::ButteraugliDistance(orig_ib, dist_ib, result->params, api->cms,  // Return value unused;
+                           &result->distmap, api->thread_pool.get());   // only distmap is kept.
+
+  return result;
+}
+
+float JxlButteraugliResultGetDistance(const JxlButteraugliResult* result,
+                                      float pnorm) {
+  return static_cast<float>(  // Computed from the stored distmap on every call.
+      jxl::ComputeDistanceP(result->distmap, result->params, pnorm));
+}
+
+void JxlButteraugliResultGetDistmap(const JxlButteraugliResult* result,
+                                    const float** buffer,
+                                    uint32_t* row_stride) {
+  *buffer = result->distmap.Row(0);  // Points into `result`; no copy is made.
+  *row_stride = result->distmap.PixelsPerRow();  // Stride in pixels (floats), not bytes.
+}
+
+float JxlButteraugliResultGetMaxDistance(const JxlButteraugliResult* result) {
+  // Largest distmap value; stays 0 if the map is empty or has no value above 0.
+  float max_distance = 0.0;
+  for (size_t y = 0; y < result->distmap.ysize(); y++) {
+    const float* row = result->distmap.ConstRow(y);  // One lookup per row.
+    for (size_t x = 0; x < result->distmap.xsize(); x++) {
+      if (row[x] > max_distance) max_distance = row[x];
+    }
+  }
+  return max_distance;
+}
+
+void JxlButteraugliResultDestroy(JxlButteraugliResult* result) {
+  if (result == nullptr) return;  // Destroying a null handle does nothing.
+  // Copy the manager out first: it is stored inside the object being freed.
+  JxlMemoryManager manager = result->memory_manager;
+  // A custom free function is used, so the destructor is run by hand.
+  result->~JxlButteraugliResult();
+  jxl::MemoryManagerFree(&manager, result);
+}
diff --git a/third_party/jpeg-xl/lib/jxl/byte_order_test.cc b/third_party/jpeg-xl/lib/jxl/byte_order_test.cc
new file mode 100644
index 0000000000..17d7ef6643
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/byte_order_test.cc
@@ -0,0 +1,53 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/byte_order.h"
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(ByteOrderTest, TestRoundTripBE16) {
+  const uint32_t in = 0x1234;  // Two distinct bytes, so a swapped read differs.
+  uint8_t buf[2];
+  StoreBE16(in, buf);
+  EXPECT_EQ(in, LoadBE16(buf));  // Same byte order round-trips.
+  EXPECT_NE(in, LoadLE16(buf));  // Opposite byte order must not.
+}
+
+TEST(ByteOrderTest, TestRoundTripLE16) {
+  const uint32_t in = 0x1234;  // Two distinct bytes, so a swapped read differs.
+  uint8_t buf[2];
+  StoreLE16(in, buf);
+  EXPECT_EQ(in, LoadLE16(buf));
+  EXPECT_NE(in, LoadBE16(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripBE32) {
+  const uint32_t in = 0xFEDCBA98u;  // Not a byte palindrome.
+  uint8_t buf[4];
+  StoreBE32(in, buf);
+  EXPECT_EQ(in, LoadBE32(buf));
+  EXPECT_NE(in, LoadLE32(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripLE32) {
+  const uint32_t in = 0xFEDCBA98u;  // Not a byte palindrome.
+  uint8_t buf[4];
+  StoreLE32(in, buf);
+  EXPECT_EQ(in, LoadLE32(buf));
+  EXPECT_NE(in, LoadBE32(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripLE64) {
+  const uint64_t in = 0xFEDCBA9876543210ull;
+  uint8_t buf[8];
+  StoreLE64(in, buf);
+  EXPECT_EQ(in, LoadLE64(buf));  // Only the LE round trip is checked for 64 bits.
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc
new file mode 100644
index 0000000000..63d21cbb4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/chroma_from_luma.h"
+
+namespace jxl {
+
+ColorCorrelationMap::ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB)
+    : ytox_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)),  // One entry
+      ytob_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)) {  // per tile.
+  ZeroFillImage(&ytox_map);  // Both maps start out all zero.
+  ZeroFillImage(&ytob_map);
+  if (!XYB) {
+    base_correlation_b_ = 0;  // Cleared only when XYB is false.
+  }
+  RecomputeDCFactors();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h
new file mode 100644
index 0000000000..9a7f3d45bc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CHROMA_FROM_LUMA_H_
+#define LIB_JXL_CHROMA_FROM_LUMA_H_
+
+// Chroma-from-luma, computed using heuristics to determine the best linear
+// model for the X and B channels from the Y channel.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+// Tile is the rectangular grid of blocks that share color correlation
+// parameters ("factor_x/b" such that residual_b = blue - Y * factor_b).
+static constexpr size_t kColorTileDim = 64;
+
+static_assert(kColorTileDim % kBlockDim == 0,
+              "Color tile dim should be divisible by block dim");
+static constexpr size_t kColorTileDimInBlocks = kColorTileDim / kBlockDim;
+
+static_assert(kGroupDimInBlocks % kColorTileDimInBlocks == 0,
+              "Group dim should be divisible by color tile dim");
+
+static constexpr uint8_t kDefaultColorFactor = 84;
+
+// JPEG DCT coefficients are at most 1024. CfL constants are at most 127, and
+// the ratio of two entries in a JPEG quantization table is at most 255. Thus,
+// since the CfL denominator is 84, this leaves 12 bits of mantissa to be used.
+// For extra caution, we use 11.
+static constexpr uint8_t kCFLFixedPointPrecision = 11;
+
+static constexpr U32Enc kColorFactorDist(Val(kDefaultColorFactor), Val(256),
+                                         BitsOffset(8, 2), BitsOffset(16, 258));
+
+struct ColorCorrelationMap {
+  ColorCorrelationMap() = default;
+  // xsize/ysize are in pixels.
+  // Set XYB=false to do something close to a no-op cmap (needed for now since
+  // cmap is mandatory).
+  ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB = true);
+
+  float YtoXRatio(int32_t x_factor) const {  // base_x + x_factor / color_factor
+    return base_correlation_x_ + x_factor * color_scale_;
+  }
+
+  float YtoBRatio(int32_t b_factor) const {  // base_b + b_factor / color_factor
+    return base_correlation_b_ + b_factor * color_scale_;
+  }
+
+  Status DecodeDC(BitReader* br) {  // Reads the DC-level parameters from `br`.
+    if (br->ReadFixedBits<1>() == 1) {
+      // All default: nothing else is read and no member is modified.
+      return true;
+    }
+    SetColorFactor(U32Coder::Read(kColorFactorDist, br));  // Applied pre-checks.
+    JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_x_));
+    if (std::abs(base_correlation_x_) > 4.0f) {  // NOTE(review): <cmath> origin?
+      return JXL_FAILURE("Base X correlation is out of range");
+    }
+    JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_b_));
+    if (std::abs(base_correlation_b_) > 4.0f) {  // Members stay updated on fail.
+      return JXL_FAILURE("Base B correlation is out of range");
+    }
+    ytox_dc_ = static_cast<int>(br->ReadFixedBits<kBitsPerByte>()) +
+               std::numeric_limits<int8_t>::min();  // Shift by the int8 minimum.
+    ytob_dc_ = static_cast<int>(br->ReadFixedBits<kBitsPerByte>()) +
+               std::numeric_limits<int8_t>::min();  // NOTE(review): <limits>?
+    RecomputeDCFactors();
+    return true;
+  }
+
+  // We consider a CfL map to be JPEG-reconstruction-compatible if base
+  // correlation is 0, no DC correlation is used, and we use the default color
+  // factor.
+  bool IsJPEGCompatible() const {
+    return base_correlation_x_ == 0 && base_correlation_b_ == 0 &&
+           ytob_dc_ == 0 && ytox_dc_ == 0 &&
+           color_factor_ == kDefaultColorFactor;
+  }
+
+  int32_t RatioJPEG(int32_t factor) const {  // Integer math; division truncates.
+    return factor * (1 << kCFLFixedPointPrecision) / kDefaultColorFactor;
+  }
+
+  void SetColorFactor(uint32_t factor) {  // Also refreshes scale and DC factors.
+    color_factor_ = factor;
+    color_scale_ = 1.0f / color_factor_;
+    RecomputeDCFactors();
+  }
+
+  void SetYToBDC(int32_t ytob_dc) {  // Stores the value, refreshes DC factors.
+    ytob_dc_ = ytob_dc;
+    RecomputeDCFactors();
+  }
+  void SetYToXDC(int32_t ytox_dc) {  // Stores the value, refreshes DC factors.
+    ytox_dc_ = ytox_dc;
+    RecomputeDCFactors();
+  }
+
+  int32_t GetYToXDC() const { return ytox_dc_; }
+  int32_t GetYToBDC() const { return ytob_dc_; }
+  float GetColorFactor() const { return color_factor_; }  // uint32_t as float.
+  float GetBaseCorrelationX() const { return base_correlation_x_; }
+  float GetBaseCorrelationB() const { return base_correlation_b_; }
+
+  const float* DCFactors() const { return dc_factors_; }  // 4-float array.
+
+  void RecomputeDCFactors() {  // Writes slots 0 (X) and 2 (B) only.
+    dc_factors_[0] = YtoXRatio(ytox_dc_);
+    dc_factors_[2] = YtoBRatio(ytob_dc_);
+  }
+
+  ImageSB ytox_map;
+  ImageSB ytob_map;
+
+ private:
+  float dc_factors_[4] = {};  // Slots 1 and 3 keep this zero initializer.
+  // range of factor: -1.51 to +1.52
+  uint32_t color_factor_ = kDefaultColorFactor;
+  float color_scale_ = 1.0f / color_factor_;  // Kept equal to 1 / color_factor_.
+  float base_correlation_x_ = 0.0f;
+  float base_correlation_b_ = kYToBRatio;
+  int32_t ytox_dc_ = 0;
+  int32_t ytob_dc_ = 0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CHROMA_FROM_LUMA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/codec_in_out.h b/third_party/jpeg-xl/lib/jxl/codec_in_out.h
new file mode 100644
index 0000000000..9e48b5e937
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/codec_in_out.h
@@ -0,0 +1,116 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CODEC_IN_OUT_H_
+#define LIB_JXL_CODEC_IN_OUT_H_
+
+// Holds inputs/outputs for decoding/encoding images.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/luminance.h"
+
+namespace jxl {
+
+// Optional text/EXIF metadata, one raw byte vector per kind (empty by default).
+struct Blobs {
+  std::vector<uint8_t> exif;
+  std::vector<uint8_t> iptc;
+  std::vector<uint8_t> jumbf;
+  std::vector<uint8_t> xmp;
+};
+
+// Holds a preview, a main image or one or more frames, plus the inputs/outputs
+// to/from decoding/encoding.
+class CodecInOut {
+ public:
+  CodecInOut() : preview_frame(&metadata.m) {  // Starts with exactly one frame.
+    frames.reserve(1);
+    frames.emplace_back(&metadata.m);
+  }
+
+  // Move-only. NOTE(review): bundles were given &metadata.m; check after moves.
+  CodecInOut(CodecInOut&&) = default;
+  CodecInOut& operator=(CodecInOut&&) = default;
+
+  size_t LastStillFrame() const {  // Index of the frame returned by Main().
+    JXL_DASSERT(!frames.empty());
+    size_t last = 0;
+    for (size_t i = 0; i < frames.size(); i++) {
+      last = i;
+      if (frames[i].duration > 0) break;  // Stop at first nonzero duration.
+    }
+    return last;  // Final index when every frame has zero duration.
+  }
+
+  ImageBundle& Main() { return frames[LastStillFrame()]; }
+  const ImageBundle& Main() const { return frames[LastStillFrame()]; }
+
+  // If c_current.IsGray(), all planes must be identical.
+  void SetFromImage(Image3F&& color, const ColorEncoding& c_current) {
+    Main().SetFromImage(std::move(color), c_current);
+    SetIntensityTarget(&this->metadata.m);
+    SetSize(Main().xsize(), Main().ysize());  // Keep metadata size in sync.
+  }
+
+  void SetSize(size_t xsize, size_t ysize) {  // Aborts if the size is rejected.
+    JXL_CHECK(metadata.size.Set(xsize, ysize));
+  }
+
+  void CheckMetadata() const {  // Aborts (JXL_CHECK) on any inconsistency.
+    JXL_CHECK(metadata.m.bit_depth.bits_per_sample != 0);
+    JXL_CHECK(!metadata.m.color_encoding.ICC().empty());
+
+    if (preview_frame.xsize() != 0) preview_frame.VerifyMetadata();
+    JXL_CHECK(preview_frame.metadata() == &metadata.m);
+
+    for (const ImageBundle& ib : frames) {
+      ib.VerifyMetadata();
+      JXL_CHECK(ib.metadata() == &metadata.m);  // All frames share one metadata.
+    }
+  }
+
+  size_t xsize() const { return metadata.size.xsize(); }
+  size_t ysize() const { return metadata.size.ysize(); }
+  void ShrinkTo(size_t xsize, size_t ysize) {  // Shrinks frames, then metadata.
+    // preview is unaffected.
+    for (ImageBundle& ib : frames) {
+      ib.ShrinkTo(xsize, ysize);
+    }
+    SetSize(xsize, ysize);
+  }
+
+  // -- DECODER OUTPUT, ENCODER INPUT:
+
+  // Metadata stored into / retrieved from bitstreams.
+
+  Blobs blobs;
+
+  CodecMetadata metadata;  // applies to preview and all frames
+
+  // If metadata.have_preview:
+  ImageBundle preview_frame;
+
+  std::vector<ImageBundle> frames;  // size=1 if !metadata.have_animation
+
+  // If the image should be written to a JPEG, use this quality for encoding.
+  size_t jpeg_quality = 0;  // Initialized so reads and moves are well-defined.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CODEC_IN_OUT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order.cc b/third_party/jpeg-xl/lib/jxl/coeff_order.cc
new file mode 100644
index 0000000000..43adafd82a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order.cc
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/coeff_order.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/lehmer_code.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+uint32_t CoeffOrderContext(uint32_t val) {
+  uint32_t tok, unused_nbits, unused_bits;  // Only the token is used below.
+  HybridUintConfig(0, 0, 0).Encode(val, &tok, &unused_nbits, &unused_bits);
+  return tok < kPermutationContexts - 1 ? tok : kPermutationContexts - 1;
+}
+
+namespace {
+Status ReadPermutation(size_t skip, size_t size, coeff_order_t* order,
+                       BitReader* br, ANSSymbolReader* reader,
+                       const std::vector<uint8_t>& context_map) {
+  std::vector<LehmerT> lehmer(size);  // Only indices [skip, end) get assigned.
+  // temp space needs to be as large as the next power of 2, so doubling the
+  // allocated size is enough.
+  std::vector<uint32_t> temp(size * 2);
+  uint32_t end =
+      reader->ReadHybridUint(CoeffOrderContext(size), br, context_map) + skip;
+  if (end > size) {  // NOTE(review): sum is narrowed to uint32_t; can it wrap?
+    return JXL_FAILURE("Invalid permutation size");
+  }
+  uint32_t last = 0;  // Previously read value; picks the next symbol context.
+  for (size_t i = skip; i < end; ++i) {
+    lehmer[i] =
+        reader->ReadHybridUint(CoeffOrderContext(last), br, context_map);
+    last = lehmer[i];
+    if (lehmer[i] + i >= size) {  // Entry i must be smaller than size - i.
+      return JXL_FAILURE("Invalid lehmer code");
+    }
+  }
+  if (order == nullptr) return true;  // Symbols were consumed; nothing stored.
+  DecodeLehmerCode(lehmer.data(), temp.data(), size, order);
+  return true;
+}
+
+}  // namespace
+
+Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order,
+                         BitReader* br) {
+  ANSCode ans_code;
+  std::vector<uint8_t> ctx_map;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kPermutationContexts, &ans_code, &ctx_map));
+  ANSSymbolReader symbol_reader(&ans_code, br);
+  JXL_RETURN_IF_ERROR(
+      ReadPermutation(skip, size, order, br, &symbol_reader, ctx_map));
+  if (symbol_reader.CheckANSFinalState()) {
+    return true;
+  }
+  return JXL_FAILURE("Invalid ANS stream");
+}
+
+namespace {
+
+Status DecodeCoeffOrder(AcStrategy acs, coeff_order_t* order, BitReader* br,
+                        ANSSymbolReader* reader,
+                        std::vector<coeff_order_t>& natural_order,
+                        const std::vector<uint8_t>& context_map) {
+  PROFILER_FUNC;
+  const size_t num_llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+  const size_t num_coeffs = kDCTBlockSize * num_llf;
+  // The first `num_llf` entries are passed as the permutation's skip count.
+  JXL_RETURN_IF_ERROR(
+      ReadPermutation(num_llf, num_coeffs, order, br, reader, context_map));
+  // Null `order`: symbols are consumed only. Else map through natural order.
+  for (size_t k = 0; order != nullptr && k < num_coeffs; ++k) {
+    order[k] = natural_order[order[k]];
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs,
+                         coeff_order_t* order, BitReader* br) {
+  uint16_t computed = 0;  // Bitmask of order buckets already processed.
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  std::unique_ptr<ANSSymbolReader> reader;  // Stays null if used_orders == 0.
+  std::vector<coeff_order_t> natural_order;  // Scratch, grown on demand.
+  // Bitstream does not have histograms if no coefficient order is used.
+  if (used_orders != 0) {
+    JXL_RETURN_IF_ERROR(
+        DecodeHistograms(br, kPermutationContexts, &code, &context_map));
+    reader = make_unique<ANSSymbolReader>(&code, br);
+  }
+  uint32_t acs_mask = 0;  // Bit per order bucket hit by a strategy in used_acs.
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    if ((used_acs & (1 << o)) == 0) continue;
+    acs_mask |= 1 << kStrategyOrder[o];
+  }
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    uint8_t ord = kStrategyOrder[o];
+    if (computed & (1 << ord)) continue;  // Each bucket is handled only once.
+    computed |= 1 << ord;
+    AcStrategy acs = AcStrategy::FromRawStrategy(o);  // First strategy in bucket.
+    bool used = (acs_mask & (1 << ord)) != 0;
+
+    const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+    const size_t size = kDCTBlockSize * llf;
+
+    if (used || (used_orders & (1 << ord))) {  // Needed by both branches below.
+      if (natural_order.size() < size) natural_order.resize(size);
+      acs.ComputeNaturalCoeffOrder(natural_order.data());
+    }
+
+    if ((used_orders & (1 << ord)) == 0) {  // No order coded for this bucket.
+      // No need to set the default order if no ACS uses this order.
+      if (used) {
+        for (size_t c = 0; c < 3; c++) {  // Same natural order for all three c.
+          memcpy(&order[CoeffOrderOffset(ord, c)], natural_order.data(),
+                 size * sizeof(*order));
+        }
+      }
+    } else {
+      for (size_t c = 0; c < 3; c++) {  // Null dest still consumes the stream.
+        coeff_order_t* dest = used ? &order[CoeffOrderOffset(ord, c)] : nullptr;
+        JXL_RETURN_IF_ERROR(DecodeCoeffOrder(acs, dest, br, reader.get(),
+                                             natural_order, context_map));
+      }
+    }
+  }
+  if (used_orders && !reader->CheckANSFinalState()) {  // reader set iff nonzero.
+    return JXL_FAILURE("Invalid ANS stream");
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order.h b/third_party/jpeg-xl/lib/jxl/coeff_order.h
new file mode 100644
index 0000000000..fb32499f2f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order.h
@@ -0,0 +1,64 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COEFF_ORDER_H_
+#define LIB_JXL_COEFF_ORDER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+
+namespace jxl {
+
+class BitReader;
+
+// Those offsets get multiplied by kDCTBlockSize.
+static constexpr size_t kCoeffOrderOffset[] = {
+    0,    1,    2,    3,    4,    5,    6,    10,   14,   18,
+    34,   50,   66,   68,   70,   72,   76,   80,   84,   92,
+    100,  108,  172,  236,  300,  332,  364,  396,  652,  908,
+    1164, 1292, 1420, 1548, 2572, 3596, 4620, 5132, 5644, 6156,
+};
+static_assert(3 * kNumOrders + 1 ==
+                  sizeof(kCoeffOrderOffset) / sizeof(*kCoeffOrderOffset),
+              "Update this array when adding or removing order types.");
+
+static constexpr size_t CoeffOrderOffset(size_t order, size_t c) {
+  return kDCTBlockSize * kCoeffOrderOffset[order * 3 + c];  // 3 slots per order
+}
+
+static constexpr size_t kCoeffOrderMaxSize =
+    kCoeffOrderOffset[3 * kNumOrders] * kDCTBlockSize;
+
+// Mapping from AC strategy to order bucket. Strategies with different natural
+// orders must have different buckets.
+constexpr uint8_t kStrategyOrder[] = {
+    0, 1, 1, 1, 2, 3, 4, 4, 5,  5,  6,  6,  1,  1,
+    1, 1, 1, 1, 7, 8, 8, 9, 10, 10, 11, 12, 12,
+};
+
+static_assert(AcStrategy::kNumValidStrategies ==
+                  sizeof(kStrategyOrder) / sizeof(*kStrategyOrder),
+              "Update this array when adding or removing AC strategies.");
+
+constexpr uint32_t kPermutationContexts = 8;
+
+uint32_t CoeffOrderContext(uint32_t val);
+
+Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs,
+                         coeff_order_t* order, BitReader* br);
+
+Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order,
+                         BitReader* br);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COEFF_ORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h b/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h
new file mode 100644
index 0000000000..26306575c1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COEFF_ORDER_FWD_H_
+#define LIB_JXL_COEFF_ORDER_FWD_H_
+
+// Breaks circular dependency between ac_strategy and coeff_order.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Needs at least 16 bits. A 32-bit type speeds up DecodeAC by 2% at the cost of
+// more memory.
+using coeff_order_t = uint32_t;
+
+// Maximum number of orders to be used. Note that this needs to be multiplied by
+// the number of channels. One per "size class" (plus one extra for DCT8),
+// shared between transforms of size XxY and of size YxX.
+constexpr uint8_t kNumOrders = 13;
+
+// DCT coefficients are laid out in such a way that the number of rows of
+// coefficients is always the smaller coordinate.
+JXL_INLINE constexpr size_t CoefficientRows(size_t rows, size_t columns) {
+  return columns > rows ? rows : columns;  // The smaller of the two.
+}
+
+JXL_INLINE constexpr size_t CoefficientColumns(size_t rows, size_t columns) {
+  return rows >= columns ? rows : columns;  // The larger of the two.
+}
+
+JXL_INLINE void CoefficientLayout(size_t* JXL_RESTRICT rows,
+                                  size_t* JXL_RESTRICT columns) {
+  const size_t in_rows = *rows;  // Read both inputs before writing either.
+  const size_t in_columns = *columns;
+  *columns = CoefficientColumns(in_rows, in_columns);
+  *rows = CoefficientRows(in_rows, in_columns);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COEFF_ORDER_FWD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc b/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc
new file mode 100644
index 0000000000..6fa0775697
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc
@@ -0,0 +1,97 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/coeff_order.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <numeric>  // iota
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void RoundtripPermutation(coeff_order_t* perm, coeff_order_t* out, size_t len,
+                          size_t* size) {  // *size: size of the written span.
+  BitWriter writer;
+  EncodePermutation(perm, 0, len, &writer, 0, nullptr);
+  writer.ZeroPadToByte();
+  Status status = true;
+  {
+    BitReader reader(writer.GetSpan());
+    BitReaderScopedCloser closer(&reader, &status);  // Ends with this scope.
+    ASSERT_TRUE(DecodePermutation(0, len, out, &reader));
+  }
+  ASSERT_TRUE(status);  // Checked only after the closer has gone out of scope.
+  *size = writer.GetSpan().size();
+}
+
+enum Permutation { kIdentity, kFewSwaps, kFewSlides, kRandom };
+
+constexpr size_t kSwaps = 32;
+
+void TestPermutation(Permutation kind, size_t len) {
+  std::vector<coeff_order_t> perm(len);
+  std::iota(perm.begin(), perm.end(), 0);  // Start from the identity 0..len-1.
+  Rng rng(0);  // Constructed with constant 0 (presumably a fixed seed).
+  if (kind == kFewSwaps) {
+    for (size_t i = 0; i < kSwaps; i++) {  // kSwaps random transpositions.
+      size_t a = rng.UniformU(0, len - 1);
+      size_t b = rng.UniformU(0, len - 1);
+      std::swap(perm[a], perm[b]);
+    }
+  }
+  if (kind == kFewSlides) {
+    for (size_t i = 0; i < kSwaps; i++) {  // kSwaps random range rotations.
+      size_t a = rng.UniformU(0, len - 1);
+      size_t b = rng.UniformU(0, len - 1);
+      size_t from = std::min(a, b);
+      size_t to = std::max(a, b);
+      size_t start = perm[from];
+      for (size_t j = from; j < to; j++) {  // Shift perm(from, to] left by one.
+        perm[j] = perm[j + 1];
+      }
+      perm[to] = start;  // The old first element lands at the end.
+    }
+  }
+  if (kind == kRandom) {
+    rng.Shuffle(perm.data(), perm.size());
+  }
+  std::vector<coeff_order_t> out(len);
+  size_t size = 0;
+  RoundtripPermutation(perm.data(), out.data(), len, &size);
+  for (size_t idx = 0; idx < len; idx++) {  // Decoded output must equal input.
+    EXPECT_EQ(perm[idx], out[idx]);
+  }
+  printf("Encoded size: %" PRIuS "\n", size);
+}
+
+TEST(CoeffOrderTest, IdentitySmall) { TestPermutation(kIdentity, 256); }
+TEST(CoeffOrderTest, FewSlidesSmall) { TestPermutation(kFewSlides, 256); }
+TEST(CoeffOrderTest, FewSwapsSmall) { TestPermutation(kFewSwaps, 256); }
+TEST(CoeffOrderTest, RandomSmall) { TestPermutation(kRandom, 256); }
+
+TEST(CoeffOrderTest, IdentityMedium) { TestPermutation(kIdentity, 1 << 12); }
+TEST(CoeffOrderTest, FewSlidesMedium) { TestPermutation(kFewSlides, 1 << 12); }
+TEST(CoeffOrderTest, FewSwapsMedium) { TestPermutation(kFewSwaps, 1 << 12); }
+TEST(CoeffOrderTest, RandomMedium) { TestPermutation(kRandom, 1 << 12); }
+
+TEST(CoeffOrderTest, IdentityBig) { TestPermutation(kIdentity, 1 << 16); }
+TEST(CoeffOrderTest, FewSlidesBig) { TestPermutation(kFewSlides, 1 << 16); }
+TEST(CoeffOrderTest, FewSwapsBig) { TestPermutation(kFewSwaps, 1 << 16); }
+TEST(CoeffOrderTest, RandomBig) { TestPermutation(kRandom, 1 << 16); }
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc
new file mode 100644
index 0000000000..e496accfed
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc
@@ -0,0 +1,753 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_encoding_internal.h"
+
+#include <errno.h>
+
+#include <array>
+#include <cmath>
+
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/matrix_ops.h"
+
+namespace jxl {
+namespace {
+
+// Highest reasonable value for the gamma of a transfer curve.
+constexpr uint32_t kMaxGamma = 8192;
+
+// These strings are baked into Description - do not change.
+
+std::string ToString(ColorSpace color_space) {  // Three-character tag.
+  switch (color_space) {
+    case ColorSpace::kRGB:
+      return "RGB";
+    case ColorSpace::kGray:
+      return "Gra";
+    case ColorSpace::kXYB:
+      return "XYB";
+    case ColorSpace::kUnknown:
+      return "CS?";
+  }
+  // Reached only when no case matched; the visitor fails on invalid enums.
+  JXL_ABORT("Invalid ColorSpace %u", static_cast<uint32_t>(color_space));
+}
+
+std::string ToString(WhitePoint white_point) {  // Three-character tag.
+  switch (white_point) {
+    case WhitePoint::kD65:
+      return "D65";
+    case WhitePoint::kCustom:
+      return "Cst";
+    case WhitePoint::kE:
+      return "EER";
+    case WhitePoint::kDCI:
+      return "DCI";
+  }
+  // Reached only when no case matched; the visitor fails on invalid enums.
+  JXL_ABORT("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
+}
+
+std::string ToString(Primaries primaries) {  // Three-character tag.
+  switch (primaries) {
+    case Primaries::kSRGB:
+      return "SRG";
+    case Primaries::k2100:
+      return "202";
+    case Primaries::kP3:
+      return "DCI";  // Note: kP3 is tagged "DCI", not "P3".
+    case Primaries::kCustom:
+      return "Cst";
+  }
+  // Reached only when no case matched; the visitor fails on invalid enums.
+  JXL_ABORT("Invalid Primaries %u", static_cast<uint32_t>(primaries));
+}
+
+std::string ToString(TransferFunction transfer_function) {  // 3-char tag.
+  switch (transfer_function) {
+    case TransferFunction::kSRGB:
+      return "SRG";
+    case TransferFunction::kLinear:
+      return "Lin";
+    case TransferFunction::k709:
+      return "709";
+    case TransferFunction::kPQ:
+      return "PeQ";
+    case TransferFunction::kHLG:
+      return "HLG";
+    case TransferFunction::kDCI:
+      return "DCI";
+    case TransferFunction::kUnknown:
+      return "TF?";
+  }
+  // Reached only when no case matched; the visitor fails on invalid enums.
+  JXL_ABORT("Invalid TransferFunction %u",
+            static_cast<uint32_t>(transfer_function));
+}
+
+std::string ToString(RenderingIntent rendering_intent) {  // 3-char tag.
+  switch (rendering_intent) {
+    case RenderingIntent::kPerceptual:
+      return "Per";
+    case RenderingIntent::kRelative:
+      return "Rel";
+    case RenderingIntent::kSaturation:
+      return "Sat";
+    case RenderingIntent::kAbsolute:
+      return "Abs";
+  }
+  // Reached only when no case matched; the visitor fails on invalid enums.
+  JXL_ABORT("Invalid RenderingIntent %u",
+            static_cast<uint32_t>(rendering_intent));
+}
+
+static double F64FromCustomxyI32(const int32_t i) { return 1E-6 * i; }
+static Status F64ToCustomxyI32(const double f, int32_t* JXL_RESTRICT i) {
+  // Phrased as an in-range test so that NaN fails it and is rejected as well.
+  const bool in_range = (f >= -4) && (f <= 4);
+  if (!in_range) return JXL_FAILURE("F64 out of bounds for CustomxyI32");
+  *i = static_cast<int32_t>(roundf(f * 1E6));
+  return true;
+}
+
+Status ConvertExternalToInternalWhitePoint(const JxlWhitePoint external,
+                                           WhitePoint* internal) {
+  switch (external) {  // One case per JXL_WHITE_POINT_* value handled here.
+    case JXL_WHITE_POINT_D65:
+      *internal = WhitePoint::kD65;
+      return true;
+    case JXL_WHITE_POINT_CUSTOM:
+      *internal = WhitePoint::kCustom;
+      return true;
+    case JXL_WHITE_POINT_E:
+      *internal = WhitePoint::kE;
+      return true;
+    case JXL_WHITE_POINT_DCI:
+      *internal = WhitePoint::kDCI;
+      return true;
+  }
+  return JXL_FAILURE("Invalid WhitePoint enum value");  // *internal untouched.
+}
+
+Status ConvertExternalToInternalPrimaries(const JxlPrimaries external,
+                                          Primaries* internal) {
+  switch (external) {  // One case per JXL_PRIMARIES_* value handled here.
+    case JXL_PRIMARIES_SRGB:
+      *internal = Primaries::kSRGB;
+      return true;
+    case JXL_PRIMARIES_CUSTOM:
+      *internal = Primaries::kCustom;
+      return true;
+    case JXL_PRIMARIES_2100:
+      *internal = Primaries::k2100;
+      return true;
+    case JXL_PRIMARIES_P3:
+      *internal = Primaries::kP3;
+      return true;
+  }
+  return JXL_FAILURE("Invalid Primaries enum value");  // *internal untouched.
+}
+
+Status ConvertExternalToInternalTransferFunction(
+    const JxlTransferFunction external, TransferFunction* internal) {
+  switch (external) {  // Every case but GAMMA maps to an internal enumerator.
+    case JXL_TRANSFER_FUNCTION_709:
+      *internal = TransferFunction::k709;
+      return true;
+    case JXL_TRANSFER_FUNCTION_UNKNOWN:
+      *internal = TransferFunction::kUnknown;
+      return true;
+    case JXL_TRANSFER_FUNCTION_LINEAR:
+      *internal = TransferFunction::kLinear;
+      return true;
+    case JXL_TRANSFER_FUNCTION_SRGB:
+      *internal = TransferFunction::kSRGB;
+      return true;
+    case JXL_TRANSFER_FUNCTION_PQ:
+      *internal = TransferFunction::kPQ;
+      return true;
+    case JXL_TRANSFER_FUNCTION_DCI:
+      *internal = TransferFunction::kDCI;
+      return true;
+    case JXL_TRANSFER_FUNCTION_HLG:
+      *internal = TransferFunction::kHLG;
+      return true;
+    case JXL_TRANSFER_FUNCTION_GAMMA:  // Deliberately an error in this helper.
+      return JXL_FAILURE("Gamma should be handled separately");
+  }
+  return JXL_FAILURE("Invalid TransferFunction enum value");  // No case hit.
+}
+
+Status ConvertExternalToInternalRenderingIntent(
+    const JxlRenderingIntent external, RenderingIntent* internal) {
+  switch (external) {  // One case per JXL_RENDERING_INTENT_* value handled.
+    case JXL_RENDERING_INTENT_PERCEPTUAL:
+      *internal = RenderingIntent::kPerceptual;
+      return true;
+    case JXL_RENDERING_INTENT_RELATIVE:
+      *internal = RenderingIntent::kRelative;
+      return true;
+    case JXL_RENDERING_INTENT_SATURATION:
+      *internal = RenderingIntent::kSaturation;
+      return true;
+    case JXL_RENDERING_INTENT_ABSOLUTE:
+      *internal = RenderingIntent::kAbsolute;
+      return true;
+  }
+  return JXL_FAILURE("Invalid RenderingIntent enum value");  // No case hit.
+}
+
+}  // namespace
+
+CIExy Customxy::Get() const {
+  CIExy result;  // Stored integers converted back to floating point.
+  result.y = F64FromCustomxyI32(y);
+  result.x = F64FromCustomxyI32(x);
+  return result;
+}
+
+Status Customxy::Set(const CIExy& xy) {
+  JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.x, &x));
+  JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.y, &y));
+  size_t num_extension_bits, num_total_bits;  // Outputs are not used further.
+  if (Bundle::CanEncode(*this, &num_extension_bits, &num_total_bits)) {
+    return true;
+  }
+  return JXL_FAILURE("Unable to encode XY %f %f", xy.x, xy.y);
+}
+
+bool CustomTransferFunction::SetImplicit() {
+  // Only XYB gets an implied transfer function here: gamma 1/3.
+  const bool is_xyb = nonserialized_color_space == ColorSpace::kXYB;
+  if (!is_xyb) return false;
+  if (!SetGamma(1.0 / 3)) JXL_ASSERT(false);
+  return true;
+}
+
+Status CustomTransferFunction::SetGamma(double gamma) {
+  // Negated in-range form so NaN is rejected too (every NaN comparison fails).
+  if (!((1.0f / kMaxGamma) <= gamma && gamma <= 1.0)) {
+    return JXL_FAILURE("Invalid gamma %f", gamma);
+  }
+  have_gamma_ = false;  // Stays false when a named curve below matches.
+  if (ApproxEq(gamma, 1.0)) {
+    transfer_function_ = TransferFunction::kLinear;
+    return true;
+  }
+  if (ApproxEq(gamma, 1.0 / 2.6)) {
+    transfer_function_ = TransferFunction::kDCI;
+    return true;
+  }
+  // Don't translate 0.45.. to kSRGB nor k709 - that might change pixel
+  // values because those curves also have a linear part.
+
+  have_gamma_ = true;  // Any other valid gamma is stored as a scaled integer.
+  gamma_ = roundf(gamma * kGammaMul);
+  transfer_function_ = TransferFunction::kUnknown;
+  return true;
+}
+
+namespace {
+
+std::array<ColorEncoding, 2> CreateC2(const Primaries pr,
+                                      const TransferFunction tf) {
+  std::array<ColorEncoding, 2> pair;  // [0] = RGB, [1] = grayscale.
+
+  {
+    ColorEncoding& rgb = pair[0];
+    rgb.SetColorSpace(ColorSpace::kRGB);
+    rgb.white_point = WhitePoint::kD65;
+    rgb.primaries = pr;
+    rgb.tf.SetTransferFunction(tf);
+    JXL_CHECK(rgb.CreateICC());
+  }
+
+  {
+    ColorEncoding& gray = pair[1];
+    gray.SetColorSpace(ColorSpace::kGray);
+    gray.white_point = WhitePoint::kD65;
+    gray.primaries = pr;
+    gray.tf.SetTransferFunction(tf);
+    JXL_CHECK(gray.CreateICC());
+  }
+
+  return pair;
+}
+
+}  // namespace
+
+const ColorEncoding& ColorEncoding::SRGB(bool is_gray) {
+  static std::array<ColorEncoding, 2> encodings =
+      CreateC2(Primaries::kSRGB, TransferFunction::kSRGB);
+  return encodings[is_gray ? 1 : 0];  // Slot 0 is RGB, slot 1 is grayscale.
+}
+const ColorEncoding& ColorEncoding::LinearSRGB(bool is_gray) {
+  static std::array<ColorEncoding, 2> encodings =
+      CreateC2(Primaries::kSRGB, TransferFunction::kLinear);
+  return encodings[is_gray ? 1 : 0];  // Slot 0 is RGB, slot 1 is grayscale.
+}
+
+CIExy ColorEncoding::GetWhitePoint() const {
+  JXL_DASSERT(have_fields_);
+  CIExy xy;
+  switch (white_point) {
+    case WhitePoint::kCustom:
+      return white_.Get();  // Stored custom coordinates.
+
+    case WhitePoint::kD65:
+      xy.x = 0.3127;
+      xy.y = 0.3290;
+      return xy;
+
+    case WhitePoint::kDCI:
+      // From https://ieeexplore.ieee.org/document/7290729 C.2 page 11
+      xy.x = 0.314;
+      xy.y = 0.351;
+      return xy;
+
+    case WhitePoint::kE:
+      xy.x = xy.y = 1.0 / 3;
+      return xy;
+  }
+  JXL_ABORT("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
+}
+
+Status ColorEncoding::SetWhitePoint(const CIExy& xy) {
+  JXL_DASSERT(have_fields_);
+  if (xy.x == 0.0 || xy.y == 0.0) {  // A zero coordinate is rejected outright.
+    return JXL_FAILURE("Invalid white point %f %f", xy.x, xy.y);
+  }
+  if (ApproxEq(xy.x, 0.3127) && ApproxEq(xy.y, 0.3290)) {  // Matches D65.
+    white_point = WhitePoint::kD65;
+    return true;
+  }
+  if (ApproxEq(xy.x, 1.0 / 3) && ApproxEq(xy.y, 1.0 / 3)) {  // Matches E.
+    white_point = WhitePoint::kE;
+    return true;
+  }
+  if (ApproxEq(xy.x, 0.314) && ApproxEq(xy.y, 0.351)) {  // Matches DCI.
+    white_point = WhitePoint::kDCI;
+    return true;
+  }
+  white_point = WhitePoint::kCustom;  // Set even if storing xy fails below.
+  return white_.Set(xy);
+}
+
+PrimariesCIExy ColorEncoding::GetPrimaries() const {
+  JXL_DASSERT(have_fields_);
+  JXL_ASSERT(HasPrimaries());
+  PrimariesCIExy xy;
+  switch (primaries) {
+    case Primaries::kCustom:  // Stored custom coordinates.
+      xy.r = red_.Get();
+      xy.g = green_.Get();
+      xy.b = blue_.Get();
+      return xy;
+
+    case Primaries::kSRGB:  // Not the rounded 0.64/0.33/... SetPrimaries matches.
+      xy.r.x = 0.639998686;
+      xy.r.y = 0.330010138;
+      xy.g.x = 0.300003784;
+      xy.g.y = 0.600003357;
+      xy.b.x = 0.150002046;
+      xy.b.y = 0.059997204;
+      return xy;
+
+    case Primaries::k2100:
+      xy.r.x = 0.708;
+      xy.r.y = 0.292;
+      xy.g.x = 0.170;
+      xy.g.y = 0.797;
+      xy.b.x = 0.131;
+      xy.b.y = 0.046;
+      return xy;
+
+    case Primaries::kP3:
+      xy.r.x = 0.680;
+      xy.r.y = 0.320;
+      xy.g.x = 0.265;
+      xy.g.y = 0.690;
+      xy.b.x = 0.150;
+      xy.b.y = 0.060;
+      return xy;
+  }
+  JXL_ABORT("Invalid Primaries %u", static_cast<uint32_t>(primaries));
+}
+
+Status ColorEncoding::SetPrimaries(const PrimariesCIExy& xy) {
+  JXL_DASSERT(have_fields_);
+  JXL_ASSERT(HasPrimaries());
+  // Any coordinate that is exactly zero is rejected outright.
+  const bool has_zero = xy.r.x == 0.0 || xy.r.y == 0.0 || xy.g.x == 0.0 ||
+                        xy.g.y == 0.0 || xy.b.x == 0.0 || xy.b.y == 0.0;
+  if (has_zero) {
+    return JXL_FAILURE("Invalid primaries %f %f %f %f %f %f", xy.r.x, xy.r.y,
+                       xy.g.x, xy.g.y, xy.b.x, xy.b.y);
+  }
+
+  // True if all six coordinates are ApproxEq to the given reference values.
+  const auto is_near = [&xy](double rx, double ry, double gx, double gy,
+                             double bx, double by) {
+    return ApproxEq(xy.r.x, rx) && ApproxEq(xy.r.y, ry) &&
+           ApproxEq(xy.g.x, gx) && ApproxEq(xy.g.y, gy) &&
+           ApproxEq(xy.b.x, bx) && ApproxEq(xy.b.y, by);
+  };
+
+  // Named primaries are tried first; custom coordinates are the fallback.
+  if (is_near(0.64, 0.33, 0.30, 0.60, 0.15, 0.06)) {
+    primaries = Primaries::kSRGB;
+  } else if (is_near(0.708, 0.292, 0.170, 0.797, 0.131, 0.046)) {
+    primaries = Primaries::k2100;
+  } else if (is_near(0.680, 0.320, 0.265, 0.690, 0.150, 0.060)) {
+    primaries = Primaries::kP3;
+  } else {
+    primaries = Primaries::kCustom;
+    JXL_RETURN_IF_ERROR(red_.Set(xy.r));
+    JXL_RETURN_IF_ERROR(green_.Set(xy.g));
+    JXL_RETURN_IF_ERROR(blue_.Set(xy.b));
+  }
+
+  return true;
+}
+
+Status ColorEncoding::CreateICC() {
+  // Discard the cached profile first, then try to rebuild it from the fields;
+  // icc_ is handed to MaybeCreateProfile() as the output buffer.
+  InternalRemoveICC();
+  if (MaybeCreateProfile(*this, &icc_)) return true;
+  return JXL_FAILURE("Failed to create profile from fields");
+}
+
+std::string Description(const ColorEncoding& c_in) {
+  // Copy required for Implicit*
+  ColorEncoding enc = c_in;
+  // Sections are joined by '_'; custom xy coordinates inside one by ';'.
+  std::string text = ToString(enc.GetColorSpace());
+
+  if (!enc.ImplicitWhitePoint()) {
+    text += '_';
+    if (enc.white_point != WhitePoint::kCustom) {
+      text += ToString(enc.white_point);
+    } else {
+      const CIExy white = enc.GetWhitePoint();
+      text += ToString(white.x) + ';';
+      text += ToString(white.y);
+    }
+  }
+
+  if (enc.HasPrimaries()) {
+    text += '_';
+    if (enc.primaries != Primaries::kCustom) {
+      text += ToString(enc.primaries);
+    } else {
+      const PrimariesCIExy prim = enc.GetPrimaries();
+      text += ToString(prim.r.x) + ';';
+      text += ToString(prim.r.y) + ';';
+      text += ToString(prim.g.x) + ';';
+      text += ToString(prim.g.y) + ';';
+      text += ToString(prim.b.x) + ';';
+      text += ToString(prim.b.y);
+    }
+  }
+
+  text += '_';
+  text += ToString(enc.rendering_intent);
+
+  // The transfer function is left out when tf.SetImplicit() reports it.
+  if (!enc.tf.SetImplicit()) {
+    text += '_';
+    if (enc.tf.IsGamma()) {
+      text += 'g';
+      text += ToString(enc.tf.GetGamma());
+    } else {
+      text += ToString(enc.tf.GetTransferFunction());
+    }
+  }
+  return text;
+}
+
+Customxy::Customxy() { Bundle::Init(this); }
+Status Customxy::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  // x and y are coded the same way: sign-packed, then one U32 field each.
+  const auto visit_coord = [visitor](int32_t* coord) -> Status {
+    uint32_t packed = PackSigned(*coord);
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288),
+                                           BitsOffset(20, 1048576),
+                                           BitsOffset(21, 2097152), 0, &packed));
+    *coord = UnpackSigned(packed);
+    return true;
+  };
+  JXL_QUIET_RETURN_IF_ERROR(visit_coord(&x));
+  return visit_coord(&y);
+}
+
+CustomTransferFunction::CustomTransferFunction() { Bundle::Init(this); }
+Status CustomTransferFunction::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->Conditional(!SetImplicit())) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_gamma_));
+    // have_gamma_ chooses between a numeric gamma and an enumerated value.
+    if (visitor->Conditional(have_gamma_)) {
+      // Gamma is stored as a 24-bit int; the exponent used is gamma_ / 1e7
+      // and must lie in (0, 1]. It is also bounded from below: values with
+      // gamma_ * kMaxGamma < 1e7 (exponent < 1 / kMaxGamma) are rejected.
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(24, kGammaMul, &gamma_));
+      if (gamma_ > kGammaMul ||
+          static_cast<uint64_t>(gamma_) * kMaxGamma < kGammaMul) {
+        return JXL_FAILURE("Invalid gamma %u", gamma_);
+      }
+    }
+
+    if (visitor->Conditional(!have_gamma_)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Enum(TransferFunction::kSRGB, &transfer_function_));
+    }
+  }
+
+  return true;
+}
+
+ColorEncoding::ColorEncoding() { Bundle::Init(this); }
+Status ColorEncoding::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &want_icc_));
+
+  // Always send even if want_icc_ because this affects decoding.
+  // We can skip the white point/primaries because they do not.
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ColorSpace::kRGB, &color_space_));
+
+  if (visitor->Conditional(!WantICC())) {
+    // Serialize enums. NOTE: we set the defaults to the most common values so
+    // ImageMetadata.all_default is true in the common case.
+
+    if (visitor->Conditional(!ImplicitWhitePoint())) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(WhitePoint::kD65, &white_point));
+      if (visitor->Conditional(white_point == WhitePoint::kCustom)) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&white_));
+      }
+    }
+    // Primaries exist only when HasPrimaries(), i.e. not for gray or XYB.
+    if (visitor->Conditional(HasPrimaries())) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(Primaries::kSRGB, &primaries));
+      if (visitor->Conditional(primaries == Primaries::kCustom)) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&red_));
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&green_));
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blue_));
+      }
+    }
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tf));
+
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->Enum(RenderingIntent::kRelative, &rendering_intent));
+
+    // We didn't have ICC, so all fields should be known.
+    if (color_space_ == ColorSpace::kUnknown || tf.IsUnknown()) {
+      return JXL_FAILURE(
+          "No ICC but cs %u and tf %u%s",
+          static_cast<unsigned int>(color_space_),
+          tf.IsGamma() ? 0
+                       : static_cast<unsigned int>(tf.GetTransferFunction()),
+          tf.IsGamma() ? "(gamma)" : "");
+    }
+    // Rebuild the ICC profile so that ICC() matches the fields just visited.
+    JXL_RETURN_IF_ERROR(CreateICC());
+  }
+  // An ICC profile must be available now, unless it is still to be read.
+  if (WantICC() && visitor->IsReading()) {
+    // Haven't called SetICC() yet, do nothing.
+  } else {
+    if (ICC().empty()) return JXL_FAILURE("Empty ICC");
+  }
+
+  return true;
+}
+
+void ConvertInternalToExternalColorEncoding(const ColorEncoding& internal,
+                                            JxlColorEncoding* external) {
+  // Enum-valued fields are converted by numeric value via static_cast.
+  external->color_space = static_cast<JxlColorSpace>(internal.GetColorSpace());
+  external->white_point = static_cast<JxlWhitePoint>(internal.white_point);
+  // The xy coordinates are filled in for named white points as well.
+  const jxl::CIExy wp = internal.GetWhitePoint();
+  external->white_point_xy[0] = wp.x;
+  external->white_point_xy[1] = wp.y;
+
+  // Primaries are only written for RGB and unknown color spaces.
+  const bool write_primaries =
+      external->color_space == JXL_COLOR_SPACE_RGB ||
+      external->color_space == JXL_COLOR_SPACE_UNKNOWN;
+  if (write_primaries) {
+    external->primaries = static_cast<JxlPrimaries>(internal.primaries);
+    const jxl::PrimariesCIExy pr = internal.GetPrimaries();
+    external->primaries_red_xy[0] = pr.r.x;
+    external->primaries_red_xy[1] = pr.r.y;
+    external->primaries_green_xy[0] = pr.g.x;
+    external->primaries_green_xy[1] = pr.g.y;
+    external->primaries_blue_xy[0] = pr.b.x;
+    external->primaries_blue_xy[1] = pr.b.y;
+  }
+
+  const bool is_gamma = internal.tf.IsGamma();
+  external->transfer_function =
+      is_gamma ? JXL_TRANSFER_FUNCTION_GAMMA
+               : static_cast<JxlTransferFunction>(
+                     internal.tf.GetTransferFunction());
+  external->gamma = is_gamma ? internal.tf.GetGamma() : 0;
+  external->rendering_intent =
+      static_cast<JxlRenderingIntent>(internal.rendering_intent);
+}
+
+Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external,
+                                              ColorEncoding* internal) {
+  internal->SetColorSpace(static_cast<ColorSpace>(external.color_space));
+  // Custom coordinates go through the setters, which validate them.
+  JXL_RETURN_IF_ERROR(ConvertExternalToInternalWhitePoint(
+      external.white_point, &internal->white_point));
+  if (external.white_point == JXL_WHITE_POINT_CUSTOM) {
+    CIExy white;
+    white.x = external.white_point_xy[0];
+    white.y = external.white_point_xy[1];
+    JXL_RETURN_IF_ERROR(internal->SetWhitePoint(white));
+  }
+
+  const bool has_primaries = external.color_space == JXL_COLOR_SPACE_RGB ||
+                             external.color_space == JXL_COLOR_SPACE_UNKNOWN;
+  if (has_primaries) {
+    JXL_RETURN_IF_ERROR(ConvertExternalToInternalPrimaries(
+        external.primaries, &internal->primaries));
+    if (external.primaries == JXL_PRIMARIES_CUSTOM) {
+      PrimariesCIExy custom;
+      custom.r.x = external.primaries_red_xy[0];
+      custom.r.y = external.primaries_red_xy[1];
+      custom.g.x = external.primaries_green_xy[0];
+      custom.g.y = external.primaries_green_xy[1];
+      custom.b.x = external.primaries_blue_xy[0];
+      custom.b.y = external.primaries_blue_xy[1];
+      JXL_RETURN_IF_ERROR(internal->SetPrimaries(custom));
+    }
+  }
+  CustomTransferFunction transfer;
+  transfer.nonserialized_color_space = internal->GetColorSpace();
+  if (external.transfer_function != JXL_TRANSFER_FUNCTION_GAMMA) {
+    // The enum conversion below does not handle JXL_TRANSFER_FUNCTION_GAMMA,
+    // since there is no internal enum value for it.
+    TransferFunction function;
+    JXL_RETURN_IF_ERROR(ConvertExternalToInternalTransferFunction(
+        external.transfer_function, &function));
+    transfer.SetTransferFunction(function);
+  } else {
+    JXL_RETURN_IF_ERROR(transfer.SetGamma(external.gamma));
+  }
+  internal->tf = transfer;
+  JXL_RETURN_IF_ERROR(ConvertExternalToInternalRenderingIntent(
+      external.rendering_intent, &internal->rendering_intent));
+
+  // The ColorEncoding caches an ICC profile it created earlier that may no
+  // longer match the profile with the changed fields, so re-create it.
+  if (!(internal->CreateICC())) {
+    // This is not an error: for example, it doesn't have ICC profile creation
+    // implemented for XYB. This should not be returned as error, since
+    // ConvertExternalToInternalColorEncoding still worked correctly, and what
+    // matters is that internal->ICC() will not return the wrong profile.
+  }
+
+  return true;
+}
+
+/* Bradford chromatic adaptation matrix and its inverse, 3x3 as 9 floats. */
+static const float kBradford[9] = {
+    0.8951f, 0.2664f, -0.1614f, -0.7502f, 1.7135f,
+    0.0367f, 0.0389f, -0.0685f, 1.0296f,
+};
+
+static const float kBradfordInv[9] = {
+    0.9869929f, -0.1470543f, 0.1599627f, 0.4323053f, 0.5183603f,
+    0.0492912f, -0.0085287f, 0.0400428f, 0.9684867f,
+};
+
+// Computes the 3x3 Bradford adaptation matrix from white (wx, wy) to D50.
+Status AdaptToXYZD50(float wx, float wy, float matrix[9]) {
+  if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) {
+    // Out-of-range values can cause division by zero
+    // further down with the Bradford adaptation too.
+    return JXL_FAILURE("Invalid white point");
+  }
+  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
+  // 1 / tiny float can still overflow
+  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
+  float w50[3] = {0.96422f, 1.0f, 0.82521f};
+  // w is the source white as XYZ with Y = 1; w50 is the D50 target.
+  float lms[3];
+  float lms50[3];
+
+  Mul3x3Vector(kBradford, w, lms);
+  Mul3x3Vector(kBradford, w50, lms50);
+
+  if (lms[0] == 0 || lms[1] == 0 || lms[2] == 0) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  float a[9] = {
+      //       /----> 0, 1, 2, 3,          /----> 4, 5, 6, 7,          /----> 8,
+      lms50[0] / lms[0], 0, 0, 0, lms50[1] / lms[1], 0, 0, 0, lms50[2] / lms[2],
+  };
+  if (!std::isfinite(a[0]) || !std::isfinite(a[4]) || !std::isfinite(a[8])) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  // Presumably matrix = kBradfordInv * a * kBradford (helper not shown).
+  float b[9];
+  Mul3x3Matrix(a, kBradford, b);
+  Mul3x3Matrix(kBradfordInv, b, matrix);
+
+  return true;
+}
+
+Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
+                      float by, float wx, float wy, float matrix[9]) {
+  if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  // TODO(lode): also require rx, ry, gx, gy, bx, by to be in range 0-1? ICC
+  // profiles in theory forbid negative XYZ values, but in practice the ACES P0
+  // color space uses a negative y for the blue primary.
+  float primaries[9] = {
+      rx, gx, bx, ry, gy, by, 1.0f - rx - ry, 1.0f - gx - gy, 1.0f - bx - by};
+  float primaries_inv[9];
+  memcpy(primaries_inv, primaries, sizeof(float) * 9);
+  JXL_RETURN_IF_ERROR(Inv3x3Matrix(primaries_inv));
+  // NOTE(review): Inv3x3Matrix presumably inverts its argument in place.
+  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
+  // 1 / tiny float can still overflow
+  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
+  float xyz[3];
+  Mul3x3Vector(primaries_inv, w, xyz);
+  // xyz holds one scale factor per primary; they form the diagonal of `a`.
+  float a[9] = {
+      xyz[0], 0, 0, 0, xyz[1], 0, 0, 0, xyz[2],
+  };
+
+  Mul3x3Matrix(primaries, a, matrix);
+  return true;
+}
+
+Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx,
+                         float by, float wx, float wy, float matrix[9]) {
+  // Combine the primaries matrix with the adaptation of (wx, wy) to D50.
+  float to_xyz[9];
+  JXL_RETURN_IF_ERROR(PrimariesToXYZ(rx, ry, gx, gy, bx, by, wx, wy, to_xyz));
+  float adapt[9];
+  JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, adapt));
+  Mul3x3Matrix(adapt, to_xyz, matrix);
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
new file mode 100644
index 0000000000..713f216538
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
@@ -0,0 +1,463 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COLOR_ENCODING_INTERNAL_H_
+#define LIB_JXL_COLOR_ENCODING_INTERNAL_H_
+
+// Metadata for color space conversions.
+
+#include <jxl/color_encoding.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cmath>  // std::abs
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// (All CIE units are for the standard 1931 2 degree observer)
+
+// Color space the color pixel data is encoded in. The color pixel data is
+// 3-channel in all cases except in case of kGray, where it uses only 1 channel.
+// This also determines the number of channels used in modular encoding.
+enum class ColorSpace : uint32_t {
+  // Trichromatic color data. This also includes CMYK if a kBlack
+  // ExtraChannelInfo is present. This implies, if there is an ICC profile, that
+  // the ICC profile uses a 3-channel color space if no kBlack extra channel is
+  // present, or uses color space 'CMYK' if a kBlack extra channel is present.
+  kRGB,
+  // Single-channel data. This implies, if there is an ICC profile, that the ICC
+  // profile also represents single-channel data and has the appropriate color
+  // space ('GRAY').
+  kGray,
+  // Like kRGB, but implies fixed values for primaries etc.
+  kXYB,
+  // For non-RGB/gray data, e.g. from non-electro-optical sensors. Otherwise
+  // the same conditions as kRGB apply.
+  kUnknown
+};
+// Helpers for ColorSpace: its name, and the OR of MakeBit() of all values.
+static inline const char* EnumName(ColorSpace /*unused*/) {
+  return "ColorSpace";
+}
+static inline constexpr uint64_t EnumBits(ColorSpace /*unused*/) {
+  using CS = ColorSpace;
+  return MakeBit(CS::kRGB) | MakeBit(CS::kGray) | MakeBit(CS::kXYB) |
+         MakeBit(CS::kUnknown);
+}
+
+// Values from CICP ColourPrimaries.
+enum class WhitePoint : uint32_t {
+  kD65 = 1,     // sRGB/BT.709/Display P3/BT.2020
+  kCustom = 2,  // Actual values encoded in separate fields
+  kE = 10,      // XYZ
+  kDCI = 11,    // DCI-P3
+};
+// Helpers for WhitePoint: its name, and the OR of MakeBit() of all values.
+static inline const char* EnumName(WhitePoint /*unused*/) {
+  return "WhitePoint";
+}
+static inline constexpr uint64_t EnumBits(WhitePoint /*unused*/) {
+  return MakeBit(WhitePoint::kD65) | MakeBit(WhitePoint::kCustom) |
+         MakeBit(WhitePoint::kE) | MakeBit(WhitePoint::kDCI);
+}
+
+// Values from CICP ColourPrimaries.
+enum class Primaries : uint32_t {
+  kSRGB = 1,    // Same as BT.709
+  kCustom = 2,  // Actual values encoded in separate fields
+  k2100 = 9,    // Same as BT.2020
+  kP3 = 11,     // Same numeric value as WhitePoint::kDCI
+};
+// Helpers for Primaries: its name, and the OR of MakeBit() of all values.
+static inline const char* EnumName(Primaries /*unused*/) { return "Primaries"; }
+static inline constexpr uint64_t EnumBits(Primaries /*unused*/) {
+  using Pr = Primaries;
+  return MakeBit(Pr::kSRGB) | MakeBit(Pr::kCustom) | MakeBit(Pr::k2100) |
+         MakeBit(Pr::kP3);
+}
+
+// Values from CICP TransferCharacteristics.
+enum class TransferFunction : uint32_t {
+  k709 = 1,
+  kUnknown = 2,
+  kLinear = 8,
+  kSRGB = 13,
+  kPQ = 16,   // from BT.2100
+  kDCI = 17,  // from SMPTE RP 431-2 reference projector
+  kHLG = 18,  // from BT.2100
+};
+// Helpers for TransferFunction: its name, and the OR of MakeBit() of all values.
+static inline const char* EnumName(TransferFunction /*unused*/) {
+  return "TransferFunction";
+}
+static inline constexpr uint64_t EnumBits(TransferFunction /*unused*/) {
+  using TF = TransferFunction;
+  return MakeBit(TF::k709) | MakeBit(TF::kLinear) | MakeBit(TF::kSRGB) |
+         MakeBit(TF::kPQ) | MakeBit(TF::kDCI) | MakeBit(TF::kHLG) |
+         MakeBit(TF::kUnknown);
+}
+
+enum class RenderingIntent : uint32_t {
+  // Values match ICC sRGB encodings.
+  kPerceptual = 0,  // good for photos, requires a profile with LUT.
+  kRelative,        // good for logos.
+  kSaturation,      // perhaps useful for CG with fully saturated colors.
+  kAbsolute,        // leaves white point unchanged; good for proofing.
+};
+// Helpers for RenderingIntent: its name, and the OR of MakeBit() of all values.
+static inline const char* EnumName(RenderingIntent /*unused*/) {
+  return "RenderingIntent";
+}
+static inline constexpr uint64_t EnumBits(RenderingIntent /*unused*/) {
+  using RI = RenderingIntent;
+  return MakeBit(RI::kPerceptual) | MakeBit(RI::kRelative) |
+         MakeBit(RI::kSaturation) | MakeBit(RI::kAbsolute);
+}
+
+// Chromaticity (Y is omitted because it is 1 for primaries/white points)
+struct CIExy {
+  double x = 0.0;
+  double y = 0.0;
+};
+// Chromaticities of the red, green and blue primaries, one CIExy each.
+struct PrimariesCIExy {
+  CIExy r;
+  CIExy g;
+  CIExy b;
+};
+
+// Serializable form of CIExy.
+struct Customxy : public Fields {
+  Customxy();
+  JXL_FIELDS_NAME(Customxy)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+  // Get()/Set() convert between these integers and a floating-point CIExy.
+  CIExy Get() const;
+  // Returns false if x or y do not fit in the encoding.
+  Status Set(const CIExy& xy);
+  // Signed integer coordinates, serialized by VisitFields().
+  int32_t x;
+  int32_t y;
+};
+
+struct CustomTransferFunction : public Fields {
+  CustomTransferFunction();
+  JXL_FIELDS_NAME(CustomTransferFunction)
+
+  // Sets fields and returns true if nonserialized_color_space has an implicit
+  // transfer function, otherwise leaves fields unchanged and returns false.
+  bool SetImplicit();
+
+  // Gamma: only used for PNG inputs
+  bool IsGamma() const { return have_gamma_; }
+  double GetGamma() const {
+    JXL_ASSERT(IsGamma());
+    return gamma_ * 1E-7;  // (0, 1]
+  }
+  Status SetGamma(double gamma);
+  // Enumerated transfer function; asserts that no gamma is set.
+  TransferFunction GetTransferFunction() const {
+    JXL_ASSERT(!IsGamma());
+    return transfer_function_;
+  }
+  void SetTransferFunction(const TransferFunction tf) {
+    have_gamma_ = false;
+    transfer_function_ = tf;
+  }
+  // Predicates: true only if no gamma is set and the enum value matches.
+  bool IsUnknown() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kUnknown);
+  }
+  bool IsSRGB() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kSRGB);
+  }
+  bool IsLinear() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kLinear);
+  }
+  bool IsPQ() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kPQ);
+  }
+  bool IsHLG() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kHLG);
+  }
+  bool Is709() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::k709);
+  }
+  bool IsDCI() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kDCI);
+  }
+  bool IsSame(const CustomTransferFunction& other) const {
+    if (have_gamma_ != other.have_gamma_) return false;
+    if (have_gamma_) {
+      if (gamma_ != other.gamma_) return false;
+    } else {
+      if (transfer_function_ != other.transfer_function_) return false;
+    }
+    return true;
+  }
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Must be set before calling VisitFields!
+  ColorSpace nonserialized_color_space = ColorSpace::kRGB;
+
+ private:
+  static constexpr uint32_t kGammaMul = 10000000;
+  // Selects which of the two fields below is in use.
+  bool have_gamma_;
+
+  // OETF exponent to go from linear to gamma-compressed.
+  uint32_t gamma_;  // Only used if have_gamma_.
+
+  // Can be kUnknown.
+  TransferFunction transfer_function_;  // Only used if !have_gamma_.
+};
+
+// Compact encoding of data required to interpret and translate pixels to a
+// known color space. Stored in Metadata. Thread-compatible.
+struct ColorEncoding : public Fields {
+  ColorEncoding();
+  JXL_FIELDS_NAME(ColorEncoding)
+
+  // Returns ready-to-use color encodings (initialized on-demand).
+  static const ColorEncoding& SRGB(bool is_gray = false);
+  static const ColorEncoding& LinearSRGB(bool is_gray = false);
+
+  // Returns true if an ICC profile was successfully created from fields.
+  // Must be called after modifying fields. Defined in color_management.cc.
+  Status CreateICC();
+
+  // Returns non-empty and valid ICC profile, unless:
+  // - between calling InternalRemoveICC() and CreateICC() in tests;
+  // - WantICC() == true and SetICC() was not yet called;
+  // - after a failed call to SetSRGB(), SetICC(), or CreateICC().
+  const PaddedBytes& ICC() const { return icc_; }
+
+  // Internal only, do not call except from tests.
+  void InternalRemoveICC() { icc_.clear(); }
+
+  // Returns true if `icc` is assigned and decoded successfully. If so,
+  // subsequent WantICC() will return true until DecideIfWantICC() changes it.
+  // Returning false indicates data has been lost.
+  Status SetICC(PaddedBytes&& icc) {
+    if (icc.empty()) return false;
+    icc_ = std::move(icc);
+
+    if (!SetFieldsFromICC()) {
+      InternalRemoveICC();
+      return false;
+    }
+
+    want_icc_ = true;
+    return true;
+  }
+
+  // Sets the raw ICC profile bytes, without parsing the ICC, and without
+  // updating the direct fields such as whitepoint, primaries and color
+  // space. Functions to get and set fields, such as SetWhitePoint, cannot be
+  // used anymore after this and functions such as IsSRGB return false no matter
+  // what the contents of the icc profile.
+  Status SetICCRaw(PaddedBytes&& icc) {
+    if (icc.empty()) return false;
+    icc_ = std::move(icc);
+
+    want_icc_ = true;
+    have_fields_ = false;
+    return true;
+  }
+
+  // Returns whether to send the ICC profile in the codestream.
+  bool WantICC() const { return want_icc_; }
+
+  // Return whether the direct fields are set, if false but ICC is set, only
+  // raw ICC bytes are known.
+  bool HaveFields() const { return have_fields_; }
+
+  // Causes WantICC() to return false if ICC() can be reconstructed from fields.
+  // Defined in color_management.cc.
+  void DecideIfWantICC();
+
+  bool IsGray() const { return color_space_ == ColorSpace::kGray; }
+  bool IsCMYK() const { return cmyk_; }
+  size_t Channels() const { return IsGray() ? 1 : 3; }
+
+  // Returns false if the field is invalid and unusable.
+  bool HasPrimaries() const {
+    return !IsGray() && color_space_ != ColorSpace::kXYB;
+  }
+
+  // Returns true after setting the field to a value defined by color_space,
+  // otherwise false and leaves the field unchanged.
+  bool ImplicitWhitePoint() {
+    if (color_space_ == ColorSpace::kXYB) {
+      white_point = WhitePoint::kD65;
+      return true;
+    }
+    return false;
+  }
+
+  // Returns whether the color space is known to be sRGB. If a raw unparsed ICC
+  // profile is set without the fields being set, this returns false, even if
+  // the content of the ICC profile would match sRGB.
+  bool IsSRGB() const {
+    if (!have_fields_) return false;
+    if (!IsGray() && color_space_ != ColorSpace::kRGB) return false;
+    if (white_point != WhitePoint::kD65) return false;
+    if (primaries != Primaries::kSRGB) return false;
+    if (!tf.IsSRGB()) return false;
+    return true;
+  }
+
+  // Returns whether the color space is known to be linear sRGB. If a raw
+  // unparsed ICC profile is set without the fields being set, this returns
+  // false, even if the content of the ICC profile would match linear sRGB.
+  bool IsLinearSRGB() const {
+    if (!have_fields_) return false;
+    if (!IsGray() && color_space_ != ColorSpace::kRGB) return false;
+    if (white_point != WhitePoint::kD65) return false;
+    if (primaries != Primaries::kSRGB) return false;
+    if (!tf.IsLinear()) return false;
+    return true;
+  }
+  // Resets all fields to sRGB (gray or RGB) and re-creates the ICC profile.
+  Status SetSRGB(const ColorSpace cs,
+                 const RenderingIntent ri = RenderingIntent::kRelative) {
+    InternalRemoveICC();
+    JXL_ASSERT(cs == ColorSpace::kGray || cs == ColorSpace::kRGB);
+    SetColorSpace(cs);  // also keeps tf.nonserialized_color_space in sync
+    white_point = WhitePoint::kD65;
+    primaries = Primaries::kSRGB;
+    tf.SetTransferFunction(TransferFunction::kSRGB);
+    rendering_intent = ri;
+    return CreateICC();
+  }
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Accessors ensure tf.nonserialized_color_space is updated at the same time.
+  ColorSpace GetColorSpace() const { return color_space_; }
+  void SetColorSpace(const ColorSpace cs) {
+    color_space_ = cs;
+    tf.nonserialized_color_space = cs;
+  }
+
+  CIExy GetWhitePoint() const;
+  Status SetWhitePoint(const CIExy& xy);
+
+  PrimariesCIExy GetPrimaries() const;
+  Status SetPrimaries(const PrimariesCIExy& xy);
+
+  // Checks if the color spaces (including white point / primaries) are the
+  // same, but ignores the transfer function, rendering intent and ICC bytes.
+  bool SameColorSpace(const ColorEncoding& other) const {
+    if (color_space_ != other.color_space_) return false;
+
+    if (white_point != other.white_point) return false;
+    if (white_point == WhitePoint::kCustom) {
+      if (white_.x != other.white_.x || white_.y != other.white_.y)
+        return false;
+    }
+
+    if (HasPrimaries() != other.HasPrimaries()) return false;
+    if (HasPrimaries()) {
+      if (primaries != other.primaries) return false;
+      if (primaries == Primaries::kCustom) {
+        if (red_.x != other.red_.x || red_.y != other.red_.y) return false;
+        if (green_.x != other.green_.x || green_.y != other.green_.y)
+          return false;
+        if (blue_.x != other.blue_.x || blue_.y != other.blue_.y) return false;
+      }
+    }
+    return true;
+  }
+
+  // Checks if the color space and transfer function are the same, ignoring
+  // rendering intent and ICC bytes
+  bool SameColorEncoding(const ColorEncoding& other) const {
+    return SameColorSpace(other) && tf.IsSame(other.tf);
+  }
+
+  mutable bool all_default;
+
+  // Only valid if HaveFields()
+  WhitePoint white_point;
+  Primaries primaries;  // Only valid if HasPrimaries()
+  CustomTransferFunction tf;
+  RenderingIntent rendering_intent;
+
+ private:
+  // Returns true if all fields have been initialized (possibly to kUnknown).
+  // Returns false if the ICC profile is invalid or decoding it fails.
+  // Defined in enc_color_management.cc.
+  Status SetFieldsFromICC();
+
+  // If true, the codestream contains an ICC profile and we do not serialize
+  // fields. Otherwise, fields are serialized and we create an ICC profile.
+  bool want_icc_;
+
+  // When false, fields such as white_point and tf are invalid and must not be
+  // used. This occurs after setting a raw bytes-only ICC profile, only the
+  // ICC bytes may be used. The color_space_ field is still valid.
+  bool have_fields_ = true;
+
+  PaddedBytes icc_;  // Valid ICC profile
+
+  ColorSpace color_space_;  // Can be kUnknown
+  bool cmyk_ = false;
+
+  // Only used if white_point == kCustom.
+  Customxy white_;
+
+  // Only used if primaries == kCustom.
+  Customxy red_;
+  Customxy green_;
+  Customxy blue_;
+};
+
+// True if |a - b| is at most max_l1 (the bound itself counts as equal).
+static inline bool ApproxEq(const double a, const double b,
+#if JPEGXL_ENABLE_SKCMS
+                            double max_l1 = 1E-3) {
+#else
+                            double max_l1 = 8E-5) {
+#endif
+  // Defaults cover ICC 15-bit fixed point: seen 7.1E-5 (lcms2), 1E-3 (skcms).
+  const double distance = std::abs(a - b);
+  return distance <= max_l1;
+}
+
+// Text form of the ColorEncoding fields (not icc), e.g. "RGB_D65_SRG_Rel_Lin".
+std::string Description(const ColorEncoding& c);
+static inline std::ostream& operator<<(std::ostream& os,
+                                       const ColorEncoding& c) {
+  os << Description(c);
+  return os;
+}
+
+void ConvertInternalToExternalColorEncoding(const jxl::ColorEncoding& internal,
+                                            JxlColorEncoding* external);
+// Converse direction; returns an error if a value cannot be converted.
+Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external,
+                                              jxl::ColorEncoding* internal);
+// 3x3 matrix helpers: each writes 9 floats to `matrix`, or returns an error.
+Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
+                      float by, float wx, float wy, float matrix[9]);
+Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx,
+                         float by, float wx, float wy, float matrix[9]);
+Status AdaptToXYZD50(float wx, float wy, float matrix[9]);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COLOR_ENCODING_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
new file mode 100644
index 0000000000..6ad47e1923
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_encoding_internal.h"
+
+#include <stdio.h>
+
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(ColorEncodingTest, RoundTripAll) {
+  for (const test::ColorEncodingDescriptor& cdesc : test::AllEncodings()) {
+    const ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc);
+    // Verify Set(Get) on a fresh ColorEncoding yields the same white
+    {  // point / primaries / gamma. First: white point.
+      ColorEncoding c;
+      EXPECT_TRUE(c.SetWhitePoint(c_original.GetWhitePoint()));
+      EXPECT_EQ(c_original.white_point, c.white_point);
+    }
+    {  // Second: primaries.
+      ColorEncoding c;
+      EXPECT_TRUE(c.SetPrimaries(c_original.GetPrimaries()));
+      EXPECT_EQ(c_original.primaries, c.primaries);
+    }
+    if (c_original.tf.IsGamma()) {  // Third: gamma, only when one is set.
+      ColorEncoding c;
+      EXPECT_TRUE(c.tf.SetGamma(c_original.tf.GetGamma()));
+      EXPECT_TRUE(c_original.tf.IsSame(c.tf));
+    }
+  }
+}
+
+TEST(ColorEncodingTest, CustomWhitePoint) {
+  ColorEncoding c;
+  // Nonsensical values; the test only expects SetWhitePoint to accept them.
+  CIExy xy_in;
+  xy_in.x = 0.8;
+  xy_in.y = 0.01;
+  EXPECT_TRUE(c.SetWhitePoint(xy_in));
+  const CIExy xy = c.GetWhitePoint();  // Value as stored by |c|.
+
+  ColorEncoding c2;  // Setting the stored value again must give the same space.
+  EXPECT_TRUE(c2.SetWhitePoint(xy));
+  EXPECT_TRUE(c.SameColorSpace(c2));
+}
+
+TEST(ColorEncodingTest, CustomPrimaries) {
+  ColorEncoding c;
+  PrimariesCIExy xy_in;
+  // Nonsensical values (negative and > 1); SetPrimaries must still accept them.
+  xy_in.r.x = -0.01;
+  xy_in.r.y = 0.2;
+  xy_in.g.x = 0.4;
+  xy_in.g.y = 0.401;
+  xy_in.b.x = 1.1;
+  xy_in.b.y = -1.2;
+  EXPECT_TRUE(c.SetPrimaries(xy_in));
+  const PrimariesCIExy xy = c.GetPrimaries();  // Values as stored by |c|.
+
+  ColorEncoding c2;  // Setting the stored values again must give the same space.
+  EXPECT_TRUE(c2.SetPrimaries(xy));
+  EXPECT_TRUE(c.SameColorSpace(c2));
+}
+
+TEST(ColorEncodingTest, CustomGamma) {
+  ColorEncoding c;
+#ifndef JXL_CRASH_ON_ERROR  // Rejections are only checked in this build mode.
+  EXPECT_FALSE(c.tf.SetGamma(0.0));    // Zero is rejected.
+  EXPECT_FALSE(c.tf.SetGamma(-1E-6));  // Negative is rejected.
+  EXPECT_FALSE(c.tf.SetGamma(1.001));  // Above 1 is rejected.
+#endif
+  EXPECT_TRUE(c.tf.SetGamma(1.0));  // Accepted, but reported as linear:
+  EXPECT_FALSE(c.tf.IsGamma());
+  EXPECT_TRUE(c.tf.IsLinear());
+
+  EXPECT_TRUE(c.tf.SetGamma(0.123));
+  EXPECT_TRUE(c.tf.IsGamma());
+  const double gamma = c.tf.GetGamma();  // Value as stored by |c|.
+
+  ColorEncoding c2;  // Setting the stored gamma again must give an equal encoding.
+  EXPECT_TRUE(c2.tf.SetGamma(gamma));
+  EXPECT_TRUE(c.SameColorEncoding(c2));
+  EXPECT_TRUE(c2.tf.IsGamma());
+}
+
+TEST(ColorEncodingTest, InternalExternalConversion) {
+  ColorEncoding source_internal;
+  JxlColorEncoding external;
+  ColorEncoding destination_internal;
+
+  for (int i = 0; i < 100; i++) {  // 100 rounds; rand() is not seeded here.
+    source_internal.SetColorSpace(static_cast<ColorSpace>(rand() % 4));
+    CIExy wp;  // Each random coordinate below lies in [0.25, 0.75].
+    wp.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+    wp.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+    EXPECT_TRUE(source_internal.SetWhitePoint(wp));
+    if (source_internal.HasPrimaries()) {
+      PrimariesCIExy primaries;
+      primaries.r.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.r.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.g.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.g.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.b.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.b.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      EXPECT_TRUE(source_internal.SetPrimaries(primaries));
+    }
+    CustomTransferFunction tf;  // Random gamma, also in [0.25, 0.75].
+    EXPECT_TRUE(tf.SetGamma((float(rand()) / float((RAND_MAX)) * 0.5) + 0.25));
+    source_internal.tf = tf;
+    source_internal.rendering_intent = static_cast<RenderingIntent>(rand() % 4);
+
+    ConvertInternalToExternalColorEncoding(source_internal, &external);
+    EXPECT_TRUE(ConvertExternalToInternalColorEncoding(external,
+                                                       &destination_internal));
+
+    EXPECT_EQ(source_internal.GetColorSpace(),  // Round trip must be exact.
+              destination_internal.GetColorSpace());
+    EXPECT_EQ(source_internal.white_point, destination_internal.white_point);
+    EXPECT_EQ(source_internal.GetWhitePoint().x,
+              destination_internal.GetWhitePoint().x);
+    EXPECT_EQ(source_internal.GetWhitePoint().y,
+              destination_internal.GetWhitePoint().y);
+    if (source_internal.HasPrimaries()) {
+      EXPECT_EQ(source_internal.GetPrimaries().r.x,
+                destination_internal.GetPrimaries().r.x);
+      EXPECT_EQ(source_internal.GetPrimaries().r.y,
+                destination_internal.GetPrimaries().r.y);
+      EXPECT_EQ(source_internal.GetPrimaries().g.x,
+                destination_internal.GetPrimaries().g.x);
+      EXPECT_EQ(source_internal.GetPrimaries().g.y,
+                destination_internal.GetPrimaries().g.y);
+      EXPECT_EQ(source_internal.GetPrimaries().b.x,
+                destination_internal.GetPrimaries().b.x);
+      EXPECT_EQ(source_internal.GetPrimaries().b.y,
+                destination_internal.GetPrimaries().b.y);
+    }
+    EXPECT_EQ(source_internal.tf.IsGamma(), destination_internal.tf.IsGamma());
+    if (source_internal.tf.IsGamma()) {
+      EXPECT_EQ(source_internal.tf.GetGamma(),
+                destination_internal.tf.GetGamma());
+    } else {  // Non-gamma: compare the transfer function value instead.
+      EXPECT_EQ(source_internal.tf.GetTransferFunction(),
+                destination_internal.tf.GetTransferFunction());
+    }
+    EXPECT_EQ(source_internal.rendering_intent,
+              destination_internal.rendering_intent);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/color_management.cc b/third_party/jpeg-xl/lib/jxl/color_management.cc
new file mode 100644
index 0000000000..d656888a8b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_management.cc
@@ -0,0 +1,682 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_management.h"
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/color_management.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// NOTE: this is only used to provide a reasonable ICC profile that other
+// software can read. Our own transforms use ExtraTF instead because that is
+// more precise and supports unbounded mode.
+std::vector<uint16_t> CreateTableCurve(uint32_t N, const ExtraTF tf) {
+  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
+  JXL_ASSERT(tf == ExtraTF::kPQ || tf == ExtraTF::kHLG);  // Only these two.
+  // No point using float - LCMS converts to 16-bit for A2B/MFT.
+  std::vector<uint16_t> table(N);
+  for (uint32_t i = 0; i < N; ++i) {
+    const float x = static_cast<float>(i) / (N - 1);  // 1.0 at index N - 1; N == 1 would divide by zero.
+    const double dx = static_cast<double>(x);
+    // LCMS requires EOTF (e.g. 2.4 exponent).
+    double y = (tf == ExtraTF::kHLG) ? TF_HLG().DisplayFromEncoded(dx)
+                                     : TF_PQ().DisplayFromEncoded(dx);
+    JXL_ASSERT(y >= 0.0);
+    // Clamp to table range - necessary for HLG.
+    if (y > 1.0) y = 1.0;
+    // 1.0 corresponds to table value 0xFFFF. The product is narrowed to float.
+    table[i] = static_cast<uint16_t>(roundf(y * 65535.0));
+  }
+  return table;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(CreateTableCurve);  // Local function.
+
+Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]) {
+  // Normalize to Y = 1; this divides by y, so (near-)zero y is rejected.
+  if (std::abs(xy.y) < 1e-12) return JXL_FAILURE("Y value is too small");
+  const float inv_y = 1 / xy.y;
+  XYZ[2] = (1 - xy.x - xy.y) * inv_y;
+  XYZ[1] = 1;
+  XYZ[0] = xy.x * inv_y;
+  return true;
+}
+
+namespace {
+
+// NOTE: this is only used to provide a reasonable ICC profile that other
+// software can read. Our own transforms use ExtraTF instead because that is
+// more precise and supports unbounded mode.
+template <class Func>
+std::vector<uint16_t> CreateTableCurve(uint32_t N, const Func& func) {
+  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
+  // 16-bit samples suffice: LCMS converts to 16-bit for A2B/MFT anyway.
+  std::vector<uint16_t> samples(N);
+  for (uint32_t idx = 0; idx < N; ++idx) {
+    // Encoded input in [0, 1]; exactly 1.0 at index N - 1.
+    const float enc = static_cast<float>(idx) / (N - 1);
+    // LCMS requires EOTF (e.g. 2.4 exponent).
+    const double y = func.DisplayFromEncoded(static_cast<double>(enc));
+    JXL_ASSERT(y >= 0.0);
+    // Clamp to table range (necessary for HLG); 1.0 maps to 0xFFFF.
+    const double clamped = std::min(y, 1.0);
+    samples[idx] = static_cast<uint16_t>(roundf(clamped * 65535.0));
+  }
+  return samples;
+}
+
+void ICCComputeMD5(const PaddedBytes& data, uint8_t sum[16])
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {  // uint32_t sums below wrap.
+  PaddedBytes data64 = data;  // Working copy, padded to whole 64-byte blocks.
+  data64.push_back(128);      // 0x80 marker byte directly after the message.
+  // Add zero bytes such that ((size + 8) & 63) == 0.
+  size_t extra = ((64 - ((data64.size() + 8) & 63)) & 63);
+  data64.resize(data64.size() + extra, 0);
+  for (uint64_t i = 0; i < 64; i += 8) {  // Length in bits, low byte first.
+    data64.push_back(static_cast<uint64_t>(data.size() << 3u) >> i);
+  }
+
+  static const uint32_t sineparts[64] = {
+      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
+      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
+      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
+      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
+      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
+      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
+  };
+  static const uint32_t shift[64] = {
+      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
+      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21,
+  };
+
+  uint32_t a0 = 0x67452301, b0 = 0xefcdab89, c0 = 0x98badcfe, d0 = 0x10325476;
+
+  for (size_t i = 0; i < data64.size(); i += 64) {  // One 64-byte block each.
+    uint32_t a = a0, b = b0, c = c0, d = d0, f, g;
+    for (size_t j = 0; j < 64; j++) {  // Four groups of 16 steps.
+      if (j < 16) {
+        f = (b & c) | ((~b) & d);
+        g = j;
+      } else if (j < 32) {
+        f = (d & b) | ((~d) & c);
+        g = (5 * j + 1) & 0xf;
+      } else if (j < 48) {
+        f = b ^ c ^ d;
+        g = (3 * j + 5) & 0xf;
+      } else {
+        f = c ^ (b | (~d));
+        g = (7 * j) & 0xf;
+      }
+      uint32_t dg0 = data64[i + g * 4 + 0], dg1 = data64[i + g * 4 + 1],
+               dg2 = data64[i + g * 4 + 2], dg3 = data64[i + g * 4 + 3];
+      uint32_t u = dg0 | (dg1 << 8u) | (dg2 << 16u) | (dg3 << 24u);  // LE word g.
+      f += a + sineparts[j] + u;
+      a = d;
+      d = c;
+      c = b;
+      b += (f << shift[j]) | (f >> (32u - shift[j]));  // Rotate f left.
+    }
+    a0 += a;
+    b0 += b;
+    c0 += c;
+    d0 += d;
+  }
+  sum[0] = a0;  // Output order: a0, b0, c0, d0, each low byte first.
+  sum[1] = a0 >> 8u;
+  sum[2] = a0 >> 16u;
+  sum[3] = a0 >> 24u;
+  sum[4] = b0;
+  sum[5] = b0 >> 8u;
+  sum[6] = b0 >> 16u;
+  sum[7] = b0 >> 24u;
+  sum[8] = c0;
+  sum[9] = c0 >> 8u;
+  sum[10] = c0 >> 16u;
+  sum[11] = c0 >> 24u;
+  sum[12] = d0;
+  sum[13] = d0 >> 8u;
+  sum[14] = d0 >> 16u;
+  sum[15] = d0 >> 24u;
+}
+
+Status CreateICCChadMatrix(CIExy w, float result[9]) {
+  // Fills |result| from AdaptToXYZD50 for white point |w|; |result| is only
+  // written on success. WhitePoint can not be pitch-black (y == 0).
+  if (w.y == 0) return JXL_FAILURE("Invalid WhitePoint");
+  float adapted[9];
+  JXL_RETURN_IF_ERROR(AdaptToXYZD50(w.x, w.y, adapted));
+  memcpy(result, adapted, sizeof(adapted));
+  return true;
+}
+
+// Creates RGB to XYZ matrix given RGB primaries and whitepoint in xy.
+Status CreateICCRGBMatrix(CIExy r, CIExy g, CIExy b, CIExy w, float result[9]) {
+  float xyz_d50[9];  // Scratch matrix; |result| is only written on success.
+  JXL_RETURN_IF_ERROR(
+      PrimariesToXYZD50(r.x, r.y, g.x, g.y, b.x, b.y, w.x, w.y, xyz_d50));
+  std::copy(xyz_d50, xyz_d50 + 9, result);
+  return true;
+}
+
+void WriteICCUint32(uint32_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (pos + 4 > icc->size()) icc->resize(pos + 4);  // Grow to fit 4 bytes.
+  // Big-endian: most significant byte first.
+  for (size_t i = 0; i < 4; ++i) {
+    (*icc)[pos + i] = static_cast<uint8_t>(value >> (24u - 8u * i));
+  }
+}
+
+void WriteICCUint16(uint16_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (pos + 2 > icc->size()) icc->resize(pos + 2);  // Grow to fit 2 bytes.
+  (*icc)[pos] = static_cast<uint8_t>(value >> 8u);  // High byte first.
+  (*icc)[pos + 1] = static_cast<uint8_t>(value);    // Then the low byte.
+}
+
+void WriteICCUint8(uint8_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (pos + 1 > icc->size()) icc->resize(pos + 1);  // Grow to include |pos|.
+  (*icc)[pos] = value;
+}
+
+// Writes the 4-character tag |value| at |pos|, growing |icc| if necessary.
+void WriteICCTag(const char* value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (pos + 4 > icc->size()) icc->resize(pos + 4);
+  std::copy(value, value + 4, icc->data() + pos);
+}
+
+Status WriteICCS15Fixed16(float value, size_t pos,
+                          PaddedBytes* JXL_RESTRICT icc) {
+  // Writes |value| as big-endian s15Fixed16 at |pos|. The bounds stay just
+  // inside +/-32768 ("nextafterf" towards zero: 32767.998046875, ...), so the
+  // scaled value fits int32_t; both comparisons are false for NaN.
+  bool ok = (-32767.995f <= value) && (value <= 32767.995f);
+  if (!ok) return JXL_FAILURE("ICC value is out of range / NaN");
+  // Round to nearest; "+ 0.5f" then truncation would bias negatives upward.
+  const int32_t i = static_cast<int32_t>(lroundf(value * 65536.0f));
+  // Use two's complement
+  WriteICCUint32(static_cast<uint32_t>(i), pos, icc);
+  return true;
+}
+
+Status CreateICCHeader(const ColorEncoding& c,
+                       PaddedBytes* JXL_RESTRICT header) {
+  // TODO(lode): choose color management engine name, e.g. "skia" if
+  // integrated in skia.
+  static const char* kCmm = "jxl ";
+
+  header->resize(128, 0);  // Fixed 128-byte header; unwritten bytes stay 0.
+
+  WriteICCUint32(0, 0, header);  // size, correct value filled in at end
+  WriteICCTag(kCmm, 4, header);  // preferred CMM
+  WriteICCUint32(0x04400000u, 8, header);  // profile version 4.4
+  const char* profile_type =
+      c.GetColorSpace() == ColorSpace::kXYB ? "scnr" : "mntr";
+  WriteICCTag(profile_type, 12, header);  // profile/device class
+  WriteICCTag(c.IsGray() ? "GRAY" : "RGB ", 16, header);  // data color space
+  WriteICCTag("XYZ ", 20, header);  // profile connection space
+
+  // Date/time: six uint16_t fields occupying bytes 24..35.
+  // TODO(lode): encode actual date and time, this is a placeholder
+  uint32_t year = 2019, month = 12, day = 1;
+  uint32_t hour = 0, minute = 0, second = 0;
+  WriteICCUint16(year, 24, header);
+  WriteICCUint16(month, 26, header);
+  WriteICCUint16(day, 28, header);
+  WriteICCUint16(hour, 30, header);
+  WriteICCUint16(minute, 32, header);
+  WriteICCUint16(second, 34, header);
+
+  WriteICCTag("acsp", 36, header);  // profile file signature
+  WriteICCTag("APPL", 40, header);  // primary platform
+  WriteICCUint32(0, 44, header);  // flags
+  WriteICCUint32(0, 48, header);  // device manufacturer
+  WriteICCUint32(0, 52, header);  // device model
+  WriteICCUint32(0, 56, header);  // device attributes
+  WriteICCUint32(0, 60, header);  // device attributes
+  WriteICCUint32(static_cast<uint32_t>(c.rendering_intent), 64, header);
+
+  // Mandatory D50 white point of profile connection space
+  WriteICCUint32(0x0000f6d6, 68, header);
+  WriteICCUint32(0x00010000, 72, header);
+  WriteICCUint32(0x0000d32d, 76, header);
+
+  WriteICCTag(kCmm, 80, header);  // profile creator
+
+  return true;  // No failure path; bytes 84..127 remain zero here.
+}
+
+void AddToICCTagTable(const char* tag, size_t offset, size_t size,
+                      PaddedBytes* JXL_RESTRICT tagtable,
+                      std::vector<size_t>* offsets) {
+  const size_t entry = tagtable->size();  // New 12-byte entry starts here.
+  WriteICCTag(tag, entry, tagtable);
+  WriteICCUint32(0, entry + 4, tagtable);  // writing true offset deferred
+  WriteICCUint32(size, entry + 8, tagtable);
+  offsets->push_back(offset);
+}
+
+void FinalizeICCTag(PaddedBytes* JXL_RESTRICT tags, size_t* offset,
+                    size_t* size) {
+  // Zero-pad |tags| up to the next multiple of 4 bytes, then describe the tag
+  // just appended: it begins where the previous one (|offset|, |size|) ended.
+  tags->resize((tags->size() + 3) & ~static_cast<size_t>(3), 0);
+  *offset += *size;
+  *size = tags->size() - *offset;
+}
+
+// The input text must be ASCII, writing other characters to UTF-16 is not
+// implemented.
+void CreateICCMlucTag(const std::string& text, PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("mluc", tags->size(), tags);              // type signature
+  WriteICCUint32(0, tags->size(), tags);                // reserved
+  WriteICCUint32(1, tags->size(), tags);                // number of records
+  WriteICCUint32(12, tags->size(), tags);               // size of one record
+  WriteICCTag("enUS", tags->size(), tags);              // language + country
+  WriteICCUint32(text.size() * 2, tags->size(), tags);  // string length, bytes
+  WriteICCUint32(28, tags->size(), tags);               // string offset in tag
+  for (const char ch : text) {
+    tags->push_back(0);  // prepend 0 for UTF-16
+    tags->push_back(ch);
+  }
+}
+
+Status CreateICCXYZTag(float xyz[3], PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("XYZ ", tags->size(), tags);  // type signature
+  WriteICCUint32(0, tags->size(), tags);    // reserved
+  // The three components follow in order; the first failure aborts the tag.
+  JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[0], tags->size(), tags));
+  JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[1], tags->size(), tags));
+  JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[2], tags->size(), tags));
+  return true;
+}
+
+Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("sf32", tags->size(), tags);  // type signature
+  WriteICCUint32(0, tags->size(), tags);    // reserved
+  for (const float* entry = chad; entry != chad + 9; ++entry) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(*entry, tags->size(), tags));
+  }
+  return true;
+}
+
+void MaybeCreateICCCICPTag(const ColorEncoding& c,
+                           PaddedBytes* JXL_RESTRICT tags, size_t* offset,
+                           size_t* size, PaddedBytes* JXL_RESTRICT tagtable,
+                           std::vector<size_t>* offsets) {
+  if (c.GetColorSpace() != ColorSpace::kRGB) {  // Tag is only emitted for RGB.
+    return;
+  }
+  uint8_t primaries = 0;
+  if (c.primaries == Primaries::kP3) {
+    if (c.white_point == WhitePoint::kD65) {
+      primaries = 12;  // Code used for P3 primaries with D65 white point.
+    } else if (c.white_point == WhitePoint::kDCI) {
+      primaries = 11;  // Code used for P3 primaries with DCI white point.
+    } else {
+      return;  // P3 with any other white point: no tag.
+    }
+  } else if (c.primaries != Primaries::kCustom &&
+             c.white_point == WhitePoint::kD65) {
+    primaries = static_cast<uint8_t>(c.primaries);  // Enum value used as code.
+  } else {
+    return;  // Custom primaries or non-D65 white point: no tag.
+  }
+  if (c.tf.IsUnknown() || c.tf.IsGamma()) {  // No code for these: no tag.
+    return;
+  }
+  WriteICCTag("cicp", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint8(primaries, tags->size(), tags);
+  WriteICCUint8(static_cast<uint8_t>(c.tf.GetTransferFunction()), tags->size(),
+                tags);
+  // Matrix
+  WriteICCUint8(0, tags->size(), tags);
+  // Full range
+  WriteICCUint8(1, tags->size(), tags);
+  FinalizeICCTag(tags, offset, size);  // Pads and updates *offset / *size.
+  AddToICCTagTable("cicp", *offset, *size, tagtable, offsets);
+}
+
+void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
+                          PaddedBytes* JXL_RESTRICT tags) {
+  const size_t start = tags->size();
+  tags->resize(start + 12 + curve.size() * 2, 0);  // header + 2 bytes/sample
+  WriteICCTag("curv", start, tags);
+  WriteICCUint32(0, start + 4, tags);
+  WriteICCUint32(curve.size(), start + 8, tags);
+  for (size_t k = 0; k != curve.size(); ++k) {
+    WriteICCUint16(curve[k], start + 12 + 2 * k, tags);
+  }
+}
+
+// Writes 12 + 4*params.size() bytes
+Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
+                            PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("para", tags->size(), tags);         // type signature
+  WriteICCUint32(0, tags->size(), tags);           // reserved
+  WriteICCUint16(curve_type, tags->size(), tags);  // function type
+  WriteICCUint16(0, tags->size(), tags);           // reserved
+  for (const float param : params) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(param, tags->size(), tags));
+  }
+  return true;
+}
+
+Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("mAB ", tags->size(), tags);  // All offsets below are tag-relative.
+  // 4 reserved bytes set to 0
+  WriteICCUint32(0, tags->size(), tags);
+  // number of input channels
+  WriteICCUint8(3, tags->size(), tags);
+  // number of output channels
+  WriteICCUint8(3, tags->size(), tags);
+  // 2 reserved bytes for padding
+  WriteICCUint16(0, tags->size(), tags);
+  // offset to first B curve
+  WriteICCUint32(32, tags->size(), tags);
+  // offset to matrix
+  WriteICCUint32(244, tags->size(), tags);
+  // offset to first M curve
+  WriteICCUint32(148, tags->size(), tags);
+  // offset to CLUT
+  WriteICCUint32(80, tags->size(), tags);
+  // offset to first A curve
+  // (reuse linear B curves)
+  WriteICCUint32(32, tags->size(), tags);
+
+  // offset = 32
+  // no-op curves (3 * 16 bytes, one per channel)
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  // offset = 80
+  // number of grid points for each input channel (16 bytes, only 3 in use)
+  for (int i = 0; i < 16; ++i) {
+    WriteICCUint8(i < 3 ? 2 : 0, tags->size(), tags);
+  }
+  // precision = 2
+  WriteICCUint8(2, tags->size(), tags);
+  // 3 bytes of padding
+  WriteICCUint8(0, tags->size(), tags);
+  WriteICCUint16(0, tags->size(), tags);
+  const float kOffsets[3] = {
+      kScaledXYBOffset[0] + kScaledXYBOffset[1],
+      kScaledXYBOffset[1] - kScaledXYBOffset[0] + 1.0f / kScaledXYBScale[0],
+      kScaledXYBOffset[1] + kScaledXYBOffset[2]};
+  const float kScaling[3] = {
+      1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
+      1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
+      1.0f / (1.0f / kScaledXYBScale[1] + 1.0f / kScaledXYBScale[2])};
+  // 2*2*2*3 entries of 2 bytes each = 48 bytes
+  for (size_t ix = 0; ix < 2; ++ix) {
+    for (size_t iy = 0; iy < 2; ++iy) {
+      for (size_t ib = 0; ib < 2; ++ib) {
+        float in_f[3] = {ix * 1.0f, iy * 1.0f, ib * 1.0f};  // Grid corner.
+        for (size_t c = 0; c < 3; ++c) {  // Undo the XYB scale and offset.
+          in_f[c] /= kScaledXYBScale[c];
+          in_f[c] -= kScaledXYBOffset[c];
+        }
+        float out_f[3];
+        out_f[0] = in_f[1] + in_f[0];
+        out_f[1] = in_f[1] - in_f[0];
+        out_f[2] = in_f[2] + in_f[1];
+        for (int i = 0; i < 3; ++i) {
+          out_f[i] += kOffsets[i];
+          out_f[i] *= kScaling[i];
+        }
+        for (int i = 0; i < 3; ++i) {
+          JXL_RETURN_IF_ERROR(out_f[i] >= 0.f && out_f[i] <= 1.f);  // Else fail.
+          uint16_t val = static_cast<uint16_t>(
+              0.5f + 65535 * std::max(0.f, std::min(1.f, out_f[i])));
+          WriteICCUint16(val, tags->size(), tags);
+        }
+      }
+    }
+  }
+  // offset = 148
+  // 3 curves with 5 parameters = 3 * (12 + 5 * 4) = 96 bytes
+  for (size_t i = 0; i < 3; ++i) {
+    const float b =
+        -kOffsets[i] - std::cbrt(jxl::kNegOpsinAbsorbanceBiasRGB[i]);
+    std::vector<float> params = {
+        3,
+        1.0f / kScaling[i],
+        b,
+        0,                                // unused
+        std::max(0.f, -b * kScaling[i]),  // make skcms happy
+    };
+    JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 3, tags));
+  }
+  // offset = 244
+  const double matrix[] = {1.5170095, -1.1065225, 0.071623,
+                           -0.050022, 0.5683655,  -0.018344,
+                           -1.387676, 1.1145555,  0.6857255};
+  // 12 * 4 = 48 bytes: 9 matrix entries followed by 3 intercepts
+  for (size_t i = 0; i < 9; ++i) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(matrix[i], tags->size(), tags));
+  }
+  for (size_t i = 0; i < 3; ++i) {
+    float intercept = 0;  // Row i of |matrix| times the bias vector.
+    for (size_t j = 0; j < 3; ++j) {
+      intercept += matrix[i * 3 + j] * jxl::kNegOpsinAbsorbanceBiasRGB[j];
+    }
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(intercept, tags->size(), tags));
+  }
+  return true;
+}
+}  // namespace
+
+Status MaybeCreateProfile(const ColorEncoding& c,
+                          PaddedBytes* JXL_RESTRICT icc) {
+  PaddedBytes header, tagtable, tags;  // Built separately, concatenated last.
+
+  if (c.GetColorSpace() == ColorSpace::kUnknown || c.tf.IsUnknown()) {
+    return false;  // Not an error
+  }
+
+  switch (c.GetColorSpace()) {
+    case ColorSpace::kRGB:
+    case ColorSpace::kGray:
+    case ColorSpace::kXYB:
+      break;  // OK
+    default:
+      return JXL_FAILURE("Invalid CS %u",
+                         static_cast<unsigned int>(c.GetColorSpace()));
+  }
+
+  if (c.GetColorSpace() == ColorSpace::kXYB &&
+      c.rendering_intent != RenderingIntent::kPerceptual) {
+    return JXL_FAILURE(
+        "Only perceptual rendering intent implemented for XYB "
+        "ICC profile.");
+  }
+
+  JXL_RETURN_IF_ERROR(CreateICCHeader(c, &header));
+
+  std::vector<size_t> offsets;  // Per-entry offsets, relative to |tags|.
+  // tag count, deferred to later
+  WriteICCUint32(0, tagtable.size(), &tagtable);
+
+  size_t tag_offset = 0, tag_size = 0;  // Updated by each FinalizeICCTag call.
+
+  CreateICCMlucTag(Description(c), &tags);
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets);
+
+  const std::string copyright = "CC0";
+  CreateICCMlucTag(copyright, &tags);
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets);
+
+  // TODO(eustas): isn't it the other way round: gray image has d50 WhitePoint?
+  if (c.IsGray()) {
+    float wtpt[3];
+    JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(c.GetWhitePoint(), wtpt));
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(wtpt, &tags));
+  } else {
+    float d50[3] = {0.964203, 1.0, 0.824905};
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(d50, &tags));
+  }
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("wtpt", tag_offset, tag_size, &tagtable, &offsets);
+
+  if (!c.IsGray()) {
+    // Chromatic adaptation matrix
+    float chad[9];
+    JXL_RETURN_IF_ERROR(CreateICCChadMatrix(c.GetWhitePoint(), chad));
+
+    JXL_RETURN_IF_ERROR(CreateICCChadTag(chad, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("chad", tag_offset, tag_size, &tagtable, &offsets);
+  }
+
+  if (c.GetColorSpace() == ColorSpace::kRGB) {
+    MaybeCreateICCCICPTag(c, &tags, &tag_offset, &tag_size, &tagtable,
+                          &offsets);
+
+    const PrimariesCIExy primaries = c.GetPrimaries();
+    float m[9];
+    JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g,
+                                           primaries.b, c.GetWhitePoint(), m));
+    float r[3] = {m[0], m[3], m[6]};  // Columns of the row-major matrix |m|.
+    float g[3] = {m[1], m[4], m[7]};
+    float b[3] = {m[2], m[5], m[8]};
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(r, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("rXYZ", tag_offset, tag_size, &tagtable, &offsets);
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(g, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("gXYZ", tag_offset, tag_size, &tagtable, &offsets);
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(b, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("bXYZ", tag_offset, tag_size, &tagtable, &offsets);
+  }
+
+  if (c.GetColorSpace() == ColorSpace::kXYB) {
+    JXL_RETURN_IF_ERROR(CreateICCLutAtoBTagForXYB(&tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("A2B0", tag_offset, tag_size, &tagtable, &offsets);
+  } else {
+    if (c.tf.IsGamma()) {
+      float gamma = 1.0 / c.tf.GetGamma();  // The curve stores the reciprocal.
+      JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({gamma}, 0, &tags));
+    } else if (c.GetColorSpace() != ColorSpace::kXYB) {  // Always true here.
+      switch (c.tf.GetTransferFunction()) {
+        case TransferFunction::kHLG:
+          CreateICCCurvCurvTag(
+              HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kHLG),
+              &tags);
+          break;
+        case TransferFunction::kPQ:
+          CreateICCCurvCurvTag(
+              HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kPQ),
+              &tags);
+          break;
+        case TransferFunction::kSRGB:
+          JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
+              {2.4, 1.0 / 1.055, 0.055 / 1.055, 1.0 / 12.92, 0.04045}, 3,
+              &tags));
+          break;
+        case TransferFunction::k709:
+          JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
+              {1.0 / 0.45, 1.0 / 1.099, 0.099 / 1.099, 1.0 / 4.5, 0.081}, 3,
+              &tags));
+          break;
+        case TransferFunction::kLinear:
+          JXL_RETURN_IF_ERROR(
+              CreateICCCurvParaTag({1.0, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
+          break;
+        case TransferFunction::kDCI:
+          JXL_RETURN_IF_ERROR(
+              CreateICCCurvParaTag({2.6, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
+          break;
+        default:
+          JXL_ABORT("Unknown TF %u",
+                    static_cast<unsigned int>(c.tf.GetTransferFunction()));
+      }
+    }
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    if (c.IsGray()) {  // One curve is stored; RGB entries all point to it.
+      AddToICCTagTable("kTRC", tag_offset, tag_size, &tagtable, &offsets);
+    } else {
+      AddToICCTagTable("rTRC", tag_offset, tag_size, &tagtable, &offsets);
+      AddToICCTagTable("gTRC", tag_offset, tag_size, &tagtable, &offsets);
+      AddToICCTagTable("bTRC", tag_offset, tag_size, &tagtable, &offsets);
+    }
+  }
+
+  // Tag count, then patch each deferred offset to be relative to the profile.
+  WriteICCUint32(offsets.size(), 0, &tagtable);
+  for (size_t i = 0; i < offsets.size(); i++) {
+    WriteICCUint32(offsets[i] + header.size() + tagtable.size(), 4 + 12 * i + 4,
+                   &tagtable);
+  }
+
+  // ICC profile size
+  WriteICCUint32(header.size() + tagtable.size() + tags.size(), 0, &header);
+
+  *icc = header;
+  icc->append(tagtable);
+  icc->append(tags);
+
+  // The MD5 checksum must be computed on the profile with profile flags,
+  // rendering intent, and region of the checksum itself, set to 0.
+  // TODO(lode): manually verify with a reliable tool that this creates correct
+  // signature (profile id) for ICC profiles.
+  PaddedBytes icc_sum = *icc;  // Bytes 84..99 are still zero at this point.
+  if (icc_sum.size() >= 64 + 4) {
+    memset(icc_sum.data() + 44, 0, 4);  // flags
+    memset(icc_sum.data() + 64, 0, 4);  // rendering intent
+  }
+  uint8_t checksum[16];
+  ICCComputeMD5(icc_sum, checksum);
+
+  memcpy(icc->data() + 84, checksum, sizeof(checksum));
+
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/color_management.h b/third_party/jpeg-xl/lib/jxl/color_management.h
new file mode 100644
index 0000000000..f623aa1c90
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_management.h
@@ -0,0 +1,40 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COLOR_MANAGEMENT_H_
+#define LIB_JXL_COLOR_MANAGEMENT_H_
+
+// ICC profiles and color space conversions.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+enum class ExtraTF {
+  kNone,  // NOTE(review): presumably "no extra transfer function" - confirm.
+  kPQ,    // Tabulated via TF_PQ in CreateTableCurve (color_management.cc).
+  kHLG,   // Tabulated via TF_HLG in CreateTableCurve (color_management.cc).
+  kSRGB,  // NOTE(review): presumably the sRGB transfer function - confirm.
+};
+
+// NOTE: for XYB colorspace, the created profile can be used to transform a
+// *scaled* XYB image (created by ScaleXYB()) to another colorspace.
+Status MaybeCreateProfile(const ColorEncoding& c,
+                          PaddedBytes* JXL_RESTRICT icc);
+
+Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COLOR_MANAGEMENT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/color_management_test.cc b/third_party/jpeg-xl/lib/jxl/color_management_test.cc
new file mode 100644
index 0000000000..fc7a1c57ff
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_management_test.cc
@@ -0,0 +1,405 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_management.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <new>
+#include <string>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+std::ostream& operator<<(std::ostream& os, const CIExy& xy) {
+  return os << "{x=" << xy.x << ", y=" << xy.y << "}";  // prints {x=..., y=...}
+}
+
+std::ostream& operator<<(std::ostream& os, const PrimariesCIExy& primaries) {
+  return os << "{r=" << primaries.r << ", g=" << primaries.g
+            << ", b=" << primaries.b << "}";  // r/g/b use the CIExy overload
+}
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::FloatNear;
+
+// Small enough to be fast. If changed, must update Generate*.
+static constexpr size_t kWidth = 16;
+
+static constexpr size_t kNumThreads = 1;  // only have a single row.
+
+struct Globals {  // Shared test images and encodings, built once on first use.
+  // TODO(deymo): Make this a const.
+  static Globals* GetInstance() {
+    static Globals ret;  // function-local static: constructed on first call
+    return &ret;
+  }
+
+ private:
+  Globals() {
+    in_gray = GenerateGray();
+    in_color = GenerateColor();
+    out_gray = ImageF(kWidth, 1);
+    out_color = ImageF(kWidth * 3, 1);
+
+    c_native = ColorEncoding::LinearSRGB(/*is_gray=*/false);
+    c_gray = ColorEncoding::LinearSRGB(/*is_gray=*/true);
+  }
+
+  static ImageF GenerateGray() {
+    ImageF gray(kWidth, 1);
+    float* JXL_RESTRICT row = gray.Row(0);
+    // Linear ramp, increasing left to right.
+    for (uint32_t x = 0; x < kWidth; ++x) {
+      row[x] = x * 1.0f / (kWidth - 1);  // [0, 1]
+    }
+    return gray;
+  }
+
+  static ImageF GenerateColor() {
+    ImageF image(kWidth * 3, 1);
+    float* JXL_RESTRICT interleaved = image.Row(0);
+    std::fill(interleaved, interleaved + kWidth * 3, 0.0f);  // zero all samples
+
+    // Pixels [0, 4): neutral (R == G == B).
+    for (int32_t x = 0; x < 4; ++x) {
+      interleaved[3 * x + 0] = x * 1.0f / 3;  // [0, 1]
+      interleaved[3 * x + 2] = interleaved[3 * x + 1] = interleaved[3 * x + 0];
+    }
+
+    // Pixels [4, 13): a single non-zero channel at low/medium/high level.
+    for (int32_t c = 0; c < 3; ++c) {
+      interleaved[3 * (4 + c) + c] = 0.08f + c * 0.01f;
+      interleaved[3 * (7 + c) + c] = 0.75f + c * 0.01f;
+      interleaved[3 * (10 + c) + c] = 1.0f;
+    }
+
+    // Pixels [13, 16): impure, not quite saturated RGB.
+    interleaved[3 * 13 + 0] = 0.86f;
+    interleaved[3 * 13 + 2] = interleaved[3 * 13 + 1] = 0.16f;
+    interleaved[3 * 14 + 1] = 0.87f;
+    interleaved[3 * 14 + 2] = interleaved[3 * 14 + 0] = 0.16f;
+    interleaved[3 * 15 + 2] = 0.88f;
+    interleaved[3 * 15 + 1] = interleaved[3 * 15 + 0] = 0.16f;
+
+    return image;
+  }
+
+ public:
+  // ImageF for VerifyRelativeError; *_color is interleaved RGB, *_gray is 1 ch.
+  ImageF in_gray;
+  ImageF in_color;
+  ImageF out_gray;
+  ImageF out_color;
+  ColorEncoding c_native;
+  ColorEncoding c_gray;
+};
+
+class ColorManagementTest
+    : public ::testing::TestWithParam<test::ColorEncodingDescriptor> {
+ public:
+  static void VerifySameFields(const ColorEncoding& c,
+                               const ColorEncoding& c2) {
+    ASSERT_EQ(c.rendering_intent, c2.rendering_intent);
+    ASSERT_EQ(c.GetColorSpace(), c2.GetColorSpace());
+    ASSERT_EQ(c.white_point, c2.white_point);
+    if (c.HasPrimaries()) {  // primaries are only compared when c has them
+      ASSERT_EQ(c.primaries, c2.primaries);
+    }
+    ASSERT_TRUE(c.tf.IsSame(c2.tf));
+  }
+
+  // Checks pixels are nearly unchanged after c_native -> c -> c_native.
+  static void VerifyPixelRoundTrip(const ColorEncoding& c) {
+    Globals* g = Globals::GetInstance();
+    const ColorEncoding& c_native = c.IsGray() ? g->c_gray : g->c_native;
+    const JxlCmsInterface& cms = GetJxlCms();
+    ColorSpaceTransform xform_fwd(cms);
+    ColorSpaceTransform xform_rev(cms);
+    const float intensity_target =
+        c.tf.IsHLG() ? 1000 : kDefaultIntensityTarget;
+    ASSERT_TRUE(
+        xform_fwd.Init(c_native, c, intensity_target, kWidth, kNumThreads));
+    ASSERT_TRUE(
+        xform_rev.Init(c, c_native, intensity_target, kWidth, kNumThreads));
+
+    const size_t thread = 0;
+    const ImageF& in = c.IsGray() ? g->in_gray : g->in_color;
+    ImageF* JXL_RESTRICT out = c.IsGray() ? &g->out_gray : &g->out_color;
+    ASSERT_TRUE(xform_fwd.Run(thread, in.Row(0), xform_fwd.BufDst(thread)));
+    ASSERT_TRUE(xform_rev.Run(thread, xform_fwd.BufDst(thread), out->Row(0)));
+
+#if JPEGXL_ENABLE_SKCMS
+    double max_l1 = 7E-4;  // looser L1 bound than the non-skcms branch below
+    double max_rel = 4E-7;
+#else
+    double max_l1 = 5E-5;
+    // Most are lower; reached 3E-7 with D60 AP0.
+    double max_rel = 4E-7;
+#endif
+    if (c.IsGray()) max_rel = 2E-5;  // gray gets a looser relative bound
+    JXL_ASSERT_OK(VerifyRelativeError(in, *out, max_l1, max_rel, _));
+  }
+};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(ColorManagementTestInstantiation,
+                                   ColorManagementTest,
+                                   ::testing::ValuesIn(test::AllEncodings()));
+
+// Exercises the ColorManagement interface for every ColorEncoding produced by
+// test::AllEncodings() (one test instance per descriptor).
+TEST_P(ColorManagementTest, VerifyAllProfiles) {
+  ColorEncoding c = ColorEncodingFromDescriptor(GetParam());
+  printf("%s\n", Description(c).c_str());
+
+  // An ICC profile can be created for this encoding.
+  ASSERT_TRUE(c.CreateICC());
+
+  // Parsing the generated ICC profile yields an equivalent ColorEncoding.
+  ColorEncoding c3;
+  ASSERT_TRUE(c3.SetICC(PaddedBytes(c.ICC())));
+  VerifySameFields(c, c3);
+
+  VerifyPixelRoundTrip(c);
+}
+
+testing::Matcher<CIExy> CIExyIs(const double x, const double y) {
+  static constexpr double kMaxError = 1e-4;  // absolute tolerance per coordinate
+  return testing::AllOf(
+      testing::Field(&CIExy::x, testing::DoubleNear(x, kMaxError)),
+      testing::Field(&CIExy::y, testing::DoubleNear(y, kMaxError)));
+}
+
+testing::Matcher<PrimariesCIExy> PrimariesAre(
+    const testing::Matcher<CIExy>& r, const testing::Matcher<CIExy>& g,
+    const testing::Matcher<CIExy>& b) {
+  return testing::AllOf(testing::Field(&PrimariesCIExy::r, r),  // all 3 must hold
+                        testing::Field(&PrimariesCIExy::g, g),
+                        testing::Field(&PrimariesCIExy::b, b));
+}
+
+TEST_F(ColorManagementTest, sRGBChromaticity) {  // xy of ColorEncoding::SRGB()
+  const ColorEncoding sRGB = ColorEncoding::SRGB();
+  EXPECT_THAT(sRGB.GetWhitePoint(), CIExyIs(0.3127, 0.3290));
+  EXPECT_THAT(sRGB.GetPrimaries(),
+              PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60),
+                           CIExyIs(0.15, 0.06)));
+}
+
+TEST_F(ColorManagementTest, D2700Chromaticity) {  // xy parsed from an ICC file
+  PaddedBytes icc =
+      jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  ColorEncoding sRGB_D2700;
+  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc)));
+
+  EXPECT_THAT(sRGB_D2700.GetWhitePoint(), CIExyIs(0.45986, 0.41060));
+  // The illuminant-relative chromaticities of this profile's primaries are the
+  // same as for sRGB. It is the PCS-relative chromaticities that would be
+  // different.
+  EXPECT_THAT(sRGB_D2700.GetPrimaries(),
+              PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60),
+                           CIExyIs(0.15, 0.06)));
+}
+
+TEST_F(ColorManagementTest, D2700ToSRGB) {  // converts one pixel to sRGB
+  PaddedBytes icc =
+      jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  ColorEncoding sRGB_D2700;
+  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc)));
+
+  ColorSpaceTransform transform(GetJxlCms());
+  ASSERT_TRUE(transform.Init(sRGB_D2700, ColorEncoding::SRGB(),
+                             kDefaultIntensityTarget, 1, 1));
+  const float sRGB_D2700_values[3] = {0.863, 0.737, 0.490};
+  float sRGB_values[3];
+  ASSERT_TRUE(transform.Run(0, sRGB_D2700_values, sRGB_values));
+  EXPECT_THAT(sRGB_values,
+              ElementsAre(FloatNear(0.914, 1e-3), FloatNear(0.745, 1e-3),
+                          FloatNear(0.601, 1e-3)));
+}
+
+TEST_F(ColorManagementTest, P3HlgTo2020Hlg) {  // same TF, different primaries
+  ColorEncoding p3_hlg;
+  p3_hlg.SetColorSpace(ColorSpace::kRGB);
+  p3_hlg.white_point = WhitePoint::kD65;
+  p3_hlg.primaries = Primaries::kP3;
+  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(p3_hlg.CreateICC());
+
+  ColorEncoding rec2020_hlg = p3_hlg;  // copy, then change only the primaries
+  rec2020_hlg.primaries = Primaries::k2100;
+  ASSERT_TRUE(rec2020_hlg.CreateICC());
+
+  ColorSpaceTransform transform(GetJxlCms());
+  ASSERT_TRUE(transform.Init(p3_hlg, rec2020_hlg, 1000, 1, 1));
+  const float p3_hlg_values[3] = {0., 0.75, 0.};
+  float rec2020_hlg_values[3];
+  ASSERT_TRUE(transform.Run(0, p3_hlg_values, rec2020_hlg_values));
+  EXPECT_THAT(rec2020_hlg_values,
+              ElementsAre(FloatNear(0.3973, 1e-4), FloatNear(0.7382, 1e-4),
+                          FloatNear(0.1183, 1e-4)));
+}
+
+TEST_F(ColorManagementTest, HlgOotf) {  // HLG <-> linear at several targets
+  ColorEncoding p3_hlg;
+  p3_hlg.SetColorSpace(ColorSpace::kRGB);
+  p3_hlg.white_point = WhitePoint::kD65;
+  p3_hlg.primaries = Primaries::kP3;
+  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(p3_hlg.CreateICC());
+
+  ColorSpaceTransform transform_to_1000(GetJxlCms());
+  ASSERT_TRUE(
+      transform_to_1000.Init(p3_hlg, ColorEncoding::LinearSRGB(), 1000, 1, 1));
+  // HDR reference white: https://www.itu.int/pub/R-REP-BT.2408-4-2021
+  float p3_hlg_values[3] = {0.75, 0.75, 0.75};
+  float linear_srgb_values[3];
+  ASSERT_TRUE(transform_to_1000.Run(0, p3_hlg_values, linear_srgb_values));
+  // On a 1000-nit display, HDR reference white should be 203 cd/m² which is
+  // 0.203 times the maximum.
+  EXPECT_THAT(linear_srgb_values,
+              ElementsAre(FloatNear(0.203, 1e-3), FloatNear(0.203, 1e-3),
+                          FloatNear(0.203, 1e-3)));
+
+  ColorSpaceTransform transform_to_400(GetJxlCms());
+  ASSERT_TRUE(
+      transform_to_400.Init(p3_hlg, ColorEncoding::LinearSRGB(), 400, 1, 1));
+  ASSERT_TRUE(transform_to_400.Run(0, p3_hlg_values, linear_srgb_values));
+  // On a 400-nit display, it should be 100 cd/m² (0.250 times the maximum).
+  EXPECT_THAT(linear_srgb_values,
+              ElementsAre(FloatNear(0.250, 1e-3), FloatNear(0.250, 1e-3),
+                          FloatNear(0.250, 1e-3)));
+
+  p3_hlg_values[2] = 0.50;  // non-neutral input: lower only the blue channel
+  ASSERT_TRUE(transform_to_1000.Run(0, p3_hlg_values, linear_srgb_values));
+  EXPECT_THAT(linear_srgb_values,
+              ElementsAre(FloatNear(0.201, 1e-3), FloatNear(0.201, 1e-3),
+                          FloatNear(0.050, 1e-3)));
+
+  ColorSpaceTransform transform_from_400(GetJxlCms());  // reverse direction
+  ASSERT_TRUE(
+      transform_from_400.Init(ColorEncoding::LinearSRGB(), p3_hlg, 400, 1, 1));
+  linear_srgb_values[0] = linear_srgb_values[1] = linear_srgb_values[2] = 0.250;
+  ASSERT_TRUE(transform_from_400.Run(0, linear_srgb_values, p3_hlg_values));
+  EXPECT_THAT(p3_hlg_values,
+              ElementsAre(FloatNear(0.75, 1e-3), FloatNear(0.75, 1e-3),
+                          FloatNear(0.75, 1e-3)));
+
+  ColorEncoding grayscale_hlg;  // same check with a single-channel encoding
+  grayscale_hlg.SetColorSpace(ColorSpace::kGray);
+  grayscale_hlg.white_point = WhitePoint::kD65;
+  grayscale_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(grayscale_hlg.CreateICC());
+
+  ColorSpaceTransform grayscale_transform(GetJxlCms());
+  ASSERT_TRUE(grayscale_transform.Init(
+      grayscale_hlg, ColorEncoding::LinearSRGB(/*is_gray=*/true), 1000, 1, 1));
+  const float grayscale_hlg_value = 0.75;
+  float linear_grayscale_value;
+  ASSERT_TRUE(grayscale_transform.Run(0, &grayscale_hlg_value,
+                                      &linear_grayscale_value));
+  EXPECT_THAT(linear_grayscale_value, FloatNear(0.203, 1e-3));
+}
+
+TEST_F(ColorManagementTest, XYBProfile) {  // linear sRGB -> XYB -> linear sRGB
+  ColorEncoding c_xyb;
+  c_xyb.SetColorSpace(ColorSpace::kXYB);
+  c_xyb.rendering_intent = RenderingIntent::kPerceptual;
+  ASSERT_TRUE(c_xyb.CreateICC());
+  ColorEncoding c_native = ColorEncoding::LinearSRGB(false);
+
+  static const size_t kGridDim = 17;  // samples per channel of the RGB grid
+  static const size_t kNumColors = kGridDim * kGridDim * kGridDim;
+  const JxlCmsInterface& cms = GetJxlCms();
+  ColorSpaceTransform xform(cms);
+  ASSERT_TRUE(
+      xform.Init(c_xyb, c_native, kDefaultIntensityTarget, kNumColors, 1));
+
+  ImageMetadata metadata;
+  metadata.color_encoding = c_native;
+  ImageBundle ib(&metadata);
+  Image3F native(kNumColors, 1);
+  float mul = 1.0f / (kGridDim - 1);
+  for (size_t idx_r = 0, x = 0; idx_r < kGridDim; ++idx_r) {
+    for (size_t idx_g = 0; idx_g < kGridDim; ++idx_g) {
+      for (size_t idx_b = 0; idx_b < kGridDim; ++idx_b, ++x) {
+        native.PlaneRow(0, 0)[x] = idx_r * mul;
+        native.PlaneRow(1, 0)[x] = idx_g * mul;
+        native.PlaneRow(2, 0)[x] = idx_b * mul;
+      }
+    }
+  }
+  ib.SetFromImage(std::move(native), c_native);
+  const Image3F& in = *ib.color();
+  Image3F opsin(kNumColors, 1);
+  ToXYB(ib, nullptr, &opsin, cms, nullptr);
+
+  Image3F opsin2 = CopyImage(opsin);  // keep the unscaled XYB for debug output
+  ScaleXYB(&opsin2);
+
+  float* src = xform.BufSrc(0);
+  for (size_t i = 0; i < kNumColors; ++i) {  // planar -> interleaved
+    for (size_t c = 0; c < 3; ++c) {
+      src[3 * i + c] = opsin2.PlaneRow(c, 0)[i];
+    }
+  }
+
+  float* dst = xform.BufDst(0);
+  ASSERT_TRUE(xform.Run(0, src, dst));
+
+  Image3F out(kNumColors, 1);
+  for (size_t i = 0; i < kNumColors; ++i) {  // interleaved -> planar
+    for (size_t c = 0; c < 3; ++c) {
+      out.PlaneRow(c, 0)[i] = dst[3 * i + c];
+    }
+  }
+
+  auto debug_print_color = [&](size_t i) {  // input, XYB, scaled XYB, output
+    printf(
+        "(%f, %f, %f) -> (%9.6f, %f, %f) -> (%f, %f, %f) -> "
+        "(%9.6f, %9.6f, %9.6f)",
+        in.PlaneRow(0, 0)[i], in.PlaneRow(1, 0)[i], in.PlaneRow(2, 0)[i],
+        opsin.PlaneRow(0, 0)[i], opsin.PlaneRow(1, 0)[i],
+        opsin.PlaneRow(2, 0)[i], opsin2.PlaneRow(0, 0)[i],
+        opsin2.PlaneRow(1, 0)[i], opsin2.PlaneRow(2, 0)[i],
+        out.PlaneRow(0, 0)[i], out.PlaneRow(1, 0)[i], out.PlaneRow(2, 0)[i]);
+  };
+
+  float max_err[3] = {};  // largest absolute error seen per channel
+  size_t max_err_i[3] = {};  // color index at which that error occurred
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      // debug_print_color(i); printf("\n");
+      float err = std::abs(in.PlaneRow(c, 0)[i] - out.PlaneRow(c, 0)[i]);
+      if (err > max_err[c]) {
+        max_err[c] = err;
+        max_err_i[c] = i;
+      }
+    }
+  }
+  static const float kMaxError[3] = {9e-4, 4e-4, 5e-4};
+  printf("Maximum errors:\n");
+  for (size_t c = 0; c < 3; ++c) {
+    debug_print_color(max_err_i[c]);
+    printf("    %f\n", max_err[c]);
+    EXPECT_LT(max_err[c], kMaxError[c]);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/common.h b/third_party/jpeg-xl/lib/jxl/common.h
new file mode 100644
index 0000000000..c2ebe029a8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/common.h
@@ -0,0 +1,245 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COMMON_H_
+#define LIB_JXL_COMMON_H_
+
+// Shared constants and helper functions.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <limits>  // numeric_limits
+#include <memory>  // unique_ptr
+#include <string>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+
+#ifndef JXL_HIGH_PRECISION
+#define JXL_HIGH_PRECISION 1
+#endif
+
+// Macro that defines whether support for decoding JXL files to JPEG is enabled.
+#ifndef JPEGXL_ENABLE_TRANSCODE_JPEG
+#define JPEGXL_ENABLE_TRANSCODE_JPEG 1
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+// Macro that defines whether support for decoding boxes is enabled.
+#ifndef JPEGXL_ENABLE_BOXES
+#define JPEGXL_ENABLE_BOXES 1
+#endif  // JPEGXL_ENABLE_BOXES
+
+namespace jxl {
+// Some enums and typedefs used by more than one header file.
+
+constexpr size_t kBitsPerByte = 8;  // more clear than CHAR_BIT
+
+constexpr inline size_t RoundUpBitsToByteMultiple(size_t bits) {
+  return ((bits + 7) >> 3) << 3;  // next multiple of 8 bits at or above `bits`
+}
+
+constexpr inline size_t RoundUpToBlockDim(size_t dim) {
+  return (dim + 7) / 8 * 8;  // next multiple of 8 at or above `dim`
+}
+
+static inline bool JXL_MAYBE_UNUSED SafeAdd(const uint64_t a, const uint64_t b,
+                                            uint64_t& sum) {
+  sum = b + a;  // unsigned addition wraps modulo 2^64 on overflow
+  return !(sum < a);  // a wrapped sum is below both operands, so one test does
+}
+
+template <typename T1, typename T2>  // Division rounding up, returned as T1.
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;  // assumes b > 0 and that a + b - 1 does not overflow
+}
+
+// Rounds `what` up to a multiple of `align`, which need not be a power of two.
+constexpr inline size_t RoundUpTo(size_t what, size_t align) {
+  return ((what + align - 1) / align) * align;
+}
+
+constexpr double kPi = 3.14159265358979323846264338327950288;
+
+// Reasonable default for sRGB, matches common monitors. We map white to this
+// many nits (cd/m^2) by default. Butteraugli was tuned for 250 nits, which is
+// very close.
+static constexpr float kDefaultIntensityTarget = 255;
+
+template <typename T>  // Returns multiplier * pi, converted to T.
+constexpr T Pi(T multiplier) {
+  return static_cast<T>(multiplier * kPi);  // product is formed in double
+}
+
+// Block is the square grid of pixels to which an "energy compaction"
+// transformation (e.g. DCT) is applied. Each block has its own AC quantizer.
+constexpr size_t kBlockDim = 8;
+
+constexpr size_t kDCTBlockSize = kBlockDim * kBlockDim;
+
+constexpr size_t kGroupDim = 256;
+static_assert(kGroupDim % kBlockDim == 0,
+              "Group dim should be divisible by block dim");
+constexpr size_t kGroupDimInBlocks = kGroupDim / kBlockDim;
+
+// Maximum number of passes in an image.
+constexpr size_t kMaxNumPasses = 11;
+
+// Maximum number of reference frames.
+constexpr size_t kMaxNumReferenceFrames = 4;
+
+// Dimensions of a frame, in pixels, and other derived dimensions.
+// Computed from FrameHeader.
+// TODO(veluca): add extra channels.
+struct FrameDimensions {
+  void Set(size_t xsize, size_t ysize, size_t group_size_shift,
+           size_t max_hshift, size_t max_vshift, bool modular_mode,
+           size_t upsampling) {
+    group_dim = (kGroupDim >> 1) << group_size_shift;  // 128 << shift
+    dc_group_dim = group_dim * kBlockDim;
+    xsize_upsampled = xsize;  // the arguments are the original (full) size
+    ysize_upsampled = ysize;
+    this->xsize = DivCeil(xsize, upsampling);
+    this->ysize = DivCeil(ysize, upsampling);
+    xsize_blocks = DivCeil(this->xsize, kBlockDim << max_hshift) << max_hshift;
+    ysize_blocks = DivCeil(this->ysize, kBlockDim << max_vshift) << max_vshift;
+    xsize_padded = xsize_blocks * kBlockDim;
+    ysize_padded = ysize_blocks * kBlockDim;
+    if (modular_mode) {
+      // Modular mode doesn't have any padding.
+      xsize_padded = this->xsize;
+      ysize_padded = this->ysize;
+    }
+    xsize_upsampled_padded = xsize_padded * upsampling;
+    ysize_upsampled_padded = ysize_padded * upsampling;
+    xsize_groups = DivCeil(this->xsize, group_dim);
+    ysize_groups = DivCeil(this->ysize, group_dim);
+    xsize_dc_groups = DivCeil(xsize_blocks, group_dim);  // counted in blocks
+    ysize_dc_groups = DivCeil(ysize_blocks, group_dim);
+    num_groups = xsize_groups * ysize_groups;
+    num_dc_groups = xsize_dc_groups * ysize_dc_groups;
+  }
+
+  // Image size without any upsampling, i.e. original_size / upsampling.
+  size_t xsize;
+  size_t ysize;
+  // Original image size.
+  size_t xsize_upsampled;
+  size_t ysize_upsampled;
+  // Image size after upsampling the padded image.
+  size_t xsize_upsampled_padded;
+  size_t ysize_upsampled_padded;
+  // Image size after padding to a multiple of kBlockDim (if VarDCT mode).
+  size_t xsize_padded;
+  size_t ysize_padded;
+  // Image size in kBlockDim blocks.
+  size_t xsize_blocks;
+  size_t ysize_blocks;
+  // Image size in number of groups.
+  size_t xsize_groups;
+  size_t ysize_groups;
+  // Image size in number of DC groups.
+  size_t xsize_dc_groups;
+  size_t ysize_dc_groups;
+  // Number of AC or DC groups.
+  size_t num_groups;
+  size_t num_dc_groups;
+  // Size of a group (pixels) and of a DC group (group_dim * kBlockDim).
+  size_t group_dim;
+  size_t dc_group_dim;
+};
+
+// C++11 has no std::make_unique, so define one here; C++14+ reuses std's.
+#if __cplusplus < 201402L
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#else
+using std::make_unique;
+#endif
+
+template <typename T>
+JXL_INLINE T Clamp1(T val, T low, T hi) {
+  return !(val < low) ? (val > hi ? hi : val) : low;  // low wins if val < low
+}
+
+// Maps X >= 0 to 2 * X and -X (X > 0) to 2 * X - 1, e.g. 3 -> 6 and -1 -> 1.
+constexpr uint32_t PackSigned(int32_t value)
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  return (static_cast<uint32_t>(value) << 1) ^
+         ((static_cast<uint32_t>(~value) >> 31) - 1);  // 0 or 0xFFFFFFFF mask
+}
+
+// Reverse to PackSigned, i.e. UnpackSigned(PackSigned(X)) == X.
+// (((~value) & 1) - 1) is either 0 or 0xFF...FF and it will have an expected
+// unsigned-integer-overflow.
+constexpr intptr_t UnpackSigned(size_t value)
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  return static_cast<intptr_t>((value >> 1) ^ (((~value) & 1) - 1));  // odd: <0
+}
+
+// Converts an arithmetic value (integer or floating point) to a string.
+template <typename T>
+std::string ToString(T n) {
+  char buf[32] = {};
+  const bool is_floating = T(0.1) != T(0);  // integer types truncate 0.1 to 0
+  const bool is_unsigned = T(-1) > T(0);    // -1 wraps around when unsigned
+  // Pick the printf conversion that matches the category of T.
+  if (is_floating) {
+    snprintf(buf, sizeof(buf), "%g", static_cast<double>(n));
+  } else if (is_unsigned) {
+    snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(n));
+  } else {
+    snprintf(buf, sizeof(buf), "%lld", static_cast<long long>(n));
+  }
+  return buf;
+}
+
+static inline JXL_MAYBE_UNUSED uint64_t DecodeVarInt(const uint8_t* input,
+                                                     size_t inputSize,
+                                                     size_t* pos) {
+  size_t i;
+  uint64_t ret = 0;
+  for (i = 0; *pos + i < inputSize && i < 10; ++i) {  // reads at most 10 bytes
+    ret |= uint64_t(input[*pos + i] & 127) << uint64_t(7 * i);  // low bits first
+    // If the next-byte flag is not set, stop
+    if ((input[*pos + i] & 128) == 0) break;
+  }
+  // TODO: Return a decoding error if i == 10.
+  *pos += i + 1;  // NOTE: also adds 1 when the loop ended without a break
+  return ret;
+}
+
+static inline JXL_MAYBE_UNUSED bool EncodeVarInt(uint64_t value,
+                                                 size_t output_size,
+                                                 size_t* output_pos,
+                                                 uint8_t* output) {
+  // Emit 7 bits per byte, least significant first; bit 7 flags continuation.
+  // Returns false (output possibly partially written) once the buffer is full.
+  while (value > 127) {
+    if (*output_pos >= output_size) return false;  // index must be < size
+    // |128: Set the next byte flag
+    output[(*output_pos)++] = static_cast<uint8_t>(value & 127) | 128;
+    // Remove the seven bits we just wrote
+    value >>= 7;
+  }
+  if (*output_pos >= output_size) return false;
+  output[(*output_pos)++] = static_cast<uint8_t>(value) & 127;
+  return true;
+}
+
+static inline JXL_MAYBE_UNUSED void EncodeVarInt(uint64_t value,
+                                                 PaddedBytes* data) {
+  size_t pos = data->size();
+  data->resize(data->size() + 10);  // ceil(64 / 7) bytes for a full uint64_t
+  JXL_CHECK(EncodeVarInt(value, data->size(), &pos, data->data()));
+  data->resize(pos);  // trim to the bytes actually written
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/compressed_dc.cc b/third_party/jpeg-xl/lib/jxl/compressed_dc.cc
new file mode 100644
index 0000000000..f9a8f149dd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/compressed_dc.cc
@@ -0,0 +1,318 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/compressed_dc.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc"
+#include <hwy/aligned_allocator.h>
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+using D = HWY_FULL(float);
+using DScalar = HWY_CAPPED(float, 1);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// TODO(veluca): optimize constants.
+const float w1 = 0.20345139757231578f;
+const float w2 = 0.0334829185968739f;
+const float w0 = 1.0f - 4.0f * (w1 + w2);
+
+template <class V>  // Lane-wise maximum of a and b.
+V MaxWorkaround(V a, V b) {
+#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
+  // Prevents "Do not know how to split the result of this operator" error
+  return IfThenElse(a > b, a, b);
+#else
+  return Max(a, b);  // every other target/compiler combination
+#endif
+}
+
+template <typename D>  // One channel: 3x3 weighted average and gap update.
+JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor,
+                                    const float* JXL_RESTRICT row_top,
+                                    const float* JXL_RESTRICT row,
+                                    const float* JXL_RESTRICT row_bottom,
+                                    Vec<D>* JXL_RESTRICT mc,
+                                    Vec<D>* JXL_RESTRICT sm,
+                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
+  const auto tl = LoadU(d, row_top + x - 1);  // reads x - 1: requires x >= 1
+  const auto tc = Load(d, row_top + x);
+  const auto tr = LoadU(d, row_top + x + 1);
+
+  const auto ml = LoadU(d, row + x - 1);
+  *mc = Load(d, row + x);  // out: the unsmoothed center value(s)
+  const auto mr = LoadU(d, row + x + 1);
+
+  const auto bl = LoadU(d, row_bottom + x - 1);
+  const auto bc = Load(d, row_bottom + x);
+  const auto br = LoadU(d, row_bottom + x + 1);
+
+  const auto w_center = Set(d, w0);
+  const auto w_side = Set(d, w1);
+  const auto w_corner = Set(d, w2);
+
+  const auto corner = Add(Add(tl, tr), Add(bl, br));
+  const auto side = Add(Add(ml, mr), Add(tc, bc));
+  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
+
+  const auto dc_quant = Set(d, dc_factor);
+  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));  // running max
+}
+
+template <typename D>  // Smooths all three channels at column x.
+JXL_INLINE void ComputePixel(
+    const float* JXL_RESTRICT dc_factors,
+    const float* JXL_RESTRICT* JXL_RESTRICT rows_top,
+    const float* JXL_RESTRICT* JXL_RESTRICT rows,
+    const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom,
+    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
+  const D d;
+  auto mc_x = Undefined(d);  // mc_*/sm_* are filled by ComputePixelChannel
+  auto mc_y = Undefined(d);
+  auto mc_b = Undefined(d);
+  auto sm_x = Undefined(d);
+  auto sm_y = Undefined(d);
+  auto sm_b = Undefined(d);
+  auto gap = Set(d, 0.5f);  // lower bound of the max taken over the channels
+  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
+                      &mc_x, &sm_x, &gap, x);
+  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
+                      &mc_y, &sm_y, &gap, x);
+  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
+                      &mc_b, &sm_b, &gap, x);
+  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));  // 3 - 4 * gap
+  factor = ZeroIfNegative(factor);  // i.e. no smoothing once gap > 0.75
+
+  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);  // mc + (sm - mc) * factor
+  Store(out, d, out_rows[0] + x);
+  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
+  Store(out, d, out_rows[1] + x);
+  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
+  Store(out, d, out_rows[2] + x);
+}
+
+void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
+                         ThreadPool* pool) {
+  const size_t xsize = dc->xsize();
+  const size_t ysize = dc->ysize();
+  if (ysize <= 2 || xsize <= 2) return;  // too small: *dc is left untouched
+
+  // TODO(veluca): use tile-based processing?
+  // TODO(veluca): decide if changes to the y channel should be propagated to
+  // the x and b channels through color correlation.
+  JXL_ASSERT(w1 + w2 < 0.25f);
+
+  PROFILER_FUNC;
+
+  Image3F smoothed(xsize, ysize);
+  // Copy the first and last rows as-is; process_row only sees rows in between.
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y : {size_t(0), ysize - 1}) {
+      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
+             xsize * sizeof(float));
+    }
+  }
+  auto process_row = [&](const uint32_t y, size_t /*thread*/) {
+    const float* JXL_RESTRICT rows_top[3]{
+        dc->ConstPlaneRow(0, y - 1),
+        dc->ConstPlaneRow(1, y - 1),
+        dc->ConstPlaneRow(2, y - 1),
+    };
+    const float* JXL_RESTRICT rows[3] = {
+        dc->ConstPlaneRow(0, y),
+        dc->ConstPlaneRow(1, y),
+        dc->ConstPlaneRow(2, y),
+    };
+    const float* JXL_RESTRICT rows_bottom[3] = {
+        dc->ConstPlaneRow(0, y + 1),
+        dc->ConstPlaneRow(1, y + 1),
+        dc->ConstPlaneRow(2, y + 1),
+    };
+    float* JXL_RESTRICT rows_out[3] = {
+        smoothed.PlaneRow(0, y),
+        smoothed.PlaneRow(1, y),
+        smoothed.PlaneRow(2, y),
+    };
+    for (size_t x : {size_t(0), xsize - 1}) {  // first/last column: plain copy
+      for (size_t c = 0; c < 3; c++) {
+        rows_out[c][x] = rows[c][x];
+      }
+    }
+
+    size_t x = 1;
+    // First pixels, one at a time, until x reaches N (or the last column).
+    const size_t N = Lanes(D());
+    for (; x < std::min(N, xsize - 1); x++) {
+      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
+                            x);
+    }
+    // Full vectors.
+    for (; x + N <= xsize - 1; x += N) {
+      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
+    }
+    // Last pixels.
+    for (; x < xsize - 1; x++) {
+      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
+                            x);
+    }
+  };
+  JXL_CHECK(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit, process_row,
+                      "DCSmoothingRow"));
+  dc->Swap(smoothed);  // *dc now holds the smoothed image
+}
+
+// DC dequantization: fills rect r of dc, and of quant_dc, from `in`.
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
+               const float* dc_factors, float mul, const float* cfl_factors,
+               YCbCrChromaSubsampling chroma_subsampling,
+               const BlockCtxMap& bctx) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+  if (chroma_subsampling.Is444()) {
+    const auto fac_x = Set(df, dc_factors[0] * mul);
+    const auto fac_y = Set(df, dc_factors[1] * mul);
+    const auto fac_b = Set(df, dc_factors[2] * mul);
+    const auto cfl_fac_x = Set(df, cfl_factors[0]);
+    const auto cfl_fac_b = Set(df, cfl_factors[2]);
+    for (size_t y = 0; y < r.ysize(); y++) {
+      float* dec_row_x = r.PlaneRow(dc, 0, y);
+      float* dec_row_y = r.PlaneRow(dc, 1, y);
+      float* dec_row_b = r.PlaneRow(dc, 2, y);
+      const int32_t* quant_row_x = in.channel[1].plane.Row(y);  // ch 1 holds X
+      const int32_t* quant_row_y = in.channel[0].plane.Row(y);  // ch 0 holds Y
+      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
+      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
+        const auto in_q_x = Load(di, quant_row_x + x);
+        const auto in_q_y = Load(di, quant_row_y + x);
+        const auto in_q_b = Load(di, quant_row_b + x);
+        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
+        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
+        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
+        Store(in_y, df, dec_row_y + x);
+        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);  // x + cfl * y
+        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);  // b + cfl * y
+      }
+    }
+  } else {
+    for (size_t c : {1, 0, 2}) {  // subsampled: each plane scaled on its own
+      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
+                r.y0() >> chroma_subsampling.VShift(c),
+                r.xsize() >> chroma_subsampling.HShift(c),
+                r.ysize() >> chroma_subsampling.VShift(c));
+      const auto fac = Set(df, dc_factors[c] * mul);
+      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];  // swaps 0 <-> 1
+      for (size_t y = 0; y < rect.ysize(); y++) {
+        const int32_t* quant_row = ch.plane.Row(y);
+        float* row = rect.PlaneRow(dc, c, y);
+        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
+          const auto in_q = Load(di, quant_row + x);
+          const auto in = Mul(ConvertTo(df, in_q), fac);
+          Store(in, df, row + x);
+        }
+      }
+    }
+  }
+  if (bctx.num_dc_ctxs <= 1) {  // a single DC context: every bucket is 0
+    for (size_t y = 0; y < r.ysize(); y++) {
+      uint8_t* qdc_row = r.Row(quant_dc, y);
+      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
+    }
+  } else {
+    for (size_t y = 0; y < r.ysize(); y++) {
+      uint8_t* qdc_row_val = r.Row(quant_dc, y);
+      const int32_t* quant_row_x =
+          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
+      const int32_t* quant_row_y =
+          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
+      const int32_t* quant_row_b =
+          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
+      for (size_t x = 0; x < r.xsize(); x++) {
+        int bucket_x = 0, bucket_y = 0, bucket_b = 0;  // thresholds exceeded
+        for (int t : bctx.dc_thresholds[0]) {
+          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
+        }
+        for (int t : bctx.dc_thresholds[1]) {
+          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
+        }
+        for (int t : bctx.dc_thresholds[2]) {
+          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
+        }
+        int bucket = bucket_x;  // mixed radix: (x * (nb + 1) + b) * (ny + 1) + y
+        bucket *= bctx.dc_thresholds[2].size() + 1;
+        bucket += bucket_b;
+        bucket *= bctx.dc_thresholds[1].size() + 1;
+        bucket += bucket_y;
+        qdc_row_val[x] = bucket;
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(DequantDC);
+HWY_EXPORT(AdaptiveDCSmoothing);
+void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
+                         ThreadPool* pool) {
+  return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(dc_factors, dc, pool);  // per-target impl
+}
+
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
+               const float* dc_factors, float mul, const float* cfl_factors,
+               YCbCrChromaSubsampling chroma_subsampling,
+               const BlockCtxMap& bctx) {
+  return HWY_DYNAMIC_DISPATCH(DequantDC)(r, dc, quant_dc, in, dc_factors, mul,  // per-target impl
+                                         cfl_factors, chroma_subsampling, bctx);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/compressed_dc.h b/third_party/jpeg-xl/lib/jxl/compressed_dc.h
new file mode 100644
index 0000000000..b06e5931f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/compressed_dc.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COMPRESSED_DC_H_
+#define LIB_JXL_COMPRESSED_DC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/modular_image.h"
+
+// DC handling functions: encoding and decoding of DC to and from bitstream, and
+// related function to initialize the per-group decoder cache.
+
+namespace jxl {
+
+// Smooth DC in already-smooth areas, to counteract banding.
+void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
+                         ThreadPool* pool);
+
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
+               const float* dc_factors, float mul, const float* cfl_factors,
+               YCbCrChromaSubsampling chroma_subsampling,
+               const BlockCtxMap& bctx);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COMPRESSED_DC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/convolve-inl.h b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
new file mode 100644
index 0000000000..054c9c6f0d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
@@ -0,0 +1,297 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_CONVOLVE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_CONVOLVE_INL_H_
+#undef LIB_JXL_CONVOLVE_INL_H_
+#else
+#define LIB_JXL_CONVOLVE_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image_ops.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Broadcast;
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
+#endif
+using hwy::HWY_NAMESPACE::TableLookupLanes;
+using hwy::HWY_NAMESPACE::Vec;
+
+// Synthesizes left/right neighbors from a vector of center pixels.
+class Neighbors {
+ public:
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+  // Returns l[i] == c[Mirror(i - 1)], i.e. lanes {c0, c0, c1, c2, ...}.
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL1(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {0, 0, 1, 2,  3,  4,  5,  6,
+                                             7, 8, 9, 10, 11, 12, 13, 14};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // ONML'KJII
+#elif HWY_TARGET == HWY_SCALAR
+    return c;  // Same (the first mirrored value is the last valid one)
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))};  // KJII
+#else
+    const D d;
+    // TODO(deymo): Figure out if this can be optimized using a single vsri
+    // instruction to convert LKJI to KJII.
+    HWY_ALIGN constexpr int32_t lanes[4] = {0, 0, 1, 2};  // KJII
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+
+  // Returns l[i] == c[Mirror(i - 2)], i.e. lanes {c1, c0, c0, c1, c2, ...}.
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL2(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {1, 0, 0, 1, 2,  3,  4,  5,
+                                             6, 7, 8, 9, 10, 11, 12, 13};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // NMLK'JIIJ
+#elif HWY_TARGET == HWY_SCALAR
+    const D d;
+    JXL_ASSERT(false);  // unsupported, avoid calling this.
+    return Zero(d);
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))};  // JIIJ
+#else
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[4] = {1, 0, 0, 1};  // JIIJ
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+
+  // Returns l[i] == c[Mirror(i - 3)], i.e. lanes {c2, c1, c0, c0, c1, ...}.
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL3(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {2, 1, 0, 0, 1, 2,  3,  4,
+                                             5, 6, 7, 8, 9, 10, 11, 12};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // MLKJ'IIJK
+#elif HWY_TARGET == HWY_SCALAR
+    const D d;
+    JXL_ASSERT(false);  // unsupported, avoid calling this.
+    return Zero(d);
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))};  // IIJK
+#else
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[4] = {2, 1, 0, 0};  // IIJK
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+};
+
+#if HWY_TARGET != HWY_SCALAR
+
+// Returns indices for SetTableIndices such that TableLookupLanes on the
+// rightmost unaligned vector (rightmost sample in its most-significant lane)
+// returns the mirrored values, with the mirror outside the last valid sample.
+static inline const int32_t* MirrorLanes(const size_t mod) {
+  const HWY_CAPPED(float, 16) d;
+  constexpr size_t kN = MaxLanes(d);
+
+  // For mod = `image width mod 16` 0..15:
+  // last full vec     mirrored (mem order)  loadedVec  mirrorVec  idxVec
+  // 0123456789abcdef| fedcba9876543210      fed..210   012..def   012..def
+  // 0123456789abcdef|0 0fedcba98765432      0fe..321   234..f00   123..eff
+  // 0123456789abcdef|01 10fedcba987654      10f..432   456..110   234..ffe
+  // 0123456789abcdef|012 210fedcba9876      210..543   67..2210   34..ffed
+  // 0123456789abcdef|0123 3210fedcba98      321..654   8..33210   4..ffedc
+  // 0123456789abcdef|01234 43210fedcba
+  // 0123456789abcdef|012345 543210fedc
+  // 0123456789abcdef|0123456 6543210fe
+  // 0123456789abcdef|01234567 76543210
+  // 0123456789abcdef|012345678 8765432
+  // 0123456789abcdef|0123456789 987654
+  // 0123456789abcdef|0123456789A A9876
+  // 0123456789abcdef|0123456789AB BA98
+  // 0123456789abcdef|0123456789ABC CBA
+  // 0123456789abcdef|0123456789ABCD DC
+  // 0123456789abcdef|0123456789ABCDE E      EDC..10f   EED..210   ffe..321
+#if HWY_CAP_GE512
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+      1,  2,  3,  4,  5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,  //
+      14, 13, 12, 11, 10, 9, 8, 7, 6, 5,  4,  3,  2,  1,  0};
+#elif HWY_CAP_GE256
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+      1, 2, 3, 4, 5, 6, 7, 7,  //
+      6, 5, 4, 3, 2, 1, 0};
+#else  // 128-bit
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3,  //
+                                                              2, 1, 0};
+#endif
+  return idx_lanes + kN - 1 - mod;
+}
+
+#endif  // HWY_TARGET != HWY_SCALAR
+
+// Single entry point for convolution.
+// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it.
+template <class Strategy>
+class ConvolveT {
+  static constexpr int64_t kRadius = Strategy::kRadius;
+  using Simd = HWY_CAPPED(float, 16);
+
+ public:
+  static size_t MinWidth() {
+#if HWY_TARGET == HWY_SCALAR
+    // First/Last use mirrored loads of up to +/- kRadius.
+    return 2 * kRadius;
+#else
+    return Lanes(Simd()) + kRadius;
+#endif
+  }
+
+  // "Image" is ImageF or Image3F; `out` must have the same size as `rect`.
+  template <class Image, class Weights>
+  static void Run(const Image& in, const Rect& rect, const Weights& weights,
+                  ThreadPool* pool, Image* out) {
+    PROFILER_ZONE("ConvolveT::Run");
+    JXL_CHECK(SameSize(rect, *out));
+    JXL_CHECK(rect.xsize() >= MinWidth());
+
+    static_assert(int64_t(kRadius) <= 3,
+                  "Must handle [0, kRadius) and >= kRadius");
+    switch (rect.xsize() % Lanes(Simd())) {
+      case 0:
+        return RunRows<0>(in, rect, weights, pool, out);
+      case 1:
+        return RunRows<1>(in, rect, weights, pool, out);
+      case 2:
+        return RunRows<2>(in, rect, weights, pool, out);
+      default:
+        return RunRows<3>(in, rect, weights, pool, out);
+    }
+  }
+
+ private:
+  template <size_t kSizeModN, class WrapRow, class Weights>
+  static JXL_INLINE void RunRow(const float* JXL_RESTRICT in,
+                                const size_t xsize, const int64_t stride,
+                                const WrapRow& wrap_row, const Weights& weights,
+                                float* JXL_RESTRICT out) {
+    Strategy::template ConvolveRow<kSizeModN>(in, xsize, stride, wrap_row,
+                                              weights, out);
+  }
+
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect,
+                                       const int64_t ybegin, const int64_t yend,
+                                       const Weights& weights, ImageF* out) {
+    const int64_t stride = in.PixelsPerRow();
+    const WrapRowMirror wrap_row(in, rect.ysize());
+    for (int64_t y = ybegin; y < yend; ++y) {
+      RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row,
+                        weights, out->Row(y));
+    }
+  }
+
+  // Image3F overload: handles all three planes of each border row.
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect,
+                                       const int64_t ybegin, const int64_t yend,
+                                       const Weights& weights, Image3F* out) {
+    const int64_t stride = in.PixelsPerRow();
+    for (int64_t y = ybegin; y < yend; ++y) {
+      for (size_t c = 0; c < 3; ++c) {
+        const WrapRowMirror wrap_row(in.Plane(c), rect.ysize());
+        RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride,
+                          wrap_row, weights, out->PlaneRow(c, y));
+      }
+    }
+  }
+
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect,
+                                         const int64_t ybegin,
+                                         const int64_t yend,
+                                         const Weights& weights,
+                                         ThreadPool* pool, ImageF* out) {
+    const int64_t stride = in.PixelsPerRow();
+    JXL_CHECK(RunOnPool(
+        pool, ybegin, yend, ThreadPool::NoInit,
+        [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+          RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride,
+                            WrapRowUnchanged(), weights, out->Row(y));
+        },
+        "Convolve"));
+  }
+
+  // Image3F overload: handles all three planes of each interior row.
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect,
+                                         const int64_t ybegin,
+                                         const int64_t yend,
+                                         const Weights& weights,
+                                         ThreadPool* pool, Image3F* out) {
+    const int64_t stride = in.PixelsPerRow();
+    JXL_CHECK(RunOnPool(
+        pool, ybegin, yend, ThreadPool::NoInit,
+        [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+          for (size_t c = 0; c < 3; ++c) {
+            RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(),
+                              stride, WrapRowUnchanged(), weights,
+                              out->PlaneRow(c, y));
+          }
+        },
+        "Convolve3"));
+  }
+
+  // Rows within kRadius of the top/bottom edge go through RunBorderRows, the
+  // rest through RunInteriorRows on `pool`. Each row is convolved only once.
+  template <size_t kSizeModN, class Image, class Weights>
+  static JXL_INLINE void RunRows(const Image& in, const Rect& rect,
+                                 const Weights& weights, ThreadPool* pool,
+                                 Image* out) {
+    const int64_t ysize = rect.ysize();
+    const int64_t top_end = std::min(int64_t(kRadius), ysize);
+    const int64_t bottom_begin = std::max(top_end, ysize - int64_t(kRadius));
+    RunBorderRows<kSizeModN>(in, rect, 0, top_end, weights, out);
+    if (bottom_begin > top_end) {
+      RunInteriorRows<kSizeModN>(in, rect, top_end, bottom_begin, weights,
+                                 pool, out);
+    }
+    RunBorderRows<kSizeModN>(in, rect, bottom_begin, ysize, weights, out);
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_CONVOLVE_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/convolve.h b/third_party/jpeg-xl/lib/jxl/convolve.h
new file mode 100644
index 0000000000..2fcd2d0980
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve.h
@@ -0,0 +1,105 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CONVOLVE_H_
+#define LIB_JXL_CONVOLVE_H_
+
+// 2D convolution.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// No valid values outside [0, xsize), but the strategy may still safely load
+// the preceding vector, and/or round xsize up to the vector lane count. This
+// avoids needing PadImage.
+// Requires xsize >= kConvolveLanes + kConvolveMaxRadius.
+static constexpr size_t kConvolveMaxRadius = 3;
+
+// Weights must already be normalized.
+
+struct WeightsSymmetric3 {
+  // d r d (each replicated 4x)
+  // r c r
+  // d r d
+  float c[4];
+  float r[4];
+  float d[4];
+};
+
+struct WeightsSymmetric5 {
+  // The lower-right quadrant is: c r R  (each replicated 4x)
+  //                              r d L
+  //                              R L D
+  float c[4];
+  float r[4];
+  float R[4];
+  float d[4];
+  float D[4];
+  float L[4];
+};
+
+// Weights for separable 5x5 filters (typically but not necessarily the same
+// values for horizontal and vertical directions). The kernel must already be
+// normalized, but note that values for negative offsets are omitted, so the
+// given values do not sum to 1.
+struct WeightsSeparable5 {
+  // Horizontal 1D, distances 0..2 (each replicated 4x)
+  float horz[3 * 4];
+  float vert[3 * 4];
+};
+
+// Weights for separable 7x7 filters (typically but not necessarily the same
+// values for horizontal and vertical directions). The kernel must already be
+// normalized, but note that values for negative offsets are omitted, so the
+// given values do not sum to 1.
+//
+// NOTE: for >= 7x7 Gaussian kernels, it is faster to use FastGaussian instead,
+// at least when images exceed the L1 cache size.
+struct WeightsSeparable7 {
+  // Horizontal 1D, distances 0..3 (each replicated 4x)
+  float horz[4 * 4];
+  float vert[4 * 4];
+};
+
+const WeightsSymmetric3& WeightsSymmetric3Lowpass();
+const WeightsSeparable5& WeightsSeparable5Lowpass();
+const WeightsSymmetric5& WeightsSymmetric5Lowpass();
+
+void SlowSymmetric3(const ImageF& in, const Rect& rect,
+                    const WeightsSymmetric3& weights, ThreadPool* pool,
+                    ImageF* JXL_RESTRICT out);
+
+void SlowSeparable5(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable5& weights, ThreadPool* pool,
+                    ImageF* out);
+
+void SlowSeparable7(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable7& weights, ThreadPool* pool,
+                    ImageF* out);
+
+void Symmetric3(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric3& weights, ThreadPool* pool,
+                ImageF* out);
+
+void Symmetric5(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out);
+
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out);
+
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CONVOLVE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
new file mode 100644
index 0000000000..b26ff54bbc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+// 5x5 convolution by separable kernel with a single scan through the input.
+// This is more cache-efficient than separate horizontal/vertical passes, and
+// possibly faster (given enough registers) than tiling and/or transposing.
+//
+// Overview: imagine a 5x5 window around a central pixel. First convolve the
+// rows by multiplying the pixels with the corresponding weights from
+// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these
+// intermediate results by the corresponding vertical weight, i.e.
+// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the
+// convolution result at the position of the central pixel in the output.
+//
+// Each of these operations uses SIMD vectors. The central pixel and most
+// importantly the output are aligned, so neighboring pixels (e.g. x_offset=1)
+// require unaligned loads. Because weights are supplied in identical groups of
+// 4, we can use LoadDup128 to load them (slightly faster).
+//
+// Uses mirrored boundary handling. Until x >= kRadius, the horizontal
+// convolution uses Neighbors class to shuffle vectors as if each of its lanes
+// had been loaded from the mirrored offset. Similarly, the last full vector to
+// write uses mirroring. In the case of scalar vectors, Neighbors is not usable
+// and the value is loaded directly. Otherwise, the number of valid pixels
+// modulo the vector size enables a small optimization: for smaller offsets,
+// a non-mirrored load is sufficient.
+class Separable5Strategy {
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+ public:
+  static constexpr int64_t kRadius = 2;
+
+  template <size_t kSizeModN, class WrapRow>
+  static JXL_MAYBE_INLINE void ConvolveRow(
+      const float* const JXL_RESTRICT row_m, const size_t xsize,
+      const int64_t stride, const WrapRow& wrap_row,
+      const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) {
+    const D d;
+    const int64_t neg_stride = -stride;  // allows LEA addressing.
+    const float* const JXL_RESTRICT row_t2 =
+        wrap_row(row_m + 2 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_t1 =
+        wrap_row(row_m + 1 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_b1 =
+        wrap_row(row_m + 1 * stride, stride);
+    const float* const JXL_RESTRICT row_b2 =
+        wrap_row(row_m + 2 * stride, stride);
+
+    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
+    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
+    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
+    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
+    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
+    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
+
+    size_t x = 0;
+
+    // More than one iteration for scalars.
+    for (; x < kRadius; x += Lanes(d)) {
+      const V conv0 =
+          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0);
+
+      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2);
+      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2);
+      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+      Store(conv2, d, row_out + x);
+    }
+
+    // Main loop: load inputs without padding
+    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
+      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0);
+
+      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2);
+      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2);
+      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+      Store(conv2, d, row_out + x);
+    }
+
+    // Last full vector to write (the above loop handled mod >= kRadius)
+#if HWY_TARGET == HWY_SCALAR
+    while (x < xsize) {
+#else
+    if (kSizeModN < kRadius) {
+#endif
+      const V conv0 =
+          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0);
+
+      const V conv1t =
+          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2);
+      const V conv1b =
+          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t =
+          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2);
+      const V conv2b =
+          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+      Store(conv2, d, row_out + x);
+      x += Lanes(d);
+    }
+
+    // If mod = 0, the above vector was the last.
+    if (kSizeModN != 0) {
+      for (; x < xsize; ++x) {
+        float mul = 0.0f;
+        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
+          const float wy = weights.vert[std::abs(dy) * 4];
+          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
+          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
+            const float wx = weights.horz[std::abs(dx) * 4];
+            const int64_t clamped_x = Mirror(x + dx, xsize);
+            mul += clamped_row[clamped_x] * wx * wy;
+          }
+        }
+        row_out[x] = mul;
+      }
+    }
+  }
+
+ private:
+  // Same as HorzConvolve for the first/last vector in a row.
+  static JXL_MAYBE_INLINE V HorzConvolveFirst(
+      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
+      const V wh0, const V wh1, const V wh2) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+#if HWY_TARGET == HWY_SCALAR
+    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
+    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
+#else
+    (void)xsize;
+    const V l1 = Neighbors::FirstL1(c);
+    const V l2 = Neighbors::FirstL2(c);
+#endif
+
+    const V r1 = LoadU(d, row + x + 1);
+    const V r2 = LoadU(d, row + x + 2);
+
+    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
+    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
+    return mul2;
+  }
+
+  template <size_t kSizeModN>
+  static JXL_MAYBE_INLINE V
+  HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x,
+                   const int64_t xsize, const V wh0, const V wh1, const V wh2) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+    const V l1 = LoadU(d, row + x - 1);
+    const V l2 = LoadU(d, row + x - 2);
+
+    V r1, r2;
+#if HWY_TARGET == HWY_SCALAR
+    r1 = LoadU(d, row + Mirror(x + 1, xsize));
+    r2 = LoadU(d, row + Mirror(x + 2, xsize));
+#else
+    const size_t N = Lanes(d);
+    if (kSizeModN == 0) {
+      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
+      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
+    } else {  // == 1
+      const auto last = LoadU(d, row + xsize - N);
+      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+      r1 = last;
+    }
+#endif
+
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    return mul2;
+  }
+
+  // Requires kRadius valid pixels before/after pos.
+  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
+                                         const V wh0, const V wh1,
+                                         const V wh2) {
+    const D d;
+    const V c = LoadU(d, pos);
+    const V mul0 = Mul(c, wh0);
+
+    // Loading anew is faster than combining vectors.
+    const V l1 = LoadU(d, pos - 1);
+    const V r1 = LoadU(d, pos + 1);
+    const V l2 = LoadU(d, pos - 2);
+    const V r2 = LoadU(d, pos + 2);
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    return mul2;
+  }
+};
+
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Conv = ConvolveT<Separable5Strategy>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable5(in, rect, weights, pool, out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Separable5);
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out) {
+  return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
new file mode 100644
index 0000000000..086dfd22b5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
@@ -0,0 +1,285 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable7.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+// 7x7 convolution by separable kernel with a single scan through the input.
+// Extended version of Separable5, see documentation there.
+class Separable7Strategy {
+  using D = HWY_CAPPED(float, 16);  // float vectors capped at 16 lanes
+  using V = Vec<D>;
+
+ public:
+  static constexpr int64_t kRadius = 3;  // taps reach 3 pixels in each direction
+
+  template <size_t kSizeModN, class WrapRow>  // presumably xsize % Lanes - confirm
+  static JXL_MAYBE_INLINE void ConvolveRow(
+      const float* const JXL_RESTRICT row_m, const size_t xsize,
+      const int64_t stride, const WrapRow& wrap_row,
+      const WeightsSeparable7& weights, float* const JXL_RESTRICT row_out) {
+    const D d;
+    const int64_t neg_stride = -stride;  // allows LEA addressing.
+    const float* const JXL_RESTRICT row_t3 =
+        wrap_row(row_m + 3 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_t2 =
+        wrap_row(row_m + 2 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_t1 =
+        wrap_row(row_m + 1 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_b1 =
+        wrap_row(row_m + 1 * stride, stride);
+    const float* const JXL_RESTRICT row_b2 =
+        wrap_row(row_m + 2 * stride, stride);
+    const float* const JXL_RESTRICT row_b3 =
+        wrap_row(row_m + 3 * stride, stride);
+
+    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);  // tap i is at [4 * i]
+    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
+    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
+    const V wh3 = LoadDup128(d, weights.horz + 3 * 4);
+    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
+    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
+    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
+    const V wv3 = LoadDup128(d, weights.vert + 3 * 4);
+
+    size_t x = 0;
+
+    // Left border (mirrored neighbors). More than one iteration for scalars.
+    for (; x < kRadius; x += Lanes(d)) {
+      const V conv0 =
+          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2, wh3), wv0);
+
+      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+
+      const V conv3t = HorzConvolveFirst(row_t3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3b = HorzConvolveFirst(row_b3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
+
+      Store(conv3, d, row_out + x);
+    }
+
+    // Main loop: load inputs without padding
+    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
+      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2, wh3), wv0);
+
+      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2, wh3);
+      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2, wh3);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2, wh3);
+      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2, wh3);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+
+      const V conv3t = HorzConvolve(row_t3 + x, wh0, wh1, wh2, wh3);
+      const V conv3b = HorzConvolve(row_b3 + x, wh0, wh1, wh2, wh3);
+      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
+
+      Store(conv3, d, row_out + x);
+    }
+
+    // Last full vector to write (the above loop handled mod >= kRadius)
+#if HWY_TARGET == HWY_SCALAR
+    while (x < xsize) {
+#else
+    if (kSizeModN < kRadius) {
+#endif
+      const V conv0 =
+          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2, wh3),
+              wv0);
+
+      const V conv1t =
+          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1b =
+          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t =
+          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2b =
+          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+
+      const V conv3t =
+          HorzConvolveLast<kSizeModN>(row_t3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3b =
+          HorzConvolveLast<kSizeModN>(row_b3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
+
+      Store(conv3, d, row_out + x);
+      x += Lanes(d);
+    }
+
+    // If mod = 0, the above vector was the last; else finish pixel by pixel.
+    if (kSizeModN != 0) {
+      for (; x < xsize; ++x) {
+        float mul = 0.0f;
+        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
+          const float wy = weights.vert[std::abs(dy) * 4];
+          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
+          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
+            const float wx = weights.horz[std::abs(dx) * 4];
+            const int64_t clamped_x = Mirror(x + dx, xsize);
+            mul += clamped_row[clamped_x] * wx * wy;
+          }
+        }
+        row_out[x] = mul;
+      }
+    }
+  }
+
+ private:
+  // Same as HorzConvolve for the first/last vector in a row (mirrored edges).
+  static JXL_MAYBE_INLINE V HorzConvolveFirst(
+      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
+      const V wh0, const V wh1, const V wh2, const V wh3) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+#if HWY_TARGET == HWY_SCALAR
+    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
+    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
+    const V l3 = LoadU(d, row + Mirror(x - 3, xsize));
+#else
+    (void)xsize;
+    const V l1 = Neighbors::FirstL1(c);  // left neighbors derived from c itself
+    const V l2 = Neighbors::FirstL2(c);
+    const V l3 = Neighbors::FirstL3(c);
+#endif
+
+    const V r1 = LoadU(d, row + x + 1);
+    const V r2 = LoadU(d, row + x + 2);
+    const V r3 = LoadU(d, row + x + 3);
+
+    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
+    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
+    const V mul3 = MulAdd(Add(l3, r3), wh3, mul2);
+    return mul3;
+  }
+
+  template <size_t kSizeModN>
+  static JXL_MAYBE_INLINE V HorzConvolveLast(
+      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
+      const V wh0, const V wh1, const V wh2, const V wh3) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+    const V l1 = LoadU(d, row + x - 1);
+    const V l2 = LoadU(d, row + x - 2);
+    const V l3 = LoadU(d, row + x - 3);
+
+    V r1, r2, r3;  // right neighbors; built by mirroring where they run out
+#if HWY_TARGET == HWY_SCALAR
+    r1 = LoadU(d, row + Mirror(x + 1, xsize));
+    r2 = LoadU(d, row + Mirror(x + 2, xsize));
+    r3 = LoadU(d, row + Mirror(x + 3, xsize));
+#else
+    const size_t N = Lanes(d);
+    if (kSizeModN == 0) {
+      r3 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 3)));
+      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
+      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
+    } else if (kSizeModN == 1) {
+      const auto last = LoadU(d, row + xsize - N);  // the N final pixels
+      r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 2)));
+      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+      r1 = last;
+    } else /* kSizeModN >= 2 */ {
+      const auto last = LoadU(d, row + xsize - N);  // the N final pixels
+      r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+      r2 = last;
+      r1 = LoadU(d, row + x + 1);
+    }
+#endif
+
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    const V sum3 = Add(l3, r3);
+    const V mul3 = MulAdd(sum3, wh3, mul2);
+    return mul3;
+  }
+
+  // Returns one vector of horizontal convolution results; lane i is the result
+  // for pixel pos + i. This is the fast path for interior pixels, i.e. kRadius
+  // valid pixels before/after pos.
+  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
+                                         const V wh0, const V wh1, const V wh2,
+                                         const V wh3) {
+    const D d;
+    const V c = LoadU(d, pos);
+    const V mul0 = Mul(c, wh0);
+
+    // TODO(janwas): better to Combine
+    const V l1 = LoadU(d, pos - 1);
+    const V r1 = LoadU(d, pos + 1);
+    const V l2 = LoadU(d, pos - 2);
+    const V r2 = LoadU(d, pos + 2);
+    const V l3 = LoadU(d, pos - 3);
+    const V r3 = LoadU(d, pos + 3);
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    const V sum3 = Add(l3, r3);
+    const V mul3 = MulAdd(sum3, wh3, mul2);
+    return mul3;
+  }
+};
+
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Vectorized = ConvolveT<Separable7Strategy>;
+  // Rects narrower than the vectorized minimum go to SlowSeparable7 instead.
+  if (rect.xsize() < Vectorized::MinWidth()) {
+    return SlowSeparable7(in, rect, weights, pool, out);
+  }
+  return Vectorized::Run(in, rect, weights, pool, out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Separable7);  // Makes the per-target builds visible to dispatch.
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out) {  // Public entry: forwards all arguments as-is.
+  return HWY_DYNAMIC_DISPATCH(Separable7)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_slow.cc b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
new file mode 100644
index 0000000000..fffe5f74c8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
@@ -0,0 +1,212 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#include "lib/jxl/convolve-inl.h"
+
+namespace jxl {
+
+//------------------------------------------------------------------------------
+// Kernels
+
+// 4 instances of a given literal value, useful as input to LoadDup128.
+#define JXL_REP4(literal) literal, literal, literal, literal
+
+// Low-pass 3x3 kernel: keeps energy in low frequencies (e.g. antialiasing).
+const WeightsSymmetric3& WeightsSymmetric3Lowpass() {
+  // Values come from research/convolve_weights.py (cubic spline approximations
+  // of prolate spheroidal wave functions).
+  constexpr float kWeight0 = 0.36208932f;
+  constexpr float kWeight1 = 0.12820096f;
+  constexpr float kWeight2 = 0.03127668f;
+  static constexpr WeightsSymmetric3 kLowpass = {
+      {JXL_REP4(kWeight0)}, {JXL_REP4(kWeight1)}, {JXL_REP4(kWeight2)}};
+  return kLowpass;
+}
+
+const WeightsSeparable5& WeightsSeparable5Lowpass() {
+  constexpr float kTap0 = 0.41714928f;  // first weight in each list
+  constexpr float kTap1 = 0.25539268f;
+  constexpr float kTap2 = 0.03603267f;
+  static constexpr WeightsSeparable5 kLowpass = {
+      {JXL_REP4(kTap0), JXL_REP4(kTap1), JXL_REP4(kTap2)},
+      {JXL_REP4(kTap0), JXL_REP4(kTap1), JXL_REP4(kTap2)}};
+  return kLowpass;
+}
+
+const WeightsSymmetric5& WeightsSymmetric5Lowpass() {  // 4 copies per weight
+  static constexpr WeightsSymmetric5 kLowpass = {
+      {JXL_REP4(0.1740135f)}, {JXL_REP4(0.1065369f)}, {JXL_REP4(0.0150310f)},
+      {JXL_REP4(0.0652254f)}, {JXL_REP4(0.0012984f)}, {JXL_REP4(0.0092025f)}};
+  return kLowpass;
+}
+
+const WeightsSeparable5& WeightsSeparable5Gaussian1() {
+  constexpr float kTap0 = 0.38774f;  // first weight in each list
+  constexpr float kTap1 = 0.24477f;
+  constexpr float kTap2 = 0.06136f;
+  static constexpr WeightsSeparable5 kGaussian = {
+      {JXL_REP4(kTap0), JXL_REP4(kTap1), JXL_REP4(kTap2)},
+      {JXL_REP4(kTap0), JXL_REP4(kTap1), JXL_REP4(kTap2)}};
+  return kGaussian;
+}
+
+const WeightsSeparable5& WeightsSeparable5Gaussian2() {
+  constexpr float kTap0 = 0.250301f;  // first weight in each list
+  constexpr float kTap1 = 0.221461f;
+  constexpr float kTap2 = 0.153388f;
+  static constexpr WeightsSeparable5 kGaussian = {
+      {JXL_REP4(kTap0), JXL_REP4(kTap1), JXL_REP4(kTap2)},
+      {JXL_REP4(kTap0), JXL_REP4(kTap1), JXL_REP4(kTap2)}};
+  return kGaussian;
+}
+
+#undef JXL_REP4
+
+//------------------------------------------------------------------------------
+// Slow
+
+namespace {
+
+template <class WrapX, class WrapY>
+float SlowSymmetric3Pixel(const ImageF& in, const int64_t ix, const int64_t iy,
+                          const int64_t xsize, const int64_t ysize,
+                          const WeightsSymmetric3& weights) {
+  float acc = 0.0f;
+  // Visit the three kernel rows; WrapX/WrapY decide how borders are handled.
+  for (int64_t ky = -1; ky <= 1; ky++) {
+    const bool center_row = (ky == 0);
+    // The center row pairs (c, r); the rows above and below pair (r, d).
+    const float w_mid = center_row ? weights.c[0] : weights.r[0];
+    const float w_side = center_row ? weights.r[0] : weights.d[0];
+
+    const int64_t y = WrapY()(iy + ky, ysize);
+    const float* JXL_RESTRICT row = in.ConstRow(static_cast<size_t>(y));
+    const int64_t left = WrapX()(ix - 1, xsize);
+    const int64_t right = WrapX()(ix + 1, xsize);
+    acc += row[ix] * w_mid + (row[left] + row[right]) * w_side;
+  }
+  return acc;
+}
+
+template <class WrapY>
+void SlowSymmetric3Row(const ImageF& in, const int64_t iy, const int64_t xsize,
+                       const int64_t ysize, const WeightsSymmetric3& weights,
+                       float* JXL_RESTRICT row_out) {
+  // Only the first and last columns have a horizontal neighbor outside the
+  // row, so only those two use WrapMirror in x.
+  const int64_t last = xsize - 1;
+  row_out[0] =
+      SlowSymmetric3Pixel<WrapMirror, WrapY>(in, 0, iy, xsize, ysize, weights);
+  for (int64_t x = 1; x < last; ++x) {
+    row_out[x] = SlowSymmetric3Pixel<WrapUnchanged, WrapY>(in, x, iy, xsize,
+                                                           ysize, weights);
+  }
+  row_out[last] = SlowSymmetric3Pixel<WrapMirror, WrapY>(in, last, iy, xsize,
+                                                         ysize, weights);
+}
+
+}  // namespace
+
+void SlowSymmetric3(const ImageF& in, const Rect& rect,
+                    const WeightsSymmetric3& weights, ThreadPool* pool,
+                    ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+
+  const int64_t kRadius = 1;
+  const int64_t width = static_cast<int64_t>(rect.xsize());
+  const int64_t height = static_cast<int64_t>(rect.ysize());
+
+  // One task per output row.
+  const auto process_row = [&](const uint32_t task, size_t /*thread*/) {
+    const int64_t y = task;
+    float* JXL_RESTRICT dst = out->Row(static_cast<size_t>(y));
+    // Interior rows have both vertical neighbors and skip mirroring in y.
+    const bool interior = y >= kRadius && y < height - kRadius;
+    if (interior) {
+      SlowSymmetric3Row<WrapUnchanged>(in, y, width, height, weights, dst);
+    } else {
+      SlowSymmetric3Row<WrapMirror>(in, y, width, height, weights, dst);
+    }
+  };
+  JXL_CHECK(RunOnPool(pool, 0, static_cast<uint32_t>(height),
+                      ThreadPool::NoInit, process_row, "SlowSymmetric3"));
+}
+
+namespace {
+
+// Scalar result for one pixel of a separable kernel of any radius.
+float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x,
+                         const int64_t y, const int64_t radius,
+                         const float* JXL_RESTRICT horz_weights,
+                         const float* JXL_RESTRICT vert_weights) {
+  const WrapMirror mirror;
+  const size_t width = rect.xsize();
+  const size_t height = rect.ysize();
+
+  float acc = 0.0f;
+  for (int ky = -radius; ky <= radius; ++ky) {
+    const size_t src_y = mirror(y + ky, height);
+    JXL_CHECK(src_y < height);
+    const float* const JXL_RESTRICT src = rect.ConstRow(in, src_y);
+    // Weights are replicated 4x (see JXL_REP4), hence the stride of 4.
+    const float w_vert = vert_weights[std::abs(ky) * 4];
+    for (int kx = -radius; kx <= radius; ++kx) {
+      const size_t src_x = mirror(x + kx, width);
+      JXL_CHECK(src_x < width);
+      acc += src[src_x] * horz_weights[std::abs(kx) * 4] * w_vert;
+    }
+  }
+  return acc;
+}
+
+}  // namespace
+
+void SlowSeparable5(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable5& weights, ThreadPool* pool,
+                    ImageF* out) {
+  PROFILER_FUNC;
+  const float* horz = &weights.horz[0];
+  const float* vert = &weights.vert[0];
+  const size_t height = rect.ysize();
+
+  // One task per output row; every pixel goes through SlowSeparablePixel with
+  // a radius of 2.
+  const auto process_row = [&](const uint32_t task, size_t /*thread*/) {
+    const int64_t y = task;
+    float* const JXL_RESTRICT dst = out->Row(y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      dst[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/2, horz, vert);
+    }
+  };
+
+  JXL_CHECK(RunOnPool(pool, 0, static_cast<uint32_t>(height),
+                      ThreadPool::NoInit, process_row, "SlowSeparable5"));
+}
+
+void SlowSeparable7(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable7& weights, ThreadPool* pool,
+                    ImageF* out) {
+  PROFILER_FUNC;
+  const float* horz = &weights.horz[0];
+  const float* vert = &weights.vert[0];
+  const size_t height = rect.ysize();
+
+  // One task per output row; every pixel goes through SlowSeparablePixel with
+  // a radius of 3.
+  const auto process_row = [&](const uint32_t task, size_t /*thread*/) {
+    const int64_t y = task;
+    float* const JXL_RESTRICT dst = out->Row(y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      dst[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/3, horz, vert);
+    }
+  };
+
+  JXL_CHECK(RunOnPool(pool, 0, static_cast<uint32_t>(height),
+                      ThreadPool::NoInit, process_row, "SlowSeparable7"));
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc b/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc
new file mode 100644
index 0000000000..06b59dfb60
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc
@@ -0,0 +1,194 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric3.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+template <class WrapY, class V>  // NOTE(review): no caller visible in this file.
+static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
+                     const int64_t iy, const size_t ysize, const V wx0,
+                     const V wx1, const V wx2) {  // 1x5 weighted sum, one vector
+  const HWY_FULL(float) d;
+  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
+  const auto in_m2 = LoadU(d, center - 2);  // reads 2 pixels left of ix
+  const auto in_p2 = LoadU(d, center + 2);  // reads 2 pixels past the vector
+  const auto in_m1 = LoadU(d, center - 1);
+  const auto in_p1 = LoadU(d, center + 1);
+  const auto in_00 = Load(d, center);  // aligned load, unlike the LoadU above
+  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
+  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
+  const auto sum_0 = Mul(wx0, in_00);
+  return Add(sum_2, Add(sum_1, sum_0));
+}
+
+// 3x3 convolution by symmetric kernel with a single scan through the input.
+class Symmetric3Strategy {
+  using D = HWY_CAPPED(float, 16);  // float vectors capped at 16 lanes
+  using V = Vec<D>;
+
+ public:
+  static constexpr int64_t kRadius = 1;  // taps reach 1 pixel in each direction
+
+  // Only accesses pixels in [0, xsize) -- TODO confirm for the final vector.
+  template <size_t kSizeModN, class WrapRow>
+  static JXL_MAYBE_INLINE void ConvolveRow(
+      const float* const JXL_RESTRICT row_m, const size_t xsize,
+      const int64_t stride, const WrapRow& wrap_row,
+      const WeightsSymmetric3& weights, float* const JXL_RESTRICT row_out) {
+    const D d;
+    // t, m, b = top, middle, bottom row;
+    const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride);
+    const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride);
+
+    // Must load in advance - compiler doesn't understand LoadDup128 and
+    // schedules them too late.
+    const V w0 = LoadDup128(d, weights.c);  // applied to mc (center)
+    const V w1 = LoadDup128(d, weights.r);  // applied to tc, bc, ml, mr
+    const V w2 = LoadDup128(d, weights.d);  // applied to tl, tr, bl, br
+
+    // l, c, r = left, center, right. Leftmost vector: need FirstL1.
+    {
+      const V tc = LoadU(d, row_t + 0);
+      const V mc = LoadU(d, row_m + 0);
+      const V bc = LoadU(d, row_b + 0);
+      const V tl = Neighbors::FirstL1(tc);
+      const V tr = LoadU(d, row_t + 0 + 1);
+      const V ml = Neighbors::FirstL1(mc);
+      const V mr = LoadU(d, row_m + 0 + 1);
+      const V bl = Neighbors::FirstL1(bc);
+      const V br = LoadU(d, row_b + 0 + 1);
+      const V conv =
+          WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+      Store(conv, d, row_out + 0);
+    }
+
+    // Loop as long as we can load enough new values:
+    const size_t N = Lanes(d);
+    size_t x = N;
+    for (; x + N + kRadius <= xsize; x += N) {
+      const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2);
+      Store(conv, d, row_out + x);
+    }
+
+    // For final (partial) vector:
+    const V tc = LoadU(d, row_t + x);
+    const V mc = LoadU(d, row_m + x);
+    const V bc = LoadU(d, row_b + x);
+
+    V tr, mr, br;
+#if HWY_TARGET == HWY_SCALAR
+    tr = tc;  // Single-lane => mirrored right neighbor = center value.
+    mr = mc;
+    br = bc;
+#else
+    if (kSizeModN == 0) {
+      // The above loop didn't handle the last vector because it needs an
+      // additional right neighbor (generated via mirroring).
+      auto mirror = SetTableIndices(d, MirrorLanes(N - 1));
+      tr = TableLookupLanes(tc, mirror);
+      mr = TableLookupLanes(mc, mirror);
+      br = TableLookupLanes(bc, mirror);
+    } else {
+      auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1));
+      // Loads last valid value into uppermost lane and mirrors.
+      tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror);
+      mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror);
+      br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror);
+    }
+#endif
+
+    const V tl = LoadU(d, row_t + x - 1);
+    const V ml = LoadU(d, row_m + x - 1);
+    const V bl = LoadU(d, row_b + x - 1);
+    const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+    Store(conv, d, row_out + x);
+  }
+
+ private:
+  // Returns sum{x_i * w_i}: w0 for mc, w1 for the 4 edge taps, w2 for corners.
+  template <class V>
+  static JXL_MAYBE_INLINE V WeightedSum(const V tl, const V tc, const V tr,
+                                        const V ml, const V mc, const V mr,
+                                        const V bl, const V bc, const V br,
+                                        const V w0, const V w1, const V w2) {
+    const V sum_tb = Add(tc, bc);
+
+    // Faster than 5 mul + 4 FMA.
+    const V mul0 = Mul(mc, w0);
+    const V sum_lr = Add(ml, mr);
+
+    const V x1 = Add(sum_tb, sum_lr);
+    const V mul1 = MulAdd(x1, w1, mul0);
+
+    const V sum_t2 = Add(tl, tr);
+    const V sum_b2 = Add(bl, br);
+    const V x2 = Add(sum_t2, sum_b2);
+    const V mul2 = MulAdd(x2, w2, mul1);
+    return mul2;
+  }
+
+  static JXL_MAYBE_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t,
+                                          const float* JXL_RESTRICT row_m,
+                                          const float* JXL_RESTRICT row_b,
+                                          const int64_t x, const V w0,
+                                          const V w1, const V w2) {
+    const D d;  // All nine taps are loaded directly: x - 1 and x + 1 are valid.
+    const V tc = LoadU(d, row_t + x);
+    const V mc = LoadU(d, row_m + x);
+    const V bc = LoadU(d, row_b + x);
+    const V tl = LoadU(d, row_t + x - 1);
+    const V tr = LoadU(d, row_t + x + 1);
+    const V ml = LoadU(d, row_m + x - 1);
+    const V mr = LoadU(d, row_m + x + 1);
+    const V bl = LoadU(d, row_b + x - 1);
+    const V br = LoadU(d, row_b + x + 1);
+    return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+  }
+};
+
+void Symmetric3(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric3& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Vectorized = ConvolveT<Symmetric3Strategy>;
+  // Rects narrower than the vectorized minimum go to SlowSymmetric3 instead.
+  if (rect.xsize() < Vectorized::MinWidth()) {
+    return SlowSymmetric3(in, rect, weights, pool, out);
+  }
+  return Vectorized::Run(in, rect, weights, pool, out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Symmetric3);  // Makes the per-target builds visible to dispatch.
+void Symmetric3(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric3& weights, ThreadPool* pool,
+                ImageF* out) {  // Public entry: forwards all arguments as-is.
+  return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
new file mode 100644
index 0000000000..55a16899c3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
@@ -0,0 +1,185 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/common.h"  // RoundUpTo
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Vec;
+
+// Scalar weighted sum of the 1x5 pixels around (ix, iy), using the taps
+// [wx2 wx1 wx0 wx1 wx2]; neighbor columns go through WrapMirror.
+template <class WrapY>
+static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
+                               const int64_t ix, const int64_t iy,
+                               const size_t xsize, const size_t ysize,
+                               const float wx0, const float wx1,
+                               const float wx2) {
+  const WrapMirror mirror_x;
+  const float* JXL_RESTRICT src = in.ConstRow(wrap_y(iy, ysize));
+  const float far_l = src[mirror_x(ix - 2, xsize)];
+  const float far_r = src[mirror_x(ix + 2, xsize)];
+  const float near_l = src[mirror_x(ix - 1, xsize)];
+  const float near_r = src[mirror_x(ix + 1, xsize)];
+  const float outer = wx2 * (far_l + far_r);
+  const float inner = wx1 * (near_l + near_r);
+  const float middle = wx0 * src[ix];
+  return outer + inner + middle;
+}
+
+template <class WrapY, class V>  // Vector counterpart of WeightedSumBorder.
+static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
+                     const int64_t iy, const size_t ysize, const V wx0,
+                     const V wx1, const V wx2) {  // 1x5 weighted sum, one vector
+  const HWY_FULL(float) d;
+  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
+  const auto in_m2 = LoadU(d, center - 2);  // reads 2 pixels left of ix
+  const auto in_p2 = LoadU(d, center + 2);  // reads 2 pixels past the vector
+  const auto in_m1 = LoadU(d, center - 1);
+  const auto in_p1 = LoadU(d, center + 1);
+  const auto in_00 = Load(d, center);  // aligned load, unlike the LoadU above
+  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
+  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
+  const auto sum_0 = Mul(wx0, in_00);
+  return Add(sum_2, Add(sum_1, sum_0));
+}
+
+// Scalar 5x5 symmetric convolution result for the single pixel (ix, iy).
+template <class WrapY>
+float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,
+                       const int64_t iy, const WeightsSymmetric5& weights) {
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  const WrapY wrap_y;
+  const auto row_sum = [&](const int64_t dy, const float w_mid,
+                           const float w_in, const float w_out) {
+    return WeightedSumBorder(in, wrap_y, ix, iy + dy, xsize, ysize, w_mid,
+                             w_in, w_out);
+  };
+  const float w_c = weights.c[0];
+  const float w_r = weights.r[0];
+  const float w_R = weights.R[0];
+  const float w_d = weights.d[0];
+  const float w_L = weights.L[0];
+  const float w_D = weights.D[0];
+  // Rows iy, iy-2, iy-1 accumulate in `upper`; rows iy+2, iy+1 in `lower`.
+  float upper = row_sum(0, w_c, w_r, w_R);
+  upper += row_sum(-2, w_R, w_L, w_D);
+  upper += row_sum(-1, w_r, w_d, w_L);
+  float lower = row_sum(2, w_R, w_L, w_D);
+  lower += row_sum(1, w_r, w_d, w_L);
+  return upper + lower;
+}
+
+// Produces result for one vector's worth of pixels, stored at row_out + ix.
+template <class WrapY>
+static void Symmetric5Interior(const ImageF& in, const Rect& rect,
+                               const int64_t ix, const int64_t iy,
+                               const WeightsSymmetric5& weights,
+                               float* JXL_RESTRICT row_out) {
+  const HWY_FULL(float) d;
+
+  const auto w0 = LoadDup128(d, weights.c);  // same naming as Symmetric5Border
+  const auto w1 = LoadDup128(d, weights.r);
+  const auto w2 = LoadDup128(d, weights.R);
+  const auto w4 = LoadDup128(d, weights.d);
+  const auto w5 = LoadDup128(d, weights.L);
+  const auto w8 = LoadDup128(d, weights.D);
+
+  const size_t ysize = rect.ysize();  // only the height of rect is used here
+  const WrapY wrap_y;
+  // Unrolled loop over all 5 rows of the kernel; two independent accumulators.
+  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
+
+  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
+  auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
+
+  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
+  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
+
+  Store(Add(sum0, sum1), d, row_out + ix);  // aligned store
+}
+
+template <class WrapY>
+static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
+                          const WeightsSymmetric5& weights,
+                          float* JXL_RESTRICT row_out) {
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const int64_t kRadius = 2;
+  const size_t xsize = rect.xsize();
+  const size_t aligned_x = RoundUpTo(kRadius, N);
+  const size_t scalar_end = std::min(aligned_x, xsize);
+  size_t x = 0;
+  for (; x < scalar_end; ++x) {  // Left border, one pixel at a time.
+    row_out[x] = Symmetric5Border<WrapY>(in, rect, x, iy, weights);
+  }
+  for (; x + N + kRadius <= xsize; x += N) {  // Whole vectors.
+    Symmetric5Interior<WrapY>(in, rect, x, iy, weights, row_out);
+  }
+  for (; x < xsize; ++x) {  // Right border and leftovers, per pixel.
+    row_out[x] = Symmetric5Border<WrapY>(in, rect, x, iy, weights);
+  }
+}
+
+static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect,
+                                             const int64_t iy,
+                                             const WeightsSymmetric5& weights,
+                                             float* JXL_RESTRICT row_out) {
+  return Symmetric5Row<WrapMirror>(in, rect, iy, weights, row_out);  // y mirrored
+}
+
+// Semi-vectorized (interior pixels only); called directly like slow::, unlike
+// the fully vectorized strategies below.
+void Symmetric5(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+  // One task per row. Signed int64_t math keeps `ysize - 2` valid if ysize < 2.
+  const size_t ysize = rect.ysize();
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const int64_t iy = task;
+        // Rows within 2 of the top or bottom edge need mirroring in y.
+        if (iy < 2 || iy >= static_cast<int64_t>(ysize) - 2) {
+          Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy));
+        } else {
+          Symmetric5Row<WrapUnchanged>(in, rect, iy, weights, out->Row(iy));
+        }
+      },
+      "Symmetric5x5Convolution"));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Symmetric5);  // Makes the per-target builds visible to dispatch.
+void Symmetric5(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out) {  // Public entry: forwards as-is.
+  return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_test.cc b/third_party/jpeg-xl/lib/jxl/convolve_test.cc
new file mode 100644
index 0000000000..e86d637114
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_test.cc
@@ -0,0 +1,252 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#include <time.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/nanobenchmark.h>
+#include <hwy/tests/test_util-inl.h>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+#ifndef JXL_DEBUG_CONVOLVE
+#define JXL_DEBUG_CONVOLVE 0
+#endif
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+void TestNeighbors() {
+  const Neighbors::D d;
+  const Neighbors::V v = Iota(d, 0);
+  HWY_ALIGN float actual[hwy::kTestMaxVectorSize / sizeof(float)] = {0};
+  // v = 0, 1, 2, ...; FirstL1 is expected to return 0, 0, 1, 2, ...
+  HWY_ALIGN float first_l1[hwy::kTestMaxVectorSize / sizeof(float)] = {
+      0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
+  Store(Neighbors::FirstL1(v), d, actual);
+  const size_t N = Lanes(d);  // only the first N lanes are compared
+  EXPECT_EQ(std::vector<float>(first_l1, first_l1 + N),
+            std::vector<float>(actual, actual + N));
+  // FirstL2 and FirstL3 are only checked on non-scalar targets.
+#if HWY_TARGET != HWY_SCALAR
+  HWY_ALIGN float first_l2[hwy::kTestMaxVectorSize / sizeof(float)] = {
+      1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13};
+  Store(Neighbors::FirstL2(v), d, actual);
+  EXPECT_EQ(std::vector<float>(first_l2, first_l2 + N),
+            std::vector<float>(actual, actual + N));
+  // FirstL3: leading lanes 2, 1, 0, then v's lanes starting from 0.
+  HWY_ALIGN float first_l3[hwy::kTestMaxVectorSize / sizeof(float)] = {
+      2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  Store(Neighbors::FirstL3(v), d, actual);
+  EXPECT_EQ(std::vector<float>(first_l3, first_l3 + N),
+            std::vector<float>(actual, actual + N));
+#endif  // HWY_TARGET != HWY_SCALAR
+}
+
+void VerifySymmetric3(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {  // compares Symmetric3 to SlowSymmetric3
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+  // out_expected receives Symmetric3's output, out_actual SlowSymmetric3's.
+  const WeightsSymmetric3& weights = WeightsSymmetric3Lowpass();
+  Symmetric3(in, rect, weights, pool, &out_expected);
+  SlowSymmetric3(in, rect, weights, pool, &out_actual);
+  // Both outputs must agree within the 1E-5f tolerances passed below.
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+// Checks that Symmetric5 and Separable5 agree for their lowpass weights.
+void VerifySymmetric5(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+  // Separable5 provides the reference; Symmetric5 is the kernel under test.
+  Separable5(in, Rect(in), WeightsSeparable5Lowpass(), pool, &out_expected);
+  Symmetric5(in, rect, WeightsSymmetric5Lowpass(), pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {  // compares Separable5 to SlowSeparable5
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+  // out_expected receives Separable5's output, out_actual SlowSeparable5's.
+  const WeightsSeparable5& weights = WeightsSeparable5Lowpass();
+  Separable5(in, Rect(in), weights, pool, &out_expected);
+  SlowSeparable5(in, rect, weights, pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {  // compares Separable7 to SlowSeparable7
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+
+  // Gaussian sigma 1.0; both weight groups use the same four taps.
+  const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)},
+                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};
+  // Here the slow version is the reference and Separable7 is under test.
+  SlowSeparable7(in, rect, weights, pool, &out_expected);
+  Separable7(in, Rect(in), weights, pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+// For xsize in [kConvolveMaxRadius, 40), ysize in [kConvolveMaxRadius, 16):
+void TestConvolve() {
+  TestNeighbors();
+  // Each xsize is one task on the outer pool; tasks seed their own Rng.
+  test::ThreadPoolForTests pool(4);
+  EXPECT_EQ(true,
+            RunOnPool(
+                &pool, kConvolveMaxRadius, 40, ThreadPool::NoInit,
+                [](const uint32_t task, size_t /*thread*/) {
+                  const size_t xsize = task;
+                  Rng rng(129 + 13 * xsize);
+                  // Each kernel runs once without a pool and once with pool3.
+                  ThreadPool* null_pool = nullptr;
+                  test::ThreadPoolForTests pool3(3);
+                  for (size_t ysize = kConvolveMaxRadius; ysize < 16; ++ysize) {
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE,
+                              "%" PRIuS " x %" PRIuS " (target %" PRIx64
+                              ")===============================",
+                              xsize, ysize, static_cast<int64_t>(HWY_TARGET));
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym3------------------");
+                    VerifySymmetric3(xsize, ysize, null_pool, &rng);
+                    VerifySymmetric3(xsize, ysize, &pool3, &rng);
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym5------------------");
+                    VerifySymmetric5(xsize, ysize, null_pool, &rng);
+                    VerifySymmetric5(xsize, ysize, &pool3, &rng);
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep5------------------");
+                    VerifySeparable5(xsize, ysize, null_pool, &rng);
+                    VerifySeparable5(xsize, ysize, &pool3, &rng);
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep7------------------");
+                    VerifySeparable7(xsize, ysize, null_pool, &rng);
+                    VerifySeparable7(xsize, ysize, &pool3, &rng);
+                  }
+                },
+                "TestConvolve"));
+}
+
+// Measures durations, verifies results, prints timings. `unpredictable1`
+// must have value 1 (unknown to the compiler to prevent elision).
+template <class Conv>
+void BenchmarkConv(const char* caption, const Conv& conv,
+                   const hwy::FuncInput unpredictable1) {
+  const size_t kNumInputs = 1;
+  const hwy::FuncInput inputs[kNumInputs] = {unpredictable1};
+  hwy::Result results[kNumInputs];
+  // Input is all zeros except the centre pixel, set to unpredictable1.
+  const size_t kDim = 160;  // in+out fit in L2
+  ImageF in(kDim, kDim);
+  ZeroFillImage(&in);
+  in.Row(kDim / 2)[kDim / 2] = unpredictable1;
+  ImageF out(kDim, kDim);
+  // The closure returns one output pixel; its row index is the input value.
+  hwy::Params p;
+  p.verbose = false;
+  p.max_evals = 7;
+  p.target_rel_mad = 0.002;
+  const size_t num_results = MeasureClosure(
+      [&in, &conv, &out](const hwy::FuncInput input) {
+        conv(in, &out);
+        return out.Row(input)[0];
+      },
+      inputs, kNumInputs, results, p);
+  if (num_results != kNumInputs) {
+    fprintf(stderr, "MeasureClosure failed.\n");
+  }
+  for (size_t i = 0; i < num_results; ++i) {
+    const double seconds = static_cast<double>(results[i].ticks) /
+                           hwy::platform::InvariantTicksPerSecond();
+    printf("%12s: %7.2f MP/s (MAD=%4.2f%%)\n", caption,
+           kDim * kDim * 1E-6 / seconds,
+           static_cast<double>(results[i].variability) * 100.0);
+  }
+}
+
+struct ConvSymmetric3 {  // functor for BenchmarkConv; runs without a pool
+  void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const {
+    ThreadPool* null_pool = nullptr;
+    Symmetric3(in, Rect(in), WeightsSymmetric3Lowpass(), null_pool, out);
+  }
+};
+
+struct ConvSeparable5 {  // functor for BenchmarkConv; runs without a pool
+  void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const {
+    ThreadPool* null_pool = nullptr;
+    Separable5(in, Rect(in), WeightsSeparable5Lowpass(), null_pool, out);
+  }
+};
+
+void BenchmarkAll() {  // currently a no-op: the body is compiled out
+#if 0  // disabled to avoid test timeouts, run manually on demand
+  const hwy::FuncInput unpredictable1 = time(nullptr) != 1234;
+  BenchmarkConv("Symmetric3", ConvSymmetric3(), unpredictable1);
+  BenchmarkConv("Separable5", ConvSeparable5(), unpredictable1);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class ConvolveTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(ConvolveTest);
+// Each function below is exported and registered as a ConvolveTest test.
+HWY_EXPORT_AND_TEST_P(ConvolveTest, TestConvolve);
+// BenchmarkAll currently has an empty body (see its #if 0).
+HWY_EXPORT_AND_TEST_P(ConvolveTest, BenchmarkAll);
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc b/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc
new file mode 100644
index 0000000000..ee2a97f93a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+class DataParallelTest : public ::testing::Test {
+ protected:
+  // A fake runner that records how it was called, to verify that ThreadPool
+  // passes the expected arguments to the client-provided runner function.
+  static int FakeRunner(void* runner_opaque, void* jpegxl_opaque,
+                        JxlParallelRunInit init, JxlParallelRunFunction func,
+                        uint32_t start_range, uint32_t end_range) {
+    DataParallelTest* self = static_cast<DataParallelTest*>(runner_opaque);
+    self->runner_called_++;
+    self->jpegxl_opaque_ = jpegxl_opaque;
+    self->init_ = init;
+    self->func_ = func;
+    self->start_range_ = start_range;
+    self->end_range_ = end_range;
+    return self->runner_return_;
+  }
+  // Pool under test, built from FakeRunner and `this` (its runner_opaque).
+  ThreadPool pool_{&DataParallelTest::FakeRunner, this};
+
+  // Number of times FakeRunner() was called.
+  int runner_called_ = 0;
+
+  // Parameters passed to FakeRunner.
+  void* jpegxl_opaque_ = nullptr;
+  JxlParallelRunInit init_ = nullptr;
+  JxlParallelRunFunction func_ = nullptr;
+  uint32_t start_range_ = -1;  // wraps to UINT32_MAX until FakeRunner runs
+  uint32_t end_range_ = -1;    // wraps to UINT32_MAX until FakeRunner runs
+
+  // Return value that FakeRunner will return.
+  int runner_return_ = 0;
+};
+
+// Local int() function-pointer typedef. NOTE(review): looks unused; confirm.
+typedef int (*JxlParallelRunInit)();
+
+}  // namespace
+
+TEST_F(DataParallelTest, RunnerCalledParameters) {
+  EXPECT_TRUE(pool_.Run(
+      1234, 5678, [](size_t /* num_threads */) { return true; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }));
+  EXPECT_EQ(1, runner_called_);  // FakeRunner ran exactly once
+  EXPECT_NE(nullptr, init_);
+  EXPECT_NE(nullptr, func_);
+  EXPECT_NE(nullptr, jpegxl_opaque_);
+  EXPECT_EQ(1234u, start_range_);  // range reaches the runner unchanged
+  EXPECT_EQ(5678u, end_range_);
+}
+
+TEST_F(DataParallelTest, RunnerFailurePropagates) {
+  runner_return_ = -1;  // FakeRunner return value.
+  EXPECT_FALSE(pool_.Run(  // goes through FakeRunner, which returns -1
+      1234, 5678, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }));
+  EXPECT_FALSE(RunOnPool(  // no pool; the init callback returns false
+      nullptr, 1234, 5678, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }, "Test"));
+}
+
+TEST_F(DataParallelTest, RunnerNotCalledOnEmptyRange) {
+  runner_return_ = -1;  // FakeRunner return value.
+  EXPECT_TRUE(pool_.Run(  // start == end, so the range is empty
+      123, 123, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }));
+  EXPECT_TRUE(RunOnPool(  // same empty range, without a pool
+      nullptr, 123, 123, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }, "Test"));
+  // We don't call the external runner when the range is empty. We don't even
+  // need to call the init function.
+  EXPECT_EQ(0, runner_called_);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dct-inl.h b/third_party/jpeg-xl/lib/jxl/dct-inl.h
new file mode 100644
index 0000000000..532606075e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct-inl.h
@@ -0,0 +1,334 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD floating-point (I)DCT, any power of two.
+
+#if defined(LIB_JXL_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DCT_INL_H_
+#undef LIB_JXL_DCT_INL_H_
+#else
+#define LIB_JXL_DCT_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/dct_block-inl.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/transpose-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+template <size_t SZ>
+struct FVImpl {  // SZ != 0: float descriptor capped at SZ lanes
+  using type = HWY_CAPPED(float, SZ);
+};
+// SZ == 0 selects HWY_FULL(float) instead of a capped descriptor.
+template <>
+struct FVImpl<0> {
+  using type = HWY_FULL(float);
+};
+// FV<SZ>: shorthand for the descriptor type chosen by FVImpl.
+template <size_t SZ>
+using FV = typename FVImpl<SZ>::type;
+
+// Implementation of Lowest Complexity Self Recursive Radix-2 DCT II/III
+// Algorithms, by Siriani M. Perera and Jianhua Liu.
+
+template <size_t N, size_t SZ>
+struct CoeffBundle {  // helpers on N rows of SZ floats each
+  static void AddReverse(const float* JXL_RESTRICT ain1,
+                         const float* JXL_RESTRICT ain2,
+                         float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N; i++) {  // aout[i] = ain1[i] + ain2[N - 1 - i]
+      auto in1 = Load(FV<SZ>(), ain1 + i * SZ);
+      auto in2 = Load(FV<SZ>(), ain2 + (N - i - 1) * SZ);
+      Store(Add(in1, in2), FV<SZ>(), aout + i * SZ);
+    }
+  }
+  static void SubReverse(const float* JXL_RESTRICT ain1,
+                         const float* JXL_RESTRICT ain2,
+                         float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N; i++) {  // aout[i] = ain1[i] - ain2[N - 1 - i]
+      auto in1 = Load(FV<SZ>(), ain1 + i * SZ);
+      auto in2 = Load(FV<SZ>(), ain2 + (N - i - 1) * SZ);
+      Store(Sub(in1, in2), FV<SZ>(), aout + i * SZ);
+    }
+  }
+  static void B(float* JXL_RESTRICT coeff) {
+    auto sqrt2 = Set(FV<SZ>(), kSqrt2);
+    auto in1 = Load(FV<SZ>(), coeff);
+    auto in2 = Load(FV<SZ>(), coeff + SZ);
+    Store(MulAdd(in1, sqrt2, in2), FV<SZ>(), coeff);  // c0 = c0 * sqrt2 + c1
+    for (size_t i = 1; i + 1 < N; i++) {  // c[i] += c[i + 1]; last row kept
+      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
+      auto in2 = Load(FV<SZ>(), coeff + (i + 1) * SZ);
+      Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
+    }
+  }
+  static void BTranspose(float* JXL_RESTRICT coeff) {
+    for (size_t i = N - 1; i > 0; i--) {  // c[i] += c[i - 1], last row first
+      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
+      auto in2 = Load(FV<SZ>(), coeff + (i - 1) * SZ);
+      Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
+    }
+    auto sqrt2 = Set(FV<SZ>(), kSqrt2);
+    auto in1 = Load(FV<SZ>(), coeff);
+    Store(Mul(in1, sqrt2), FV<SZ>(), coeff);  // c[0] *= sqrt2
+  }
+  // First half -> even rows, second half -> odd; ideally optimized away.
+  static void InverseEvenOdd(const float* JXL_RESTRICT ain,
+                             float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto in1 = Load(FV<SZ>(), ain + i * SZ);
+      Store(in1, FV<SZ>(), aout + 2 * i * SZ);
+    }
+    for (size_t i = N / 2; i < N; i++) {
+      auto in1 = Load(FV<SZ>(), ain + i * SZ);
+      Store(in1, FV<SZ>(), aout + (2 * (i - N / 2) + 1) * SZ);
+    }
+  }
+  // Even input rows -> first half, odd -> second; ideally optimized away.
+  static void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
+                             float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto in1 = LoadU(FV<SZ>(), ain + 2 * i * ain_stride);
+      Store(in1, FV<SZ>(), aout + i * SZ);
+    }
+    for (size_t i = N / 2; i < N; i++) {
+      auto in1 = LoadU(FV<SZ>(), ain + (2 * (i - N / 2) + 1) * ain_stride);
+      Store(in1, FV<SZ>(), aout + i * SZ);
+    }
+  }
+  // Invoked on the full bundle; scales its second half by WcMultipliers<N>.
+  static void Multiply(float* JXL_RESTRICT coeff) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto in1 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ);
+      auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
+      Store(Mul(in1, mul), FV<SZ>(), coeff + (N / 2 + i) * SZ);
+    }
+  }
+  static void MultiplyAndAdd(const float* JXL_RESTRICT coeff,
+                             float* JXL_RESTRICT out, size_t out_stride) {
+    for (size_t i = 0; i < N / 2; i++) {  // rows i, N-1-i: in1 +/- mul * in2
+      auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
+      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
+      auto in2 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ);
+      auto out1 = MulAdd(mul, in2, in1);
+      auto out2 = NegMulAdd(mul, in2, in1);
+      StoreU(out1, FV<SZ>(), out + i * out_stride);
+      StoreU(out2, FV<SZ>(), out + (N - i - 1) * out_stride);
+    }
+  }
+  template <typename Block>
+  static void LoadFromBlock(const Block& in, size_t off,
+                            float* JXL_RESTRICT coeff) {
+    for (size_t i = 0; i < N; i++) {  // row i of `in`, starting at column off
+      Store(in.LoadPart(FV<SZ>(), i, off), FV<SZ>(), coeff + i * SZ);
+    }
+  }
+  template <typename Block>
+  static void StoreToBlockAndScale(const float* JXL_RESTRICT coeff,
+                                   const Block& out, size_t off) {
+    auto mul = Set(FV<SZ>(), 1.0f / N);  // every row is scaled by 1/N
+    for (size_t i = 0; i < N; i++) {
+      out.StorePart(FV<SZ>(), Mul(mul, Load(FV<SZ>(), coeff + i * SZ)), i, off);
+    }
+  }
+};
+
+template <size_t N, size_t SZ>
+struct DCT1DImpl;
+// N == 1: nothing to do.
+template <size_t SZ>
+struct DCT1DImpl<1, SZ> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+};
+// N == 2: in-place sum and difference of the two rows.
+template <size_t SZ>
+struct DCT1DImpl<2, SZ> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+    auto in1 = Load(FV<SZ>(), mem);
+    auto in2 = Load(FV<SZ>(), mem + SZ);
+    Store(Add(in1, in2), FV<SZ>(), mem);
+    Store(Sub(in1, in2), FV<SZ>(), mem + SZ);
+  }
+};
+// N > 2: two recursive N/2 transforms on tmp, recombined into mem.
+template <size_t N, size_t SZ>
+struct DCT1DImpl {
+  void operator()(float* JXL_RESTRICT mem) {
+    // This is relatively small (4kB with 64-DCT and AVX-512)
+    HWY_ALIGN float tmp[N * SZ];
+    CoeffBundle<N / 2, SZ>::AddReverse(mem, mem + N / 2 * SZ, tmp);
+    DCT1DImpl<N / 2, SZ>()(tmp);
+    CoeffBundle<N / 2, SZ>::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ);
+    CoeffBundle<N, SZ>::Multiply(tmp);
+    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ);
+    CoeffBundle<N / 2, SZ>::B(tmp + N / 2 * SZ);
+    CoeffBundle<N, SZ>::InverseEvenOdd(tmp, mem);
+  }
+};
+
+template <size_t N, size_t SZ>
+struct IDCT1DImpl;
+// N == 1: copies one vector from `from` to `to`; the strides are unused.
+template <size_t SZ>
+struct IDCT1DImpl<1, SZ> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    StoreU(LoadU(FV<SZ>(), from), FV<SZ>(), to);
+  }
+};
+// N == 2: writes the sum and the difference of the two input rows.
+template <size_t SZ>
+struct IDCT1DImpl<2, SZ> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    JXL_DASSERT(from_stride >= SZ);
+    JXL_DASSERT(to_stride >= SZ);
+    auto in1 = LoadU(FV<SZ>(), from);
+    auto in2 = LoadU(FV<SZ>(), from + from_stride);
+    StoreU(Add(in1, in2), FV<SZ>(), to);
+    StoreU(Sub(in1, in2), FV<SZ>(), to + to_stride);
+  }
+};
+// N > 2: de-interleaves into tmp, runs two N/2 IDCTs in place, recombines.
+template <size_t N, size_t SZ>
+struct IDCT1DImpl {
+  void operator()(const float* from, size_t from_stride, float* to,
+                  size_t to_stride) {
+    JXL_DASSERT(from_stride >= SZ);
+    JXL_DASSERT(to_stride >= SZ);
+    // This is relatively small (4kB with 64-DCT and AVX-512)
+    HWY_ALIGN float tmp[N * SZ];
+    CoeffBundle<N, SZ>::ForwardEvenOdd(from, from_stride, tmp);
+    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ);
+    CoeffBundle<N / 2, SZ>::BTranspose(tmp + N / 2 * SZ);
+    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ);
+    CoeffBundle<N, SZ>::MultiplyAndAdd(tmp, to, to_stride);
+  }
+};
+
+template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
+void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+  size_t M = M_or_0 != 0 ? M_or_0 : Mp;  // M_or_0 == 0: use runtime Mp
+  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
+  HWY_ALIGN float tmp[N * SZ];
+  for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {  // column groups
+    // TODO(veluca): consider removing the temporary memory here (as is done in
+    // IDCT), if it turns out that some compilers don't optimize away the loads
+    // and this is performance-critical.
+    CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
+    DCT1DImpl<N, SZ>()(tmp);
+    CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
+  }
+}
+
+template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
+void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+  size_t M = M_or_0 != 0 ? M_or_0 : Mp;  // M_or_0 == 0: use runtime Mp
+  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
+  for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {  // column groups
+    IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
+                        to.Stride());
+  }
+}
+
+template <size_t N, size_t M, typename = void>
+struct DCT1D {  // primary template: M is forwarded as the lane cap
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return DCT1DWrapper<N, M>(from, to, M);
+  }
+};
+// Selected when M exceeds MaxLanes(FV<0>()): calls the M_or_0 == 0 wrapper.
+template <size_t N, size_t M>
+struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M);
+  }
+};
+
+template <size_t N, size_t M, typename = void>
+struct IDCT1D {  // primary template: M is forwarded as the lane cap
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return IDCT1DWrapper<N, M>(from, to, M);
+  }
+};
+// Selected when M exceeds MaxLanes(FV<0>()): calls the M_or_0 == 0 wrapper.
+template <size_t N, size_t M>
+struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
+                           M);
+  }
+};
+
+// Computes the maybe-transposed, scaled DCT of a block, that needs to be
+// HWY_ALIGN'ed.
+template <size_t ROWS, size_t COLS>
+struct ComputeScaledDCT {
+  // scratch_space must be aligned, and should have space for ROWS*COLS
+  // floats.
+  template <class From>
+  HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
+                                   float* JXL_RESTRICT scratch_space) {
+    float* JXL_RESTRICT block = scratch_space;
+    if (ROWS < COLS) {  // two transposes; `to` is written with stride COLS
+      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS));
+      Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS));
+      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
+    } else {  // one transpose; `to` is written with stride ROWS
+      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS));
+      Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS));
+    }
+  }
+};
+// Computes the maybe-transposed, scaled IDCT of a block, that needs to be
+// HWY_ALIGN'ed.
+template <size_t ROWS, size_t COLS>
+struct ComputeScaledIDCT {
+  // scratch_space must be aligned and hold ROWS*COLS floats. Note that
+  // `from` is also overwritten: it serves as an intermediate buffer.
+  template <class To>
+  HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
+                                   float* JXL_RESTRICT scratch_space) {
+    float* JXL_RESTRICT block = scratch_space;
+    // Reverse the steps done in ComputeScaledDCT.
+    if (ROWS < COLS) {
+      Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
+      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS));
+      Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
+      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to);
+    } else {
+      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS));
+      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
+      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to);
+    }
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JXL_DCT_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_block-inl.h b/third_party/jpeg-xl/lib/jxl/dct_block-inl.h
new file mode 100644
index 0000000000..50646a737f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_block-inl.h
@@ -0,0 +1,108 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Adapters for DCT input/output: from/to contiguous blocks or image rows.
+
+#if defined(LIB_JXL_DCT_BLOCK_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DCT_BLOCK_INL_H_
+#undef LIB_JXL_DCT_BLOCK_INL_H_
+#else
+#define LIB_JXL_DCT_BLOCK_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/status.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Vec;
+
+// Block: (x, y) <-> (N * y + x)
+// Lines: (x, y) <-> (stride * y + x)
+//
+// I.e. Block is a specialization of Lines with fixed stride.
+//
+// FromXXX should implement Read and Load (Read vector).
+// ToXXX should implement Write and Store (Write vector).
+
+template <size_t N>
+using BlockDesc = HWY_CAPPED(float, N);
+
+// Here and in the following, the descriptor type D passed to LoadPart and
+// StorePart determines the number of values to load/store. Needed because we
+// want to handle 4x4 sub-blocks of 16x16 blocks.
+class DCTFrom {
+ public:
+  DCTFrom(const float* data, size_t stride) : stride_(stride), data_(data) {}
+  // Unaligned vector load starting at element (row, i).
+  template <typename D>
+  HWY_INLINE Vec<D> LoadPart(D, const size_t row, size_t i) const {
+    JXL_DASSERT(Lanes(D()) <= stride_);
+    // Since these functions are used also for DC, no alignment at all is
+    // guaranteed in the case of floating blocks.
+    // TODO(veluca): consider using a different class for DC-to-LF and
+    // DC-from-LF, or copying DC values to/from a temporary aligned location.
+    return LoadU(D(), Address(row, i));
+  }
+  // Scalar read of element (row, i).
+  HWY_INLINE float Read(const size_t row, const size_t i) const {
+    return *Address(row, i);
+  }
+  // Pointer to element (row, i): data_ + row * stride_ + i.
+  constexpr HWY_INLINE const float* Address(const size_t row,
+                                            const size_t i) const {
+    return data_ + row * stride_ + i;
+  }
+  // Distance, in floats, between the starts of consecutive rows.
+  size_t Stride() const { return stride_; }
+
+ private:
+  size_t stride_;
+  const float* JXL_RESTRICT data_;
+};
+
+class DCTTo {
+ public:
+  DCTTo(float* data, size_t stride) : stride_(stride), data_(data) {}
+  // Unaligned vector store starting at element (row, i).
+  template <typename D>
+  HWY_INLINE void StorePart(D, const Vec<D>& v, const size_t row,
+                            size_t i) const {
+    JXL_DASSERT(Lanes(D()) <= stride_);
+    // Since these functions are used also for DC, no alignment at all is
+    // guaranteed in the case of floating blocks.
+    // TODO(veluca): consider using a different class for DC-to-LF and
+    // DC-from-LF, or copying DC values to/from a temporary aligned location.
+    StoreU(v, D(), Address(row, i));
+  }
+  // Scalar write of element (row, i).
+  HWY_INLINE void Write(float v, const size_t row, const size_t i) const {
+    *Address(row, i) = v;
+  }
+  // Pointer to element (row, i): data_ + row * stride_ + i.
+  constexpr HWY_INLINE float* Address(const size_t row, const size_t i) const {
+    return data_ + row * stride_ + i;
+  }
+  // Distance, in floats, between the starts of consecutive rows.
+  size_t Stride() const { return stride_; }
+
+ private:
+  size_t stride_;
+  float* JXL_RESTRICT data_;
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DCT_BLOCK_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_for_test.h b/third_party/jpeg-xl/lib/jxl/dct_for_test.h
new file mode 100644
index 0000000000..8e32aa7eff
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_for_test.h
@@ -0,0 +1,99 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DCT_FOR_TEST_H_
+#define LIB_JXL_DCT_FOR_TEST_H_
+
+// Unoptimized DCT only for use in tests.
+
+#include <string.h>  // memcpy
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jxl/common.h"  // Pi
+
+namespace jxl {
+
+namespace test {
+static inline double alpha(int u) { return u == 0 ? 0.7071067811865475 : 1.0; }
+// (alpha above: 1/sqrt(2) for u == 0, 1 otherwise.)
+// N-DCT on M columns, divided by sqrt(N). Matches the definition in the spec.
+template <size_t N, size_t M>
+void DCT1D(double block[N * M], double out[N * M]) {
+  std::vector<double> matrix(N * N);  // matrix[N * u + y]: basis u at sample y
+  const double scale = std::sqrt(2.0) / N;
+  for (size_t y = 0; y < N; y++) {
+    for (size_t u = 0; u < N; u++) {
+      matrix[N * u + y] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale;
+    }
+  }
+  for (size_t x = 0; x < M; x++) {  // out(u, x) = sum_y matrix(u, y) * block(y, x)
+    for (size_t u = 0; u < N; u++) {
+      out[M * u + x] = 0;
+      for (size_t y = 0; y < N; y++) {
+        out[M * u + x] += matrix[N * u + y] * block[M * y + x];
+      }
+    }
+  }
+}
+
+// N-IDCT on M columns, multiplied by sqrt(N). Matches the definition in the
+// spec.
+template <size_t N, size_t M>
+void IDCT1D(double block[N * M], double out[N * M]) {
+  std::vector<double> matrix(N * N);  // matrix[N * y + u]: basis u at sample y
+  const double scale = std::sqrt(2.0);
+  for (size_t y = 0; y < N; y++) {
+    for (size_t u = 0; u < N; u++) {
+      // Transpose of DCT matrix.
+      matrix[N * y + u] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale;
+    }
+  }
+  for (size_t x = 0; x < M; x++) {  // here u indexes samples, y frequencies
+    for (size_t u = 0; u < N; u++) {
+      out[M * u + x] = 0;
+      for (size_t y = 0; y < N; y++) {
+        out[M * u + x] += matrix[N * u + y] * block[M * y + x];
+      }
+    }
+  }
+}
+
+template <size_t N, size_t M>
+void TransposeBlock(double in[N * M], double out[M * N]) {
+  for (size_t x = 0; x < N; x++) {  // `in` is read as N rows of M values
+    for (size_t y = 0; y < M; y++) {
+      out[y * N + x] = in[x * M + y];  // `out` is written as M rows of N
+    }
+  }
+}
+}  // namespace test
+
+// Untransposed DCT of an NxN block, in place: two 1D passes, each transposed.
+template <size_t N>
+void DCTSlow(double block[N * N]) {
+  constexpr size_t kBlockSize = N * N;
+  std::vector<double> g(kBlockSize);  // scratch for the 1D pass output
+  test::DCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+  test::DCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+}
+
+// Untransposed IDCT of an NxN block, in place: two 1D passes, each transposed.
+template <size_t N>
+void IDCTSlow(double block[N * N]) {
+  constexpr size_t kBlockSize = N * N;
+  std::vector<double> g(kBlockSize);  // scratch for the 1D pass output
+  test::IDCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+  test::IDCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DCT_FOR_TEST_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_scales.cc b/third_party/jpeg-xl/lib/jxl/dct_scales.cc
new file mode 100644
index 0000000000..f9e89a6014
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_scales.cc
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dct_scales.h"
+
+namespace jxl {
+
+// Out-of-class definitions of the constexpr tables (pre-C++17 ODR-use).
+constexpr float DCTResampleScales<1, 8>::kScales[];
+constexpr float DCTResampleScales<2, 16>::kScales[];
+constexpr float DCTResampleScales<4, 32>::kScales[];
+constexpr float DCTResampleScales<8, 64>::kScales[];
+constexpr float DCTResampleScales<16, 128>::kScales[];
+constexpr float DCTResampleScales<32, 256>::kScales[];
+constexpr float DCTResampleScales<8, 1>::kScales[];
+constexpr float DCTResampleScales<16, 2>::kScales[];
+constexpr float DCTResampleScales<32, 4>::kScales[];
+constexpr float DCTResampleScales<64, 8>::kScales[];
+constexpr float DCTResampleScales<128, 16>::kScales[];
+constexpr float DCTResampleScales<256, 32>::kScales[];
+constexpr float WcMultipliers<4>::kMultipliers[];
+constexpr float WcMultipliers<8>::kMultipliers[];
+constexpr float WcMultipliers<16>::kMultipliers[];
+constexpr float WcMultipliers<32>::kMultipliers[];
+constexpr float WcMultipliers<64>::kMultipliers[];
+constexpr float WcMultipliers<128>::kMultipliers[];
+constexpr float WcMultipliers<256>::kMultipliers[];
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dct_scales.h b/third_party/jpeg-xl/lib/jxl/dct_scales.h
new file mode 100644
index 0000000000..23af03d60f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_scales.h
@@ -0,0 +1,379 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DCT_SCALES_H_
+#define LIB_JXL_DCT_SCALES_H_
+
+// Scaling factors.
+
+#include <stddef.h>
+
+namespace jxl {
+
+static constexpr float kSqrt2 = 1.41421356237f;  // sqrt(2)
+static constexpr float kSqrt0_5 = 0.70710678118f;  // sqrt(0.5) == 1 / sqrt(2)
+
+// For n != 0, the n-th basis function of an N-DCT, evaluated in pixel k, has a
+// value of cos((k+1/2) n/N pi). When downsampling by 2x, we average
+// the values for pixel k and k+1 to get the value for pixel (k/2), thus we get
+//
+// [cos((k+1/2) n/N pi) + cos((k+3/2) n/N pi)]/2 =
+// cos(n/(2N) pi) cos((k+1) n/N pi) =
+// cos(n/(2N) pi) cos(((k/2)+1/2) n/(N/2) pi)
+//
+// which is exactly the same as the value of pixel k/2 of a N/2-sized DCT,
+// except for the cos(n/(2N) pi) scaling factor (which does *not*
+// depend on the pixel). Thus, when using the lower-frequency coefficients of a
+// DCT-N to compute a DCT-(N/2), they should be scaled by this constant. Scaling
+// factors for a DCT-(N/4) etc can then be obtained by successive
+// multiplications. The structs below contain the above-mentioned scaling
+// factors.
+//
+// Python code for the tables below:
+//
+// for i in range(N // 8):
+//    v = math.cos(i / (2 * N) * math.pi)
+//    v *= math.cos(i / (N) * math.pi)
+//    v *= math.cos(i / (N / 2) * math.pi)
+//    print(v, end=", ")
+
+template <size_t FROM, size_t TO>
+struct DCTResampleScales;  // declaration only; used via explicit specializations
+
+template <>
+struct DCTResampleScales<8, 1> {  // DCT-8 -> DCT-1: 1 scale
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+  };
+};
+
+template <>
+struct DCTResampleScales<16, 2> {  // DCT-16 -> DCT-2: 2 scales
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      0.901764195028874394,
+  };
+};
+
+template <>
+struct DCTResampleScales<32, 4> {  // DCT-32 -> DCT-4: 4 scales
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      0.974886821136879522,
+      0.901764195028874394,
+      0.787054918159101335,
+  };
+};
+
+template <>
+struct DCTResampleScales<64, 8> {  // DCT-64 -> DCT-8: 8 scales
+  static constexpr float kScales[] = {
+      1.0000000000000000, 0.9936866130906366, 0.9748868211368796,
+      0.9440180941651672, 0.9017641950288744, 0.8490574973847023,
+      0.7870549181591013, 0.7171081282466044,
+  };
+};
+
+template <>
+struct DCTResampleScales<128, 16> {  // DCT-128 -> DCT-16: 16 scales
+  static constexpr float kScales[] = {
+      1.0,
+      0.9984194528776054,
+      0.9936866130906366,
+      0.9858278282666936,
+      0.9748868211368796,
+      0.9609244059440204,
+      0.9440180941651672,
+      0.9242615922757944,
+      0.9017641950288744,
+      0.8766500784429904,
+      0.8490574973847023,
+      0.8191378932865928,
+      0.7870549181591013,
+      0.7529833816270532,
+      0.7171081282466044,
+      0.6796228528314651,
+  };
+};
+
+template <>
+struct DCTResampleScales<256, 32> {  // DCT-256 -> DCT-32: 32 scales
+  static constexpr float kScales[] = {
+      1.0,
+      0.9996047255830407,
+      0.9984194528776054,
+      0.9964458326264695,
+      0.9936866130906366,
+      0.9901456355893141,
+      0.9858278282666936,
+      0.9807391980963174,
+      0.9748868211368796,
+      0.9682788310563117,
+      0.9609244059440204,
+      0.9528337534340876,
+      0.9440180941651672,
+      0.9344896436056892,
+      0.9242615922757944,
+      0.913348084400198,
+      0.9017641950288744,
+      0.8895259056651056,
+      0.8766500784429904,
+      0.8631544288990163,
+      0.8490574973847023,
+      0.8343786191696513,
+      0.8191378932865928,
+      0.8033561501721485,
+      0.7870549181591013,
+      0.7702563888779096,
+      0.7529833816270532,
+      0.7352593067735488,
+      0.7171081282466044,
+      0.6985543251889097,
+      0.6796228528314651,
+      0.6603391026591464,
+  };
+};
+
+// Inverses of the above: <TO, FROM> holds the reciprocals of <FROM, TO>.
+template <>
+struct DCTResampleScales<1, 8> {  // DCT-1 -> DCT-8: 1 scale
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+  };
+};
+
+template <>
+struct DCTResampleScales<2, 16> {  // DCT-2 -> DCT-16: 2 scales
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      1.108937353592731823,
+  };
+};
+
+template <>
+struct DCTResampleScales<4, 32> {  // DCT-4 -> DCT-32: 4 scales
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      1.025760096781116015,
+      1.108937353592731823,
+      1.270559368765487251,
+  };
+};
+
+template <>
+struct DCTResampleScales<8, 64> {  // DCT-8 -> DCT-64: 8 scales
+  static constexpr float kScales[] = {
+      1.0000000000000000, 1.0063534990068217, 1.0257600967811158,
+      1.0593017296817173, 1.1089373535927318, 1.1777765381970435,
+      1.2705593687654873, 1.3944898413647777,
+  };
+};
+
+template <>
+struct DCTResampleScales<16, 128> {  // DCT-16 -> DCT-128: 16 scales
+  static constexpr float kScales[] = {
+      1.0,
+      1.0015830492062623,
+      1.0063534990068217,
+      1.0143759095928793,
+      1.0257600967811158,
+      1.0406645869480142,
+      1.0593017296817173,
+      1.0819447744633812,
+      1.1089373535927318,
+      1.1407059950032632,
+      1.1777765381970435,
+      1.2207956782315876,
+      1.2705593687654873,
+      1.3280505578213306,
+      1.3944898413647777,
+      1.4714043176061107,
+  };
+};
+
+template <>
+struct DCTResampleScales<32, 256> {  // DCT-32 -> DCT-256: 32 scales
+  static constexpr float kScales[] = {
+      1.0,
+      1.0003954307206069,
+      1.0015830492062623,
+      1.0035668445360069,
+      1.0063534990068217,
+      1.009952439375063,
+      1.0143759095928793,
+      1.0196390660647288,
+      1.0257600967811158,
+      1.0327603660498115,
+      1.0406645869480142,
+      1.049501024072585,
+      1.0593017296817173,
+      1.0701028169146336,
+      1.0819447744633812,
+      1.0948728278734026,
+      1.1089373535927318,
+      1.124194353004584,
+      1.1407059950032632,
+      1.158541237256391,
+      1.1777765381970435,
+      1.1984966740820495,
+      1.2207956782315876,
+      1.244777922949508,
+      1.2705593687654873,
+      1.2982690107339132,
+      1.3280505578213306,
+      1.3600643892400104,
+      1.3944898413647777,
+      1.4315278911623237,
+      1.4714043176061107,
+      1.5143734423314616,
+  };
+};
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;  // declaration only; each specialization has N / 2 entries
+
+template <>
+struct WcMultipliers<4> {  // 2 entries
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {  // 4 entries
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+template <>
+struct WcMultipliers<16> {  // 8 entries
+  static constexpr float kMultipliers[] = {
+      0.5024192861881557, 0.5224986149396889, 0.5669440348163577,
+      0.6468217833599901, 0.7881546234512502, 1.060677685990347,
+      1.7224470982383342, 5.101148618689155,
+  };
+};
+
+template <>
+struct WcMultipliers<32> {  // 16 entries
+  static constexpr float kMultipliers[] = {
+      0.5006029982351963, 0.5054709598975436, 0.5154473099226246,
+      0.5310425910897841, 0.5531038960344445, 0.5829349682061339,
+      0.6225041230356648, 0.6748083414550057, 0.7445362710022986,
+      0.8393496454155268, 0.9725682378619608, 1.1694399334328847,
+      1.4841646163141662, 2.057781009953411,  3.407608418468719,
+      10.190008123548033,
+  };
+};
+template <>
+struct WcMultipliers<64> {  // 32 entries
+  static constexpr float kMultipliers[] = {
+      0.500150636020651,  0.5013584524464084, 0.5037887256810443,
+      0.5074711720725553, 0.5124514794082247, 0.5187927131053328,
+      0.52657731515427,   0.535909816907992,  0.5469204379855088,
+      0.5597698129470802, 0.57465518403266,   0.5918185358574165,
+      0.6115573478825099, 0.6342389366884031, 0.6603198078137061,
+      0.6903721282002123, 0.7251205223771985, 0.7654941649730891,
+      0.8127020908144905, 0.8683447152233481, 0.9345835970364075,
+      1.0144082649970547, 1.1120716205797176, 1.233832737976571,
+      1.3892939586328277, 1.5939722833856311, 1.8746759800084078,
+      2.282050068005162,  2.924628428158216,  4.084611078129248,
+      6.796750711673633,  20.373878167231453,
+  };
+};
+template <>
+struct WcMultipliers<128> {  // 64 entries
+  static constexpr float kMultipliers[] = {
+      0.5000376519155477, 0.5003390374428216, 0.5009427176380873,
+      0.5018505174842379, 0.5030651913013697, 0.5045904432216454,
+      0.5064309549285542, 0.5085924210498143, 0.5110815927066812,
+      0.5139063298475396, 0.5170756631334912, 0.5205998663018917,
+      0.524490540114724,  0.5287607092074876, 0.5334249333971333,
+      0.538499435291984,  0.5440022463817783, 0.549953374183236,
+      0.5563749934898856, 0.5632916653417023, 0.5707305880121454,
+      0.5787218851348208, 0.5872989370937893, 0.5964987630244563,
+      0.606362462272146,  0.6169357260050706, 0.6282694319707711,
+      0.6404203382416639, 0.6534518953751283, 0.6674352009263413,
+      0.6824501259764195, 0.6985866506472291, 0.7159464549705746,
+      0.7346448236478627, 0.7548129391165311, 0.776600658233963,
+      0.8001798956216941, 0.8257487738627852, 0.8535367510066064,
+      0.8838110045596234, 0.9168844461846523, 0.9531258743921193,
+      0.9929729612675466, 1.036949040910389,  1.0856850642580145,
+      1.1399486751015042, 1.2006832557294167, 1.2690611716991191,
+      1.346557628206286,  1.4350550884414341, 1.5369941008524954,
+      1.6555965242641195, 1.7952052190778898, 1.961817848571166,
+      2.163957818751979,  2.4141600002500763, 2.7316450287739396,
+      3.147462191781909,  3.7152427383269746, 4.5362909369693565,
+      5.827688377844654,  8.153848602466814,  13.58429025728446,
+      40.744688103351834,
+  };
+};
+
+template <>
+struct WcMultipliers<256> {  // 128 entries (the only table with explicit size)
+  static constexpr float kMultipliers[128] = {
+      0.5000094125358878, 0.500084723455784,  0.5002354020255269,
+      0.5004615618093246, 0.5007633734146156, 0.5011410648064231,
+      0.5015949217281668, 0.502125288230386,  0.5027325673091954,
+      0.5034172216566842, 0.5041797745258774, 0.5050208107132756,
+      0.5059409776624396, 0.5069409866925212, 0.5080216143561264,
+      0.509183703931388,  0.5104281670536573, 0.5117559854927805,
+      0.5131682130825206, 0.5146659778093218, 0.516250484068288,
+      0.5179230150949777, 0.5196849355823947, 0.5215376944933958,
+      0.5234828280796439, 0.52552196311921,   0.5276568203859896,
+      0.5298892183652453, 0.5322210772308335, 0.5346544231010253,
+      0.537191392591309,  0.5398342376841637, 0.5425853309375497,
+      0.545447171055775,  0.5484223888484947, 0.551513753605893,
+      0.554724179920619,  0.5580567349898085, 0.5615146464335654,
+      0.5651013106696203, 0.5688203018875696, 0.5726753816701664,
+      0.5766705093136241, 0.5808098529038624, 0.5850978012111273,
+      0.58953897647151,   0.5941382481306648, 0.5989007476325463,
+      0.6038318843443582, 0.6089373627182432, 0.614223200800649,
+      0.6196957502119484, 0.6253617177319102, 0.6312281886412079,
+      0.6373026519855411, 0.6435930279473415, 0.6501076975307724,
+      0.6568555347890955, 0.6638459418498757, 0.6710888870233562,
+      0.6785949463131795, 0.6863753486870501, 0.6944420255086364,
+      0.7028076645818034, 0.7114857693151208, 0.7204907235796304,
+      0.7298378629074134, 0.7395435527641373, 0.749625274727372,
+      0.7601017215162176, 0.7709929019493761, 0.7823202570613161,
+      0.7941067887834509, 0.8063772028037925, 0.8191580674598145,
+      0.83247799080191,   0.8463678182968619, 0.860860854031955,
+      0.8759931087426972, 0.8918035785352535, 0.9083345588266809,
+      0.9256319988042384, 0.9437459026371479, 0.962730784794803,
+      0.9826461881778968, 1.0035572754078206, 1.0255355056139732,
+      1.048659411496106,  1.0730154944316674, 1.0986992590905857,
+      1.1258164135986009, 1.1544842669978943, 1.184833362908442,
+      1.217009397314603,  1.2511754798461228, 1.287514812536712,
+      1.326233878832723,  1.3675662599582539, 1.411777227500661,
+      1.459169302866857,  1.5100890297227016, 1.5649352798258847,
+      1.6241695131835794, 1.6883285509131505, 1.7580406092704062,
+      1.8340456094306077, 1.9172211551275689, 2.0086161135167564,
+      2.1094945286246385, 2.22139377701127,   2.346202662531156,
+      2.486267909203593,  2.644541877144861,  2.824791402350551,
+      3.0318994541759925, 3.2723115884254845, 3.5547153325075804,
+      3.891107790700307,  4.298537526449054,  4.802076008665048,
+      5.440166215091329,  6.274908408039339,  7.413566756422303,
+      9.058751453879703,  11.644627325175037, 16.300023088031555,
+      27.163977662448232, 81.48784219222516,
+  };
+};
+
+// Scale for coefficient x when resampling FROM -> TO; x is not range-checked.
+template <size_t FROM, size_t TO>
+constexpr float DCTTotalResampleScale(size_t x) {
+  return DCTResampleScales<FROM, TO>::kScales[x];  // plain table lookup
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DCT_SCALES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_test.cc b/third_party/jpeg-xl/lib/jxl/dct_test.cc
new file mode 100644
index 0000000000..9f5eff41e9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_test.cc
@@ -0,0 +1,389 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+
+#include <cmath>
+#include <numeric>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dct_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/dct_for_test.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/test_utils.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// Computes the in-place NxN DCT of block.
+// Requires that block is HWY_ALIGN'ed.
+//
+// Runs ComputeScaledDCT into a temporary buffer and then transposes that
+// result back into block; no additional scaling is applied here.
+template <size_t N>
+void ComputeDCT(float block[N * N]) {
+  HWY_ALIGN float tmp_block[N * N];
+  HWY_ALIGN float scratch_space[N * N];
+  ComputeScaledDCT<N, N>()(DCTFrom(block, N), tmp_block, scratch_space);
+
+  // Untranspose.
+  Transpose<N, N>::Run(DCTFrom(tmp_block, N), DCTTo(block, N));
+}
+
+// Computes the in-place NxN iDCT of block (N is size_t, as in ComputeDCT).
+// Requires that block is HWY_ALIGN'ed.
+template <size_t N>
+void ComputeIDCT(float block[N * N]) {
+  HWY_ALIGN float tmp_block[N * N];
+  HWY_ALIGN float scratch_space[N * N];
+  // Transpose into tmp_block first; the iDCT then writes back into block.
+  Transpose<N, N>::Run(DCTFrom(block, N), DCTTo(tmp_block, N));
+
+  ComputeScaledIDCT<N, N>()(tmp_block, DCTTo(block, N), scratch_space);
+}
+
+template <size_t N>
+void TransposeTestT(float accuracy) {  // checks Transpose<N, N> on one block
+  constexpr size_t kBlockSize = N * N;
+  HWY_ALIGN float src[kBlockSize];
+  DCTTo to_src(src, N);
+  for (size_t y = 0; y < N; ++y) {
+    for (size_t x = 0; x < N; ++x) {
+      to_src.Write(y * N + x, y, x);  // position (y, x) holds y * N + x
+    }
+  }
+  HWY_ALIGN float dst[kBlockSize];
+  Transpose<N, N>::Run(DCTFrom(src, N), DCTTo(dst, N));
+  DCTFrom from_dst(dst, N);
+  for (size_t y = 0; y < N; ++y) {
+    for (size_t x = 0; x < N; ++x) {
+      float expected = x * N + y;  // the value written at (x, y) in src
+      float actual = from_dst.Read(y, x);
+      EXPECT_NEAR(expected, actual, accuracy) << "x = " << x << ", y = " << y;
+    }
+  }
+}
+
+void TransposeTest() {  // transpose check for 8x8, 16x16 and 32x32 blocks
+  TransposeTestT<8>(1e-7f);
+  TransposeTestT<16>(1e-7f);
+  TransposeTestT<32>(1e-7f);
+}
+
+template <size_t N>
+void ColumnDctRoundtripT(float accuracy) {
+  constexpr size_t kBlockSize = N * N;
+  // Though we are only interested in single column result, dct.h has built-in
+  // limit on minimal number of columns processed. So, to be safe, we do
+  // regular NxN block transformation. On the bright side - we could check all
+  // N basis vectors at once.
+  HWY_ALIGN float block[kBlockSize];
+  DCTTo to(block, N);
+  DCTFrom from(block, N);
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      to.Write((i == j) ? 1.0f : 0.0f, i, j);  // fill block with the identity
+    }
+  }
+
+  // Running (I)DCT on the same memory block seems to trigger a compiler bug on
+  // ARMv7 with clang6.
+  HWY_ALIGN float tmp[kBlockSize];
+  DCTTo to_tmp(tmp, N);
+  DCTFrom from_tmp(tmp, N);
+
+  DCT1D<N, N>()(from, to_tmp);  // block -> tmp
+  IDCT1D<N, N>()(from_tmp, to);  // tmp -> block; should restore the identity
+
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      float expected = (i == j) ? 1.0f : 0.0f;
+      float actual = from.Read(i, j);
+      EXPECT_NEAR(expected, actual, accuracy) << " i=" << i << ", j=" << j;
+    }
+  }
+}
+
+void ColumnDctRoundtrip() {  // 1D DCT/IDCT round trip for N = 8, 16, 32
+  ColumnDctRoundtripT<8>(1e-6f);
+  ColumnDctRoundtripT<16>(1e-6f);
+  ColumnDctRoundtripT<32>(1e-6f);
+}
+
+template <size_t N>
+void TestDctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {  // one unit impulse per iteration
+    HWY_ALIGN float fast[kBlockSize] = {0.0f};
+    double slow[kBlockSize] = {0.0};
+    fast[i] = 1.0;
+    slow[i] = 1.0;
+    DCTSlow<N>(slow);  // double-precision reference
+    ComputeDCT<N>(fast);  // implementation under test
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(fast[k], slow[k], accuracy / N)  // tolerance is accuracy / N
+          << "i = " << i << ", k = " << k << ", N = " << N;
+    }
+  }
+}
+
+template <size_t N>
+void TestIdctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {  // one unit impulse per iteration
+    HWY_ALIGN float fast[kBlockSize] = {0.0f};
+    double slow[kBlockSize] = {0.0};
+    fast[i] = 1.0;
+    slow[i] = 1.0;
+    IDCTSlow<N>(slow);  // double-precision reference
+    ComputeIDCT<N>(fast);  // implementation under test
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(fast[k], slow[k], accuracy * N)  // tolerance is accuracy * N
+          << "i = " << i << ", k = " << k << ", N = " << N;
+    }
+  }
+}
+
+template <size_t N>
+void TestInverseT(float accuracy) {  // checks DCT(IDCT(e_i)) == e_i for all i
+  test::ThreadPoolForTests pool(N < 32 ? 0 : 8);  // presumably a thread count
+  enum { kBlockSize = N * N };
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, kBlockSize, ThreadPool::NoInit,
+      [accuracy](const uint32_t task, size_t /*thread*/) {
+        const size_t i = static_cast<size_t>(task);  // task id = impulse index
+        HWY_ALIGN float x[kBlockSize] = {0.0f};
+        x[i] = 1.0;
+
+        ComputeIDCT<N>(x);
+        ComputeDCT<N>(x);
+
+        for (size_t k = 0; k < kBlockSize; ++k) {
+          EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy)
+              << "i = " << i << ", k = " << k;
+        }
+      },
+      "TestInverse"));
+}
+
+void InverseTest() {  // N = 32 is given a looser tolerance than 8 and 16
+  TestInverseT<8>(1e-6f);
+  TestInverseT<16>(1e-6f);
+  TestInverseT<32>(3e-6f);
+}
+
+template <size_t N>
+void TestDctTranspose(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {  // i is limited to [start, end)
+    for (size_t j = 0; j < kBlockSize; ++j) {  // j always covers the block
+      // We check that <e_i, Me_j> = <M^\dagger{}e_i, e_j>.
+      // That means (Me_j)_i = (M^\dagger{}e_i)_j
+
+      // x := Me_j
+      HWY_ALIGN float x[kBlockSize] = {0.0f};
+      x[j] = 1.0;
+      ComputeIDCT<N>(x);
+      // y := M^\dagger{}e_i
+      HWY_ALIGN float y[kBlockSize] = {0.0f};
+      y[i] = 1.0;
+      ComputeDCT<N>(y);
+
+      EXPECT_NEAR(x[i] / N, y[j] * N, accuracy) << "i = " << i << ", j = " << j;
+    }
+  }
+}
+
+template <size_t N>
+void TestSlowInverse(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {  // one unit impulse per iteration
+    double x[kBlockSize] = {0.0f};
+    x[i] = 1.0;
+
+    DCTSlow<N>(x);  // reference forward transform ...
+    IDCTSlow<N>(x);  // ... followed by the reference inverse
+
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy)
+          << "i = " << i << ", k = " << k;
+    }
+  }
+}
+
+template <size_t ROWS, size_t COLS>
+void TestRectInverseT(float accuracy) {  // round trip on a ROWS x COLS block
+  constexpr size_t kBlockSize = ROWS * COLS;
+  for (size_t i = 0; i < kBlockSize; ++i) {  // one unit impulse per iteration
+    HWY_ALIGN float x[kBlockSize] = {0.0f};
+    HWY_ALIGN float out[kBlockSize] = {0.0f};
+    x[i] = 1.0;
+    HWY_ALIGN float coeffs[kBlockSize] = {0.0f};
+    HWY_ALIGN float scratch_space[kBlockSize * 2];  // reused by both calls
+
+    ComputeScaledDCT<ROWS, COLS>()(DCTFrom(x, COLS), coeffs, scratch_space);
+    ComputeScaledIDCT<ROWS, COLS>()(coeffs, DCTTo(out, COLS), scratch_space);
+
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(out[k], (k == i) ? 1.0f : 0.0f, accuracy)
+          << "i = " << i << ", k = " << k << " ROWS = " << ROWS
+          << " COLS = " << COLS;
+    }
+  }
+}
+
+void TestRectInverse() {
+  TestRectInverseT<16, 32>(1e-6f);  // ROWS < COLS shapes
+  TestRectInverseT<8, 32>(1e-6f);
+  TestRectInverseT<8, 16>(1e-6f);
+  TestRectInverseT<4, 8>(1e-6f);
+  TestRectInverseT<2, 4>(1e-6f);
+  TestRectInverseT<1, 4>(1e-6f);
+  TestRectInverseT<1, 2>(1e-6f);
+
+  TestRectInverseT<32, 16>(1e-6f);  // the same shapes with ROWS > COLS
+  TestRectInverseT<32, 8>(1e-6f);
+  TestRectInverseT<16, 8>(1e-6f);
+  TestRectInverseT<8, 4>(1e-6f);
+  TestRectInverseT<4, 2>(1e-6f);
+  TestRectInverseT<4, 1>(1e-6f);
+  TestRectInverseT<2, 1>(1e-6f);
+}
+
+template <size_t ROWS, size_t COLS>
+void TestRectTransposeT(float accuracy) {
+  constexpr size_t kBlockSize = ROWS * COLS;
+  HWY_ALIGN float scratch_space[kBlockSize * 2];
+  for (size_t px = 0; px < COLS; ++px) {
+    for (size_t py = 0; py < ROWS; ++py) {
+      HWY_ALIGN float x1[kBlockSize] = {0.0f};
+      HWY_ALIGN float x2[kBlockSize] = {0.0f};
+      HWY_ALIGN float coeffs1[kBlockSize] = {0.0f};
+      HWY_ALIGN float coeffs2[kBlockSize] = {0.0f};
+      x1[py * COLS + px] = 1;  // impulse at (py, px), ROWS x COLS layout
+      x2[px * ROWS + py] = 1;  // same impulse in the transposed layout
+
+      constexpr size_t OUT_ROWS = ROWS < COLS ? ROWS : COLS;  // min(ROWS, COLS)
+      constexpr size_t OUT_COLS = ROWS < COLS ? COLS : ROWS;  // max(ROWS, COLS)
+
+      ComputeScaledDCT<ROWS, COLS>()(DCTFrom(x1, COLS), coeffs1, scratch_space);
+      ComputeScaledDCT<COLS, ROWS>()(DCTFrom(x2, ROWS), coeffs2, scratch_space);
+
+      for (size_t x = 0; x < OUT_COLS; ++x) {
+        for (size_t y = 0; y < OUT_ROWS; ++y) {
+          EXPECT_NEAR(coeffs1[y * OUT_COLS + x], coeffs2[y * OUT_COLS + x],
+                      accuracy)
+              << " px = " << px << ", py = " << py << ", x = " << x
+              << ", y = " << y;
+        }
+      }
+    }
+  }
+}
+
+void TestRectTranspose() {
+  TestRectTransposeT<16, 32>(1e-6f);
+  TestRectTransposeT<8, 32>(1e-6f);
+  TestRectTransposeT<8, 16>(1e-6f);
+  TestRectTransposeT<4, 8>(1e-6f);
+  TestRectTransposeT<2, 4>(1e-6f);
+  TestRectTransposeT<1, 4>(1e-6f);
+  TestRectTransposeT<1, 2>(1e-6f);
+
+  // Identical to 8, 16
+  //  TestRectTransposeT<16, 8>(1e-6f);
+}
+
+void TestDctAccuracyShard(size_t shard) {
+  if (shard == 0) {  // sizes below 32 run in full, on shard 0 only
+    TestDctAccuracy<1>(1.1E-7f);
+    TestDctAccuracy<2>(1.1E-7f);
+    TestDctAccuracy<4>(1.1E-7f);
+    TestDctAccuracy<8>(1.1E-7f);
+    TestDctAccuracy<16>(1.3E-7f);
+  }
+  TestDctAccuracy<32>(1.1E-7f, 32 * shard, 32 * (shard + 1));  // 32 impulses
+}
+
+void TestIdctAccuracyShard(size_t shard) {
+  if (shard == 0) {  // sizes below 32 run in full, on shard 0 only
+    TestIdctAccuracy<1>(1E-7f);
+    TestIdctAccuracy<2>(1E-7f);
+    TestIdctAccuracy<4>(1E-7f);
+    TestIdctAccuracy<8>(1E-7f);
+    TestIdctAccuracy<16>(1E-7f);
+  }
+  TestIdctAccuracy<32>(1E-7f, 32 * shard, 32 * (shard + 1));  // 32 impulses
+}
+
+void TestDctTransposeShard(size_t shard) {
+  if (shard == 0) {  // sizes below 32 run in full, on shard 0 only
+    TestDctTranspose<8>(1E-6f);
+    TestDctTranspose<16>(1E-6f);
+  }
+  TestDctTranspose<32>(3E-6f, 32 * shard, 32 * (shard + 1));  // 32 values of i
+}
+
+void TestSlowInverseShard(size_t shard) {
+  if (shard == 0) {  // sizes below 32 run in full, on shard 0 only
+    TestSlowInverse<1>(1E-5f);
+    TestSlowInverse<2>(1E-5f);
+    TestSlowInverse<4>(1E-5f);
+    TestSlowInverse<8>(1E-5f);
+    TestSlowInverse<16>(1E-5f);
+  }
+  TestSlowInverse<32>(1E-5f, 32 * shard, 32 * (shard + 1));  // 32 impulses
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class TransposeTest : public hwy::TestWithParamTarget {};  // non-sharded tests
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(TransposeTest);
+
+HWY_EXPORT_AND_TEST_P(TransposeTest, TransposeTest);
+HWY_EXPORT_AND_TEST_P(TransposeTest, InverseTest);
+HWY_EXPORT_AND_TEST_P(TransposeTest, ColumnDctRoundtrip);
+HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectInverse);
+HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectTranspose);
+
+// Tests in the DctShardedTest class are sharded for N=32 (uint32_t shard id).
+class DctShardedTest : public ::hwy::TestWithParamTargetAndT<uint32_t> {};
+
+std::vector<uint32_t> ShardRange(uint32_t n) {  // shard ids to instantiate
+#ifdef JXL_DISABLE_SLOW_TESTS
+  JXL_ASSERT(n > 6);  // guarantees n - 1 differs from the fixed ids below
+  std::vector<uint32_t> ret = {0, 1, 3, 5, n - 1};  // reduced subset of shards
+#else
+  std::vector<uint32_t> ret(n);
+  std::iota(ret.begin(), ret.end(), 0);  // every shard: 0, 1, ..., n - 1
+#endif  // JXL_DISABLE_SLOW_TESTS
+  return ret;
+}
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(DctShardedTest,
+                                      ::testing::ValuesIn(ShardRange(32)));
+
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctAccuracyShard);  // per shard id
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestIdctAccuracyShard);
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctTransposeShard);
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestSlowInverseShard);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dct_util.h b/third_party/jpeg-xl/lib/jxl/dct_util.h
new file mode 100644
index 0000000000..fb6ce3b971
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_util.h
@@ -0,0 +1,86 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DCT_UTIL_H_
+#define LIB_JXL_DCT_UTIL_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+union ACPtr {  // untagged: the union does not record which member is active
+  int32_t* ptr32;
+  int16_t* ptr16;
+  ACPtr() = default;
+  explicit ACPtr(int16_t* p) : ptr16(p) {}
+  explicit ACPtr(int32_t* p) : ptr32(p) {}
+};
+
+union ConstACPtr {  // read-only counterpart of ACPtr, also untagged
+  const int32_t* ptr32;
+  const int16_t* ptr16;
+  ConstACPtr() = default;
+  explicit ConstACPtr(const int16_t* p) : ptr16(p) {}
+  explicit ConstACPtr(const int32_t* p) : ptr32(p) {}
+};
+
+enum class ACType { k16 = 0, k32 = 1 };  // element width reported by Type()
+
+class ACImage {  // abstract interface; ACImageT<T> below implements it
+ public:
+  virtual ~ACImage() = default;
+  virtual ACType Type() const = 0;  // tells which ACPtr member is meaningful
+  virtual ACPtr PlaneRow(size_t c, size_t y, size_t xbase) = 0;
+  virtual ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const = 0;
+  virtual size_t PixelsPerRow() const = 0;
+  virtual void ZeroFill() = 0;
+  virtual void ZeroFillPlane(size_t c) = 0;
+  virtual bool IsEmpty() const = 0;
+};
+
+template <typename T>
+class ACImageT final : public ACImage {  // ACImage backed by an Image3<T>
+ public:
+  ACImageT() = default;
+  ACImageT(size_t xsize, size_t ysize) {
+    static_assert(
+        std::is_same<T, int16_t>::value || std::is_same<T, int32_t>::value,
+        "ACImage must be either 32- or 16- bit");
+    img_ = Image3<T>(xsize, ysize);
+  }
+  ACType Type() const override {
+    return sizeof(T) == 2 ? ACType::k16 : ACType::k32;  // derived from T's size
+  }
+  ACPtr PlaneRow(size_t c, size_t y, size_t xbase) override {
+    return ACPtr(img_.PlaneRow(c, y) + xbase);  // row pointer offset by xbase
+  }
+  ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const override {
+    return ConstACPtr(img_.PlaneRow(c, y) + xbase);
+  }
+
+  size_t PixelsPerRow() const override { return img_.PixelsPerRow(); }
+
+  void ZeroFill() override { ZeroFillImage(&img_); }
+
+  void ZeroFillPlane(size_t c) override { ZeroFillImage(&img_.Plane(c)); }
+
+  bool IsEmpty() const override {
+    return img_.xsize() == 0 || img_.ysize() == 0;  // either dimension is zero
+  }
+
+ private:
+  Image3<T> img_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DCT_UTIL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.cc b/third_party/jpeg-xl/lib/jxl/dec_ans.cc
new file mode 100644
index 0000000000..c9145472e0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_ans.cc
@@ -0,0 +1,374 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_ans.h"
+
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace {
+
+// Reads a variable-length integer in [0..255], consuming 1 to 11 bits:
+// one presence bit, a 3-bit width, then `width` payload bits.
+inline int DecodeVarLenUint8(BitReader* input) {
+  // A cleared first bit encodes the value 0 with nothing else following.
+  if (!input->ReadFixedBits<1>()) return 0;
+  const int width = static_cast<int>(input->ReadFixedBits<3>());
+  // Width 0 encodes the value 1 and carries no payload.
+  if (width == 0) return 1;
+  // Otherwise the value lies in [2^width, 2^(width + 1)).
+  const int payload = static_cast<int>(input->ReadBits(width));
+  return payload + (1 << width);
+}
+
+// Reads a variable-length integer in [0..65535], consuming 1 to 20 bits:
+// one presence bit, a 4-bit width, then `width` payload bits.
+inline int DecodeVarLenUint16(BitReader* input) {
+  // A cleared first bit encodes the value 0 with nothing else following.
+  if (!input->ReadFixedBits<1>()) return 0;
+  const int width = static_cast<int>(input->ReadFixedBits<4>());
+  // Width 0 encodes the value 1 and carries no payload.
+  if (width == 0) return 1;
+  // Otherwise the value lies in [2^width, 2^(width + 1)).
+  const int payload = static_cast<int>(input->ReadBits(width));
+  return payload + (1 << width);
+}
+
+// Decodes one symbol-count histogram (target total: 1 << precision_bits).
+Status ReadHistogram(int precision_bits, std::vector<int32_t>* counts,
+                     BitReader* input) {
+  int simple_code = input->ReadBits(1);
+  if (simple_code == 1) {
+    int symbols[2] = {0};
+    int max_symbol = 0;
+    const int num_symbols = input->ReadBits(1) + 1;
+    for (int i = 0; i < num_symbols; ++i) {
+      symbols[i] = DecodeVarLenUint8(input);
+      if (symbols[i] > max_symbol) max_symbol = symbols[i];
+    }
+    counts->resize(max_symbol + 1);
+    if (num_symbols == 1) {
+      (*counts)[symbols[0]] = 1 << precision_bits;
+    } else {
+      if (symbols[0] == symbols[1]) {  // corrupt data
+        return false;
+      }
+      (*counts)[symbols[0]] = input->ReadBits(precision_bits);
+      (*counts)[symbols[1]] = (1 << precision_bits) - (*counts)[symbols[0]];
+    }
+  } else {
+    int is_flat = input->ReadBits(1);
+    if (is_flat == 1) {
+      int alphabet_size = DecodeVarLenUint8(input) + 1;
+      *counts = CreateFlatHistogram(alphabet_size, 1 << precision_bits);
+      return true;
+    }
+
+    uint32_t shift;
+    {
+      // TODO(veluca): speed up reading with table lookups.
+      int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1);
+      int log = 0;
+      for (; log < upper_bound_log; log++) {
+        if (input->ReadFixedBits<1>() == 0) break;
+      }
+      shift = (input->ReadBits(log) | (1 << log)) - 1;
+      if (shift > ANS_LOG_TAB_SIZE + 1) {
+        return JXL_FAILURE("Invalid shift value");
+      }
+    }
+
+    int length = DecodeVarLenUint8(input) + 3;
+    counts->resize(length);
+    int total_count = 0;
+
+    static const uint8_t huff[128][2] = {
+        {3, 10}, {7, 12}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {7, 13}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+    };
+
+    std::vector<int> logcounts(counts->size());
+    int omit_log = -1;
+    int omit_pos = -1;
+    // This array remembers which symbols have an RLE length.
+    std::vector<int> same(counts->size(), 0);
+    for (size_t i = 0; i < logcounts.size(); ++i) {
+      input->Refill();  // for PeekFixedBits + Consume
+      int idx = input->PeekFixedBits<7>();
+      input->Consume(huff[idx][0]);
+      logcounts[i] = huff[idx][1];
+      // The RLE symbol.
+      if (logcounts[i] == ANS_LOG_TAB_SIZE + 1) {
+        int rle_length = DecodeVarLenUint8(input);
+        same[i] = rle_length + 5;
+        i += rle_length + 3;
+        continue;
+      }
+      if (logcounts[i] > omit_log) {
+        omit_log = logcounts[i];
+        omit_pos = i;
+      }
+    }
+    // Invalid input, e.g. due to invalid usage of RLE.
+    if (omit_pos < 0) return JXL_FAILURE("Invalid histogram.");
+    if (static_cast<size_t>(omit_pos) + 1 < logcounts.size() &&
+        logcounts[omit_pos + 1] == ANS_LOG_TAB_SIZE + 1) {  // RLE marker
+      return JXL_FAILURE("Invalid histogram.");
+    }
+    int prev = 0;
+    int numsame = 0;
+    for (size_t i = 0; i < logcounts.size(); ++i) {
+      if (same[i]) {
+        // RLE sequence, let this loop output the same count for the next
+        // iterations.
+        numsame = same[i] - 1;
+        prev = i > 0 ? (*counts)[i - 1] : 0;
+      }
+      if (numsame > 0) {
+        (*counts)[i] = prev;
+        numsame--;
+      } else {
+        int code = logcounts[i];
+        // omit_pos may not be negative at this point (checked before).
+        // The omitted symbol is filled in after this loop and code 0 leaves
+        // the count untouched; neither is added to total_count here.
+        if (i == static_cast<size_t>(omit_pos) || code == 0) {
+          continue;
+        } else if (code == 1) {
+          (*counts)[i] = 1;
+        } else {
+          int bitcount = GetPopulationCountPrecision(code - 1, shift);
+          (*counts)[i] = (1 << (code - 1)) +
+                         (input->ReadBits(bitcount) << (code - 1 - bitcount));
+        }
+      }
+      total_count += (*counts)[i];
+    }
+    (*counts)[omit_pos] = (1 << precision_bits) - total_count;
+    if ((*counts)[omit_pos] <= 0) {
+      // The histogram we've read sums to more than 1 << precision_bits
+      // (including at least 1 for the omitted value).
+      return JXL_FAILURE("Invalid histogram count.");
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DecodeANSCodes(const size_t num_histograms,
+                      const size_t max_alphabet_size, BitReader* in,
+                      ANSCode* result) {  // Reads prefix or ANS codes.
+  result->degenerate_symbols.resize(num_histograms, -1);
+  if (result->use_prefix_code) {
+    JXL_ASSERT(max_alphabet_size <= 1 << PREFIX_MAX_BITS);
+    result->huffman_data.resize(num_histograms);
+    std::vector<uint32_t> alphabet_sizes(num_histograms);  // 65536 must fit
+    for (size_t c = 0; c < num_histograms; c++) {
+      alphabet_sizes[c] = DecodeVarLenUint16(in) + 1;
+      if (alphabet_sizes[c] > max_alphabet_size) {
+        return JXL_FAILURE("Alphabet size is too long: %u", alphabet_sizes[c]);
+      }
+    }
+    for (size_t c = 0; c < num_histograms; c++) {
+      if (alphabet_sizes[c] > 1) {
+        if (!result->huffman_data[c].ReadFromBitStream(alphabet_sizes[c], in)) {
+          if (!in->AllReadsWithinBounds()) {
+            return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                              "Not enough bytes for huffman code");
+          }
+          return JXL_FAILURE("Invalid huffman tree number %" PRIuS
+                             ", alphabet size %u",
+                             c, alphabet_sizes[c]);
+        }
+      } else {
+        // 0-bit codes do not require extension tables.
+        result->huffman_data[c].table_.clear();
+        result->huffman_data[c].table_.resize(1u << kHuffmanTableBits);
+      }
+      for (const auto& h : result->huffman_data[c].table_) {
+        if (h.bits <= kHuffmanTableBits) {
+          result->UpdateMaxNumBits(c, h.value);
+        }
+      }
+    }
+  } else {  // ANS: one alias table of 1 << log_alpha_size entries each.
+    JXL_ASSERT(max_alphabet_size <= ANS_MAX_ALPHABET_SIZE);
+    result->alias_tables =
+        AllocateArray(num_histograms * (1 << result->log_alpha_size) *
+                      sizeof(AliasTable::Entry));
+    AliasTable::Entry* alias_tables =
+        reinterpret_cast<AliasTable::Entry*>(result->alias_tables.get());
+    for (size_t c = 0; c < num_histograms; ++c) {
+      std::vector<int32_t> counts;
+      if (!ReadHistogram(ANS_LOG_TAB_SIZE, &counts, in)) {
+        return JXL_FAILURE("Invalid histogram bitstream.");
+      }
+      if (counts.size() > max_alphabet_size) {
+        return JXL_FAILURE("Alphabet size is too long: %" PRIuS, counts.size());
+      }
+      while (!counts.empty() && counts.back() == 0) {
+        counts.pop_back();
+      }
+      for (size_t s = 0; s < counts.size(); s++) {
+        if (counts[s] != 0) {
+          result->UpdateMaxNumBits(c, s);
+        }
+      }
+      // InitAliasTable "fixes" empty counts to contain degenerate "0" symbol.
+      int degenerate_symbol = counts.empty() ? 0 : (counts.size() - 1);
+      for (int s = 0; s < degenerate_symbol; ++s) {
+        if (counts[s] != 0) {
+          degenerate_symbol = -1;
+          break;
+        }
+      }
+      result->degenerate_symbols[c] = degenerate_symbol;
+      InitAliasTable(counts, ANS_TAB_SIZE, result->log_alpha_size,
+                     alias_tables + c * (1 << result->log_alpha_size));
+    }
+  }
+  return true;
+}
+Status DecodeUintConfig(size_t log_alpha_size, HybridUintConfig* uint_config,
+                        BitReader* br) {
+  // Reads split_exponent, then (unless it equals log_alpha_size) the
+  // msb/lsb fields, each with a width derived from the values read so far.
+  br->Refill();
+  const size_t split = br->ReadBits(CeilLog2Nonzero(log_alpha_size + 1));
+  size_t msb = 0;
+  size_t lsb = 0;
+  if (split != log_alpha_size) {
+    msb = br->ReadBits(CeilLog2Nonzero(split + 1));
+    // Must be rejected here: the width of the next read is computed from
+    // split - msb, which would wrap around if msb were larger.
+    if (msb > split) {
+      return JXL_FAILURE("Invalid HybridUintConfig");
+    }
+    lsb = br->ReadBits(CeilLog2Nonzero(split - msb + 1));
+  }
+  if (msb + lsb > split) {
+    return JXL_FAILURE("Invalid HybridUintConfig");
+  }
+  *uint_config = HybridUintConfig(split, msb, lsb);
+  return true;
+}
+
+Status DecodeUintConfigs(size_t log_alpha_size,
+                         std::vector<HybridUintConfig>* uint_config,
+                         BitReader* br) {
+  // Decodes every entry in order, stopping at the first failure.
+  // TODO(veluca): RLE?
+  for (HybridUintConfig& config : *uint_config) {
+    JXL_RETURN_IF_ERROR(DecodeUintConfig(log_alpha_size, &config, br));
+  }
+  return true;
+}
+
+LZ77Params::LZ77Params() { Bundle::Init(this); }
+Status LZ77Params::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &enabled));
+  if (!visitor->Conditional(enabled)) return true;  // presumably: disabled
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(224), Val(512), Val(4096),
+                                         BitsOffset(15, 8), 224, &min_symbol));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(3), Val(4), BitsOffset(2, 5),
+                                         BitsOffset(8, 9), 3, &min_length));
+  return true;
+}
+
+void ANSCode::UpdateMaxNumBits(size_t ctx, size_t symbol) {
+  const HybridUintConfig* config = &uint_config[ctx];
+  // Symbols at or above min_symbol outside the distance context are LZ77
+  // lengths: rebase them and switch to the length config.
+  const bool is_lz77_length = lz77.enabled && symbol >= lz77.min_symbol &&
+                              lz77.nonserialized_distance_context != ctx;
+  if (is_lz77_length) {
+    symbol -= lz77.min_symbol;
+    config = &lz77.length_uint_config;
+  }
+  const size_t msb_in_token = config->msb_in_token;
+  const size_t in_token = msb_in_token + config->lsb_in_token;
+  const size_t split_exponent = config->split_exponent;
+  const size_t split_token = config->split_token;
+  size_t bits_needed = split_exponent;
+  if (symbol >= split_token) {
+    // Stored as uint32_t, matching nbits in ReadHybridUintConfig.
+    const uint32_t n_extra_bits =
+        split_exponent - in_token + ((symbol - split_token) >> in_token);
+    bits_needed = in_token + n_extra_bits + 1;
+  }
+  max_num_bits = std::max(max_num_bits, bits_needed);
+}
+
+Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code,
+                        std::vector<uint8_t>* context_map, bool disallow_lz77) {
+  PROFILER_FUNC;
+  JXL_RETURN_IF_ERROR(Bundle::Read(br, &code->lz77));
+  if (code->lz77.enabled) {
+    num_contexts++;  // extra last context, used for LZ77 distances
+    JXL_RETURN_IF_ERROR(DecodeUintConfig(/*log_alpha_size=*/8,
+                                         &code->lz77.length_uint_config, br));
+  }
+  if (code->lz77.enabled && disallow_lz77) {
+    return JXL_FAILURE("Using LZ77 when explicitly disallowed");
+  }
+  size_t num_histograms = 1;  // kept when no context map is decoded
+  context_map->resize(num_contexts);  // back() below requires size >= 1
+  if (num_contexts > 1) {
+    JXL_RETURN_IF_ERROR(DecodeContextMap(context_map, &num_histograms, br));
+  }
+  code->lz77.nonserialized_distance_context = context_map->back();
+  code->use_prefix_code = br->ReadFixedBits<1>();
+  if (code->use_prefix_code) {
+    code->log_alpha_size = PREFIX_MAX_BITS;
+  } else {
+    code->log_alpha_size = br->ReadFixedBits<2>() + 5;  // 5..8
+  }
+  code->uint_config.resize(num_histograms);
+  JXL_RETURN_IF_ERROR(
+      DecodeUintConfigs(code->log_alpha_size, &code->uint_config, br));
+  const size_t max_alphabet_size = 1 << code->log_alpha_size;
+  JXL_RETURN_IF_ERROR(
+      DecodeANSCodes(num_histograms, max_alphabet_size, br, code));
+  // When using LZ77, flat codes might result in valid codestreams with
+  // histograms that potentially allow very large bit counts.
+  // TODO(veluca): in principle, a valid codestream might contain a histogram
+  // that could allow very large numbers of bits that is never used during ANS
+  // decoding. There's no benefit to doing that, though.
+  if (!code->lz77.enabled && code->max_num_bits > 32) {
+    // Just emit a warning as there are many opportunities for false positives.
+    JXL_WARNING("Histogram can represent numbers that are too large: %" PRIuS
+                "\n",
+                code->max_num_bits);
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.h b/third_party/jpeg-xl/lib/jxl/dec_ans.h
new file mode 100644
index 0000000000..0f4406745a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_ans.h
@@ -0,0 +1,462 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_ANS_H_
+#define LIB_JXL_DEC_ANS_H_
+
+// Library to decode the ANS population counts from the bit-stream and build a
+// decoding table from them.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cstring>
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_huffman.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+class ANSSymbolReader;
+
+// Experiments show that best performance is typically achieved for a
+// split-exponent of 3 or 4. Trend seems to be that '4' is better
+// for large-ish pictures, and '3' better for rather small-ish pictures.
+// This is plausible - the more special symbols we have, the better
+// statistics we need to get a benefit out of them.
+
+// Our hybrid-encoding scheme has dedicated tokens for the smallest
+// (1 << split_exponent) numbers, and for the rest
+// encodes (number of bits) + (msb_in_token sub-leading binary digits) +
+// (lsb_in_token lowest binary digits) in the token, with the remaining bits
+// then being encoded as data.
+//
+// Example with split_exponent = 4, msb_in_token = 2, lsb_in_token = 0.
+//
+// Numbers N in [0 .. 15]:
+//   These get represented as (token=N, bits='').
+// Numbers N >= 16:
+//   If n is such that 2**n <= N < 2**(n+1),
+//   and m = N - 2**n is the 'mantissa',
+//   these get represented as:
+// (token=split_token +
+//        ((n - split_exponent) * 4) +
+//        (m >> (n - msb_in_token)),
+//  bits=m & (1 << (n - msb_in_token)) - 1)
+// Specifically, we would get:
+// N = 0 - 15:          (token=N, nbits=0, bits='')
+// N = 16 (10000):      (token=16, nbits=2, bits='00')
+// N = 17 (10001):      (token=16, nbits=2, bits='01')
+// N = 20 (10100):      (token=17, nbits=2, bits='00')
+// N = 24 (11000):      (token=18, nbits=2, bits='00')
+// N = 28 (11100):      (token=19, nbits=2, bits='00')
+// N = 32 (100000):     (token=20, nbits=3, bits='000')
+// N = 65535:           (token=63, nbits=13, bits='1111111111111')
+struct HybridUintConfig {
+  uint32_t split_exponent;  // Values below 1 << split_exponent are tokens.
+  uint32_t split_token;     // 1 << split_exponent, set by the constructor.
+  uint32_t msb_in_token;    // Bits after the leading one kept in the token.
+  uint32_t lsb_in_token;    // Lowest bits of the value kept in the token.
+  JXL_INLINE void Encode(uint32_t value, uint32_t* JXL_RESTRICT token,
+                         uint32_t* JXL_RESTRICT nbits,
+                         uint32_t* JXL_RESTRICT bits) const {
+    if (value < split_token) {  // Small values are their own token.
+      *token = value;
+      *nbits = 0;
+      *bits = 0;
+    } else {
+      uint32_t n = FloorLog2Nonzero(value);  // Index of the leading one.
+      uint32_t m = value - (1 << n);         // Value without the leading one.
+      *token = split_token +
+               ((n - split_exponent) << (msb_in_token + lsb_in_token)) +
+               ((m >> (n - msb_in_token)) << lsb_in_token) +
+               (m & ((1 << lsb_in_token) - 1));
+      *nbits = n - msb_in_token - lsb_in_token;
+      *bits = (value >> lsb_in_token) & ((1UL << *nbits) - 1);
+    }
+  }
+
+  explicit HybridUintConfig(uint32_t split_exponent = 4,
+                            uint32_t msb_in_token = 2,
+                            uint32_t lsb_in_token = 0)
+      : split_exponent(split_exponent),
+        split_token(1 << split_exponent),
+        msb_in_token(msb_in_token),
+        lsb_in_token(lsb_in_token) {
+    JXL_DASSERT(split_exponent >= msb_in_token + lsb_in_token);
+  }
+};
+
+struct LZ77Params : public Fields {
+  LZ77Params();
+  JXL_FIELDS_NAME(LZ77Params)
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+  bool enabled;  // Visited first; gates the two fields below.
+
+  // Symbols at or above min_symbol use a special hybrid uint encoding and
+  // represent a length, to be added to min_length.
+  uint32_t min_symbol;
+  uint32_t min_length;
+
+  // Not serialized by VisitFields.
+  HybridUintConfig length_uint_config{0, 0, 0};
+
+  size_t nonserialized_distance_context;  // Set by DecodeHistograms.
+};
+
+static constexpr size_t kWindowSize = 1 << 20;  // LZ77 window, in entries.
+static constexpr size_t kNumSpecialDistances = 120;  // Rows in table below.
+// Table of special distance codes from WebP lossless.
+static constexpr int8_t kSpecialDistances[kNumSpecialDistances][2] = {
+    {0, 1},  {1, 0},  {1, 1},  {-1, 1}, {0, 2},  {2, 0},  {1, 2},  {-1, 2},
+    {2, 1},  {-2, 1}, {2, 2},  {-2, 2}, {0, 3},  {3, 0},  {1, 3},  {-1, 3},
+    {3, 1},  {-3, 1}, {2, 3},  {-2, 3}, {3, 2},  {-3, 2}, {0, 4},  {4, 0},
+    {1, 4},  {-1, 4}, {4, 1},  {-4, 1}, {3, 3},  {-3, 3}, {2, 4},  {-2, 4},
+    {4, 2},  {-4, 2}, {0, 5},  {3, 4},  {-3, 4}, {4, 3},  {-4, 3}, {5, 0},
+    {1, 5},  {-1, 5}, {5, 1},  {-5, 1}, {2, 5},  {-2, 5}, {5, 2},  {-5, 2},
+    {4, 4},  {-4, 4}, {3, 5},  {-3, 5}, {5, 3},  {-5, 3}, {0, 6},  {6, 0},
+    {1, 6},  {-1, 6}, {6, 1},  {-6, 1}, {2, 6},  {-2, 6}, {6, 2},  {-6, 2},
+    {4, 5},  {-4, 5}, {5, 4},  {-5, 4}, {3, 6},  {-3, 6}, {6, 3},  {-6, 3},
+    {0, 7},  {7, 0},  {1, 7},  {-1, 7}, {5, 5},  {-5, 5}, {7, 1},  {-7, 1},
+    {4, 6},  {-4, 6}, {6, 4},  {-6, 4}, {2, 7},  {-2, 7}, {7, 2},  {-7, 2},
+    {3, 7},  {-3, 7}, {7, 3},  {-7, 3}, {5, 6},  {-5, 6}, {6, 5},  {-6, 5},
+    {8, 0},  {4, 7},  {-4, 7}, {7, 4},  {-7, 4}, {8, 1},  {8, 2},  {6, 6},
+    {-6, 6}, {8, 3},  {5, 7},  {-5, 7}, {7, 5},  {-7, 5}, {8, 4},  {6, 7},
+    {-6, 7}, {7, 6},  {-7, 6}, {8, 5},  {7, 7},  {-7, 7}, {8, 6},  {8, 7}};
+
+struct ANSCode {
+  CacheAlignedUniquePtr alias_tables;  // Filled on the ANS path only.
+  std::vector<HuffmanDecodingData> huffman_data;  // Prefix-code path only.
+  std::vector<HybridUintConfig> uint_config;  // One entry per histogram.
+  std::vector<int> degenerate_symbols;  // Per histogram; -1 if none.
+  bool use_prefix_code;
+  uint8_t log_alpha_size;  // for ANS.
+  LZ77Params lz77;
+  // Maximum number of bits necessary to represent the result of a
+  // ReadHybridUint call done with this ANSCode.
+  size_t max_num_bits = 0;
+  void UpdateMaxNumBits(size_t ctx, size_t symbol);
+};
+
+class ANSSymbolReader {
+ public:
+  // Invalid symbol reader, to be overwritten.
+  ANSSymbolReader() = default;
+  ANSSymbolReader(const ANSCode* code, BitReader* JXL_RESTRICT br,
+                  size_t distance_multiplier = 0)
+      : alias_tables_(
+            reinterpret_cast<AliasTable::Entry*>(code->alias_tables.get())),
+        huffman_data_(code->huffman_data.data()),
+        use_prefix_code_(code->use_prefix_code),
+        configs(code->uint_config.data()) {
+    if (!use_prefix_code_) {
+      state_ = static_cast<uint32_t>(br->ReadFixedBits<32>());
+      log_alpha_size_ = code->log_alpha_size;
+      log_entry_size_ = ANS_LOG_TAB_SIZE - code->log_alpha_size;
+      entry_size_minus_1_ = (1 << log_entry_size_) - 1;
+    } else {
+      state_ = (ANS_SIGNATURE << 16u);
+    }
+    if (!code->lz77.enabled) return;
+    // a std::vector incurs unacceptable decoding speed loss because of
+    // initialization.
+    lz77_window_storage_ = AllocateArray(kWindowSize * sizeof(uint32_t));
+    lz77_window_ = reinterpret_cast<uint32_t*>(lz77_window_storage_.get());
+    lz77_ctx_ = code->lz77.nonserialized_distance_context;
+    lz77_length_uint_ = code->lz77.length_uint_config;
+    lz77_threshold_ = code->lz77.min_symbol;
+    lz77_min_length_ = code->lz77.min_length;
+    num_special_distances_ =
+        distance_multiplier == 0 ? 0 : kNumSpecialDistances;
+    for (size_t i = 0; i < num_special_distances_; i++) {
+      int dist = kSpecialDistances[i][0];
+      dist += static_cast<int>(distance_multiplier) * kSpecialDistances[i][1];
+      if (dist < 1) dist = 1;
+      special_distances_[i] = dist;
+    }
+  }
+
+  JXL_INLINE size_t ReadSymbolANSWithoutRefill(const size_t histo_idx,
+                                               BitReader* JXL_RESTRICT br) {
+    const uint32_t res = state_ & (ANS_TAB_SIZE - 1u);
+
+    const AliasTable::Entry* table =
+        &alias_tables_[histo_idx << log_alpha_size_];
+    const AliasTable::Symbol symbol =
+        AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_);
+    state_ = symbol.freq * (state_ >> ANS_LOG_TAB_SIZE) + symbol.offset;
+
+#if 1
+    // Branchless version is about equally fast on SKX.
+    const uint32_t new_state =
+        (state_ << 16u) | static_cast<uint32_t>(br->PeekFixedBits<16>());
+    const bool normalize = state_ < (1u << 16u);
+    state_ = normalize ? new_state : state_;
+    br->Consume(normalize ? 16 : 0);
+#else
+    if (JXL_UNLIKELY(state_ < (1u << 16u))) {
+      state_ = (state_ << 16u) | br->PeekFixedBits<16>();
+      br->Consume(16);
+    }
+#endif
+    const uint32_t next_res = state_ & (ANS_TAB_SIZE - 1u);
+    AliasTable::Prefetch(table, next_res, log_entry_size_);
+
+    return symbol.value;
+  }
+
+  JXL_INLINE size_t ReadSymbolHuffWithoutRefill(const size_t histo_idx,
+                                                BitReader* JXL_RESTRICT br) {
+    return huffman_data_[histo_idx].ReadSymbol(br);
+  }
+
+  JXL_INLINE size_t ReadSymbolWithoutRefill(const size_t histo_idx,
+                                            BitReader* JXL_RESTRICT br) {
+    // TODO(veluca): hoist if in hotter loops.
+    if (JXL_UNLIKELY(use_prefix_code_)) {
+      return ReadSymbolHuffWithoutRefill(histo_idx, br);
+    }
+    return ReadSymbolANSWithoutRefill(histo_idx, br);
+  }
+
+  JXL_INLINE size_t ReadSymbol(const size_t histo_idx,
+                               BitReader* JXL_RESTRICT br) {
+    br->Refill();
+    return ReadSymbolWithoutRefill(histo_idx, br);
+  }
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  bool CheckANSFinalState() const { return true; }
+#else
+  bool CheckANSFinalState() const { return state_ == (ANS_SIGNATURE << 16u); }
+#endif
+
+  template <typename BitReader>
+  static JXL_INLINE uint32_t ReadHybridUintConfig(
+      const HybridUintConfig& config, size_t token, BitReader* br) {
+    size_t split_token = config.split_token;
+    size_t msb_in_token = config.msb_in_token;
+    size_t lsb_in_token = config.lsb_in_token;
+    size_t split_exponent = config.split_exponent;
+    // Fast-track version of hybrid integer decoding.
+    if (token < split_token) return token;
+    uint32_t nbits = split_exponent - (msb_in_token + lsb_in_token) +
+                     ((token - split_token) >> (msb_in_token + lsb_in_token));
+    // Max amount of bits for ReadBits is 32 and max valid left shift is 29
+    // bits. However, for speed no error is propagated here, instead limit the
+    // nbits size. If nbits > 29, the code stream is invalid, but no error is
+    // returned.
+    // Note that in most cases we will emit an error if the histogram allows
+    // representing numbers that would cause invalid shifts, but we need to
+    // keep this check as when LZ77 is enabled it might make sense to have an
+    // histogram that could in principle cause invalid shifts.
+    nbits &= 31u;
+    uint32_t low = token & ((1 << lsb_in_token) - 1);
+    token >>= lsb_in_token;
+    const size_t bits = br->PeekBits(nbits);
+    br->Consume(nbits);
+    size_t ret = (((((1 << msb_in_token) | (token & ((1 << msb_in_token) - 1)))
+                    << nbits) |
+                   bits)
+                  << lsb_in_token) |
+                 low;
+    // TODO(eustas): mark BitReader as unhealthy if nbits > 29 or ret does not
+    //               fit uint32_t
+    return static_cast<uint32_t>(ret);
+  }
+
+  // Takes a *clustered* idx. Can only use if HuffRleOnly() is true.
+  void ReadHybridUintClusteredHuffRleOnly(size_t ctx,
+                                          BitReader* JXL_RESTRICT br,
+                                          uint32_t* value, uint32_t* run) {
+    JXL_DASSERT(HuffRleOnly());
+    br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+    size_t token = ReadSymbolHuffWithoutRefill(ctx, br);
+    if (JXL_UNLIKELY(token >= lz77_threshold_)) {
+      *run =
+          ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) +
+          lz77_min_length_ - 1;
+      return;
+    }
+    *value = ReadHybridUintConfig(configs[ctx], token, br);
+  }
+  bool HuffRleOnly() {
+    if (lz77_window_ == nullptr) return false;  // LZ77 is not enabled.
+    if (!use_prefix_code_) return false;
+    for (size_t i = 0; i < kHuffmanTableBits; i++) {  // first entries only
+      if (huffman_data_[lz77_ctx_].table_[i].bits) return false;
+      if (huffman_data_[lz77_ctx_].table_[i].value != 1) return false;
+    }
+    if (configs[lz77_ctx_].split_token > 1) return false;
+    return true;
+  }
+
+  // Takes a *clustered* idx.
+  size_t ReadHybridUintClustered(size_t ctx, BitReader* JXL_RESTRICT br) {
+    if (JXL_UNLIKELY(num_to_copy_ > 0)) {  // mid-copy: replay from window
+      size_t ret = lz77_window_[(copy_pos_++) & kWindowMask];
+      num_to_copy_--;
+      lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+      return ret;
+    }
+    br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+    size_t token = ReadSymbolWithoutRefill(ctx, br);
+    if (JXL_UNLIKELY(token >= lz77_threshold_)) {  // token starts a copy
+      num_to_copy_ =
+          ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) +
+          lz77_min_length_;
+      br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+      // Distance code.
+      size_t token = ReadSymbolWithoutRefill(lz77_ctx_, br);  // shadows outer
+      size_t distance = ReadHybridUintConfig(configs[lz77_ctx_], token, br);
+      if (JXL_LIKELY(distance < num_special_distances_)) {
+        distance = special_distances_[distance];
+      } else {
+        distance = distance + 1 - num_special_distances_;
+      }
+      if (JXL_UNLIKELY(distance > num_decoded_)) {  // clamp to decoded count
+        distance = num_decoded_;
+      }
+      if (JXL_UNLIKELY(distance > kWindowSize)) {  // clamp to window size
+        distance = kWindowSize;
+      }
+      copy_pos_ = num_decoded_ - distance;
+      if (JXL_UNLIKELY(distance == 0)) {
+        JXL_DASSERT(lz77_window_ != nullptr);
+        // distance 0 -> num_decoded_ == copy_pos_ == 0
+        size_t to_fill = std::min<size_t>(num_to_copy_, kWindowSize);
+        memset(lz77_window_, 0, to_fill * sizeof(lz77_window_[0]));
+      }
+      // TODO(eustas): overflow; mark BitReader as unhealthy
+      if (num_to_copy_ < lz77_min_length_) return 0;
+      return ReadHybridUintClustered(ctx, br);  // will trigger a copy.
+    }
+    size_t ret = ReadHybridUintConfig(configs[ctx], token, br);
+    if (lz77_window_) lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+    return ret;
+  }
+
+  JXL_INLINE size_t ReadHybridUint(size_t ctx, BitReader* JXL_RESTRICT br,
+                                   const std::vector<uint8_t>& context_map) {
+    return ReadHybridUintClustered(context_map[ctx], br);
+  }
+
+  // ctx is a *clustered* context!
+  // This function will modify the ANS state as if `count` symbols have been
+  // decoded.
+  bool IsSingleValueAndAdvance(size_t ctx, uint32_t* value, size_t count) {
+    // TODO(veluca): No optimization for Huffman mode yet.
+    if (use_prefix_code_) return false;
+    // TODO(eustas): propagate "degenerate_symbol" to simplify this method.
+    const uint32_t res = state_ & (ANS_TAB_SIZE - 1u);
+    const AliasTable::Entry* table = &alias_tables_[ctx << log_alpha_size_];
+    AliasTable::Symbol symbol =
+        AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_);
+    if (symbol.freq != ANS_TAB_SIZE) return false;
+    if (configs[ctx].split_token <= symbol.value) return false;
+    if (symbol.value >= lz77_threshold_) return false;
+    *value = symbol.value;
+    if (lz77_window_) {
+      for (size_t i = 0; i < count; i++) {
+        lz77_window_[(num_decoded_++) & kWindowMask] = symbol.value;
+      }
+    }
+    return true;
+  }
+
+  static constexpr size_t kMaxCheckpointInterval = 512;
+  struct Checkpoint {
+    uint32_t state;
+    uint32_t num_to_copy;
+    uint32_t copy_pos;
+    uint32_t num_decoded;
+    uint32_t lz77_window[kMaxCheckpointInterval];
+  };
+  void Save(Checkpoint* checkpoint) {
+    checkpoint->state = state_;
+    checkpoint->num_decoded = num_decoded_;
+    checkpoint->num_to_copy = num_to_copy_;
+    checkpoint->copy_pos = copy_pos_;
+    if (lz77_window_) {
+      size_t win_start = num_decoded_ & kWindowMask;
+      size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask;
+      if (win_end > win_start) {
+        memcpy(checkpoint->lz77_window, lz77_window_ + win_start,
+               (win_end - win_start) * sizeof(*lz77_window_));
+      } else {
+        memcpy(checkpoint->lz77_window, lz77_window_ + win_start,
+               (kWindowSize - win_start) * sizeof(*lz77_window_));
+        memcpy(checkpoint->lz77_window + (kWindowSize - win_start),
+               lz77_window_, win_end * sizeof(*lz77_window_));
+      }
+    }
+  }
+  void Restore(const Checkpoint& checkpoint) {
+    state_ = checkpoint.state;
+    JXL_DASSERT(num_decoded_ <=
+                checkpoint.num_decoded + kMaxCheckpointInterval);
+    num_decoded_ = checkpoint.num_decoded;
+    num_to_copy_ = checkpoint.num_to_copy;
+    copy_pos_ = checkpoint.copy_pos;
+    if (lz77_window_) {
+      size_t win_start = num_decoded_ & kWindowMask;
+      size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask;
+      if (win_end > win_start) {
+        memcpy(lz77_window_ + win_start, checkpoint.lz77_window,
+               (win_end - win_start) * sizeof(*lz77_window_));
+      } else {
+        memcpy(lz77_window_ + win_start, checkpoint.lz77_window,
+               (kWindowSize - win_start) * sizeof(*lz77_window_));
+        memcpy(lz77_window_, checkpoint.lz77_window + (kWindowSize - win_start),
+               win_end * sizeof(*lz77_window_));
+      }
+    }
+  }
+
+ private:
+  const AliasTable::Entry* JXL_RESTRICT alias_tables_;  // not owned
+  const HuffmanDecodingData* huffman_data_;
+  bool use_prefix_code_;
+  uint32_t state_ = ANS_SIGNATURE << 16u;
+  const HybridUintConfig* JXL_RESTRICT configs;
+  uint32_t log_alpha_size_{};
+  uint32_t log_entry_size_{};
+  uint32_t entry_size_minus_1_{};
+
+  // LZ77 structures and constants.
+  static constexpr size_t kWindowMask = kWindowSize - 1;
+  CacheAlignedUniquePtr lz77_window_storage_;
+  uint32_t* lz77_window_ = nullptr;
+  uint32_t num_decoded_ = 0;
+  uint32_t num_to_copy_ = 0;
+  uint32_t copy_pos_ = 0;
+  uint32_t lz77_ctx_ = 0;
+  uint32_t lz77_min_length_ = 0;
+  uint32_t lz77_threshold_ = 1 << 20;  // bigger than any symbol.
+  HybridUintConfig lz77_length_uint_;
+  uint32_t special_distances_[kNumSpecialDistances]{};
+  uint32_t num_special_distances_{};
+};
+
+Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code,
+                        std::vector<uint8_t>* context_map,
+                        bool disallow_lz77 = false);
+
+// Exposed for tests.
+Status DecodeUintConfigs(size_t log_alpha_size,
+                         std::vector<HybridUintConfig>* uint_config,
+                         BitReader* br);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_ANS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h b/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h
new file mode 100644
index 0000000000..df70284e3b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h
@@ -0,0 +1,354 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_BIT_READER_H_
+#define LIB_JXL_DEC_BIT_READER_H_
+
+// Bounds-checked bit reader; 64-bit buffer with support for deferred refills
+// and switching to reading byte-aligned words.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#ifdef __BMI2__
+#include <immintrin.h>
+#endif
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+// Reads bits previously written to memory by BitWriter. Uses unaligned 8-byte
+// little-endian loads.
+class BitReader {
+ public:
+  static constexpr size_t kMaxBitsPerCall = 56;
+
+  // Constructs an invalid BitReader, to be overwritten before usage.
+  BitReader()
+      : buf_(0),
+        bits_in_buf_(0),
+        next_byte_{nullptr},
+        end_minus_8_{nullptr},
+        first_byte_(nullptr) {}
+  BitReader(const BitReader&) = delete;
+
+  // bytes need not be aligned nor padded!
+  template <class ArrayLike>
+  explicit BitReader(const ArrayLike& bytes)
+      : buf_(0),
+        bits_in_buf_(0),
+        next_byte_(bytes.data()),
+        // Assumes the address in first_byte_ is >= 8, so `- 8` cannot wrap.
+        end_minus_8_(bytes.data() - 8 + bytes.size()),
+        first_byte_(bytes.data()) {
+    Refill();
+  }
+  ~BitReader() {
+    // Close() must be called before destroying an initialized bit reader.
+    // Invalid bit readers will have a nullptr in first_byte_.
+    JXL_ASSERT(close_called_ || !first_byte_);
+  }
+
+  // Move operator needs to invalidate the other BitReader such that it is
+  // irrelevant if we call Close() on it or not.
+  BitReader& operator=(BitReader&& other) noexcept {
+    // Ensure the current instance was already closed, before we overwrite it
+    // with other.
+    JXL_ASSERT(close_called_ || !first_byte_);
+
+    JXL_DASSERT(!other.close_called_);
+    buf_ = other.buf_;
+    bits_in_buf_ = other.bits_in_buf_;
+    next_byte_ = other.next_byte_;
+    end_minus_8_ = other.end_minus_8_;
+    first_byte_ = other.first_byte_;
+    overread_bytes_ = other.overread_bytes_;
+    close_called_ = other.close_called_;
+
+    other.first_byte_ = nullptr;
+    other.next_byte_ = nullptr;
+    return *this;
+  }
+  BitReader& operator=(const BitReader& other) = delete;
+
+  // For time-critical reads, refills can be shared by multiple reads.
+  // Based on variant 4 (plus bounds-checking), see
+  // fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+  JXL_INLINE void Refill() {
+    if (JXL_UNLIKELY(next_byte_ > end_minus_8_)) {
+      BoundsCheckedRefill();
+    } else {
+      // It's safe to load 64 bits; insert valid (possibly nonzero) bits above
+      // bits_in_buf_. The shift requires bits_in_buf_ < 64.
+      buf_ |= LoadLE64(next_byte_) << bits_in_buf_;
+
+      // Advance by bytes fully absorbed into the buffer.
+      next_byte_ += (63 - bits_in_buf_) >> 3;
+
+      // We absorbed a multiple of 8 bits, so the lower 3 bits of bits_in_buf_
+      // must remain unchanged, otherwise the next refill's shifted bits will
+      // not align with buf_. Set the three upper bits so the result >= 56.
+      bits_in_buf_ |= 56;
+      JXL_DASSERT(56 <= bits_in_buf_ && bits_in_buf_ < 64);
+    }
+  }
+
+  // Returns the bits that ReadFixedBits<N>() would return, without calling
+  // Consume(). It is legal to PEEK at more bits than present in the bitstream
+  // (required by Huffman), and those bits will be zero.
+  template <size_t N>
+  JXL_INLINE uint64_t PeekFixedBits() const {
+    static_assert(N <= kMaxBitsPerCall, "Reading too many bits in one call.");
+    JXL_DASSERT(!close_called_);
+    return buf_ & ((1ULL << N) - 1);
+  }
+
+  JXL_INLINE uint64_t PeekBits(size_t nbits) const {
+    JXL_DASSERT(nbits <= kMaxBitsPerCall);
+    JXL_DASSERT(!close_called_);
+
+    // Slightly faster but requires BMI2. It is infeasible to make the many
+    // callers reside between begin/end_target, especially because only the
+    // callers in dec_ans are time-critical. Therefore only enabled if the
+    // entire binary is compiled for (and thus requires) BMI2.
+#if defined(__BMI2__) && defined(__x86_64__)
+    return _bzhi_u64(buf_, nbits);
+#else
+    const uint64_t mask = (1ULL << nbits) - 1;
+    return buf_ & mask;
+#endif
+  }
+
+  // Removes bits from the buffer. Need not match the previous Peek size, but
+  // the buffer must contain at least num_bits (this prevents consuming more
+  // than the total number of bits).
+  JXL_INLINE void Consume(size_t num_bits) {
+    JXL_DASSERT(!close_called_);
+    JXL_DASSERT(bits_in_buf_ >= num_bits);
+#ifdef JXL_CRASH_ON_ERROR
+    // When JXL_CRASH_ON_ERROR is defined, it is a fatal error to read more bits
+    // than available in the stream. A non-zero overread_bytes_ implies that
+    // next_byte_ is already at the end of the stream, so we don't need to
+    // check that.
+    JXL_ASSERT(bits_in_buf_ >= num_bits + overread_bytes_ * kBitsPerByte);
+#endif
+    bits_in_buf_ -= num_bits;
+    buf_ >>= num_bits;
+  }
+
+  JXL_INLINE uint64_t ReadBits(size_t nbits) {
+    JXL_DASSERT(!close_called_);
+    Refill();
+    const uint64_t bits = PeekBits(nbits);
+    Consume(nbits);
+    return bits;
+  }
+
+  template <size_t N>
+  JXL_INLINE uint64_t ReadFixedBits() {
+    JXL_DASSERT(!close_called_);
+    Refill();
+    const uint64_t bits = PeekFixedBits<N>();
+    Consume(N);
+    return bits;
+  }
+
+  // Equivalent to calling ReadFixedBits<1>() `skip` times, but much faster.
+  // `skip` is typically large.
+  void SkipBits(size_t skip) {
+    JXL_DASSERT(!close_called_);
+    // Buffer is large enough - don't zero buf_ below.
+    if (JXL_UNLIKELY(skip <= bits_in_buf_)) {
+      Consume(skip);
+      return;
+    }
+
+    // First deduct what we can satisfy from the buffer
+    skip -= bits_in_buf_;
+    bits_in_buf_ = 0;
+    // Not enough to call Consume - that may leave some bits in the buffer
+    // which were previously ABOVE bits_in_buf_.
+    buf_ = 0;
+
+    // Skip whole bytes
+    const size_t whole_bytes = skip / kBitsPerByte;
+    skip %= kBitsPerByte;
+    if (JXL_UNLIKELY(whole_bytes >
+                     static_cast<size_t>(end_minus_8_ + 8 - next_byte_))) {
+      // This is already an overflow condition (skipping past the end of the bit
+      // stream). However if we increase next_byte_ too much we risk overflowing
+      // that value and potentially making it valid again (next_byte_ < end).
+      // This will set next_byte_ to the end of the stream and still consume
+      // some bits in overread_bytes_, however the TotalBitsConsumed() will be
+      // incorrect (still larger than the TotalBytes()).
+      next_byte_ = end_minus_8_ + 8;
+      skip += kBitsPerByte;
+    } else {
+      next_byte_ += whole_bytes;
+    }
+
+    Refill();
+    Consume(skip);
+  }
+
+  size_t TotalBitsConsumed() const {
+    const size_t bytes_read = static_cast<size_t>(next_byte_ - first_byte_);
+    return (bytes_read + overread_bytes_) * kBitsPerByte - bits_in_buf_;
+  }
+
+  Status JumpToByteBoundary() {
+    const size_t remainder = TotalBitsConsumed() % kBitsPerByte;
+    if (remainder == 0) return true;
+    if (JXL_UNLIKELY(ReadBits(kBitsPerByte - remainder) != 0)) {
+      return JXL_FAILURE("Non-zero padding bits");
+    }
+    return true;
+  }
+
+  // For interoperability with other bitreaders (for resuming at
+  // non-byte-aligned positions).
+  const uint8_t* FirstByte() const { return first_byte_; }
+  size_t TotalBytes() const {
+    return static_cast<size_t>(end_minus_8_ + 8 - first_byte_);
+  }
+
+  // Returns span of the remaining (unconsumed) bytes, e.g. for passing to
+  // external decoders such as Brotli.
+  Span<const uint8_t> GetSpan() const {
+    JXL_DASSERT(first_byte_ != nullptr);
+    JXL_ASSERT(TotalBitsConsumed() % kBitsPerByte == 0);
+    const size_t offset = TotalBitsConsumed() / kBitsPerByte;  // no remainder
+    JXL_ASSERT(offset <= TotalBytes());
+    return Span<const uint8_t>(first_byte_ + offset, TotalBytes() - offset);
+  }
+
+  // Returns whether all the bits read so far have been within the input bounds.
+  // When reading past the EOF, the Read*() and Consume() functions return zeros
+  // but flag a failure when calling Close() without checking this function.
+  Status AllReadsWithinBounds() {
+    // Mark up to which point the user checked the out of bounds condition. If
+    // the user handles the condition at higher level (e.g. fetch more bytes
+    // from network, return a custom JXL_FAILURE, ...), Close() should not
+    // output a debug error (which would break tests with JXL_CRASH_ON_ERROR
+    // even when legitimately handling the situation at higher level). This is
+    // used by Bundle::CanRead.
+    checked_out_of_bounds_bits_ = TotalBitsConsumed();
+    if (TotalBitsConsumed() > TotalBytes() * kBitsPerByte) {
+      return false;
+    }
+    return true;
+  }
+
+  // Close the bit reader and return whether all the previous reads were
+  // successful. Close must be called once.
+  Status Close() {
+    JXL_DASSERT(!close_called_);
+    close_called_ = true;
+    if (!first_byte_) return true;
+    if (TotalBitsConsumed() > checked_out_of_bounds_bits_ &&
+        TotalBitsConsumed() > TotalBytes() * kBitsPerByte) {
+      return JXL_FAILURE("Read more bits than available in the bit_reader");
+    }
+    return true;
+  }
+
+ private:
+  // Separate function avoids inlining this relatively cold code into callers.
+  JXL_NOINLINE void BoundsCheckedRefill() {
+    PROFILER_FUNC;
+    const uint8_t* end = end_minus_8_ + 8;
+
+    // Read whole bytes until we have [56, 64) bits (same as LoadLE64)
+    for (; bits_in_buf_ < 64 - kBitsPerByte; bits_in_buf_ += kBitsPerByte) {
+      if (next_byte_ >= end) break;
+      buf_ |= static_cast<uint64_t>(*next_byte_++) << bits_in_buf_;
+    }
+    JXL_DASSERT(bits_in_buf_ < 64);
+
+    // Add extra bytes as 0 at the end of the stream in buf_. If
+    // these bits are read, Close() will return a failure.
+    size_t extra_bytes = (63 - bits_in_buf_) / kBitsPerByte;
+    overread_bytes_ += extra_bytes;
+    bits_in_buf_ += extra_bytes * kBitsPerByte;
+
+    JXL_DASSERT(bits_in_buf_ < 64);
+    JXL_DASSERT(bits_in_buf_ >= 56);
+  }
+  // NOTE(review): private and not called anywhere inside this class.
+  JXL_NOINLINE uint32_t BoundsCheckedReadByteAlignedWord() {
+    if (next_byte_ + 1 < end_minus_8_ + 8) {
+      uint32_t ret = LoadLE16(next_byte_);
+      next_byte_ += 2;
+      return ret;
+    }
+    overread_bytes_ += 2;  // fewer than two bytes remain in the stream
+    return 0;
+  }
+
+  uint64_t buf_;
+  size_t bits_in_buf_;  // [0, 64)
+  const uint8_t* JXL_RESTRICT next_byte_;
+  const uint8_t* end_minus_8_;  // for refill bounds check
+  const uint8_t* first_byte_;   // for GetSpan
+
+  // Number of bytes past the end that were loaded into the buf_. These bytes
+  // are not read from memory, but instead assumed 0. It is an error (likely due
+  // to an invalid stream) to Consume() more bits than specified in the range
+  // passed to the constructor.
+  uint64_t overread_bytes_{0};
+  bool close_called_{false};
+
+  uint64_t checked_out_of_bounds_bits_{0};
+};
+
+// Closes a BitReader when the BitReaderScopedCloser goes out of scope. When
+// closing the bit reader, if the status result was failure it sets this failure
+// to the passed variable pointer. Typical usage:
+//
+// Status ret = true;
+// {
+//   BitReader reader(...);
+//   BitReaderScopedCloser reader_closer(&reader, &ret);
+//
+//   // ... code that can return errors here ...
+// }
+// // ... more code that doesn't use the BitReader.
+// return ret;
+
+class BitReaderScopedCloser {
+ public:
+  BitReaderScopedCloser(BitReader* reader, Status* status)
+      : reader_(reader), status_(status) {
+    JXL_DASSERT(reader_ != nullptr);
+    JXL_DASSERT(status_ != nullptr);
+  }
+  ~BitReaderScopedCloser() {
+    if (reader_ != nullptr) {  // nullptr after CloseAndSuppressError()
+      Status close_ret = reader_->Close();
+      if (!close_ret) *status_ = close_ret;  // only overwritten on failure
+    }
+  }
+  void CloseAndSuppressError() {
+    JXL_ASSERT(reader_ != nullptr);
+    (void)reader_->Close();
+    reader_ = nullptr;
+  }
+  BitReaderScopedCloser(const BitReaderScopedCloser&) = delete;
+
+ private:
+  BitReader* reader_;
+  Status* status_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_BIT_READER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.cc b/third_party/jpeg-xl/lib/jxl/dec_cache.cc
new file mode 100644
index 0000000000..4db6f1d9a5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_cache.cc
@@ -0,0 +1,229 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_cache.h"
+
+#include "lib/jxl/blending.h"
+#include "lib/jxl/render_pipeline/stage_blending.h"
+#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+#include "lib/jxl/render_pipeline/stage_epf.h"
+#include "lib/jxl/render_pipeline/stage_from_linear.h"
+#include "lib/jxl/render_pipeline/stage_gaborish.h"
+#include "lib/jxl/render_pipeline/stage_noise.h"
+#include "lib/jxl/render_pipeline/stage_patches.h"
+#include "lib/jxl/render_pipeline/stage_splines.h"
+#include "lib/jxl/render_pipeline/stage_spot.h"
+#include "lib/jxl/render_pipeline/stage_to_linear.h"
+#include "lib/jxl/render_pipeline/stage_tone_mapping.h"
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+#include "lib/jxl/render_pipeline/stage_write.h"
+#include "lib/jxl/render_pipeline/stage_xyb.h"
+#include "lib/jxl/render_pipeline/stage_ycbcr.h"
+
+namespace jxl {
+// Builds `render_pipeline` for the current frame from a sequence of stages.
+Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
+                                           PipelineOptions options) {
+  const FrameHeader& frame_header = shared->frame_header;
+  size_t num_c = 3 + frame_header.nonserialized_metadata->m.num_extra_channels;
+  if ((frame_header.flags & FrameHeader::kNoise) != 0) {
+    num_c += 3;  // three additional channels when the noise flag is set
+  }
+
+  if (frame_header.CanBeReferenced()) {
+    // Necessary so that SetInputSizes() can allocate output buffers as needed.
+    frame_storage_for_referencing = ImageBundle(decoded->metadata());
+  }
+
+  RenderPipeline::Builder builder(num_c);
+
+  if (options.use_slow_render_pipeline) {
+    builder.UseSimpleImplementation();
+  }
+
+  if (!frame_header.chroma_subsampling.Is444()) {
+    for (size_t c = 0; c < 3; c++) {
+      if (frame_header.chroma_subsampling.HShift(c) != 0) {
+        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/true));
+      }
+      if (frame_header.chroma_subsampling.VShift(c) != 0) {
+        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/false));
+      }
+    }
+  }
+
+  if (frame_header.loop_filter.gab) {
+    builder.AddStage(GetGaborishStage(frame_header.loop_filter));
+  }
+
+  {  // EPF stages 0/1/2 are added for epf_iters >= 3/1/2 respectively.
+    const LoopFilter& lf = frame_header.loop_filter;
+    if (lf.epf_iters >= 3) {
+      builder.AddStage(GetEPFStage(lf, sigma, 0));
+    }
+    if (lf.epf_iters >= 1) {
+      builder.AddStage(GetEPFStage(lf, sigma, 1));
+    }
+    if (lf.epf_iters >= 2) {
+      builder.AddStage(GetEPFStage(lf, sigma, 2));
+    }
+  }
+
+  bool late_ec_upsample = frame_header.upsampling != 1;
+  for (auto ecups : frame_header.extra_channel_upsampling) {
+    if (ecups != frame_header.upsampling) {
+      // If patches are applied, either frame_header.upsampling == 1 or
+      // late_ec_upsample is true.
+      late_ec_upsample = false;
+    }
+  }
+
+  if (!late_ec_upsample) {
+    for (size_t ec = 0; ec < frame_header.extra_channel_upsampling.size();
+         ec++) {
+      if (frame_header.extra_channel_upsampling[ec] != 1) {
+        builder.AddStage(GetUpsamplingStage(
+            frame_header.nonserialized_metadata->transform_data, 3 + ec,
+            CeilLog2Nonzero(frame_header.extra_channel_upsampling[ec])));
+      }
+    }
+  }
+
+  if ((frame_header.flags & FrameHeader::kPatches) != 0) {
+    builder.AddStage(
+        GetPatchesStage(&shared->image_features.patches,
+                        3 + shared->metadata->m.num_extra_channels));
+  }
+  if ((frame_header.flags & FrameHeader::kSplines) != 0) {
+    builder.AddStage(GetSplineStage(&shared->image_features.splines));
+  }
+
+  if (frame_header.upsampling != 1) {
+    size_t nb_channels =
+        3 +
+        (late_ec_upsample ? frame_header.extra_channel_upsampling.size() : 0);
+    for (size_t c = 0; c < nb_channels; c++) {
+      builder.AddStage(GetUpsamplingStage(
+          frame_header.nonserialized_metadata->transform_data, c,
+          CeilLog2Nonzero(frame_header.upsampling)));
+    }
+  }
+
+  if ((frame_header.flags & FrameHeader::kNoise) != 0) {
+    builder.AddStage(GetConvolveNoiseStage(num_c - 3));
+    builder.AddStage(GetAddNoiseStage(shared->image_features.noise_params,
+                                      shared->cmap, num_c - 3));
+  }
+  if (frame_header.dc_level != 0) {
+    builder.AddStage(GetWriteToImage3FStage(
+        &shared_storage.dc_frames[frame_header.dc_level - 1]));
+  }
+
+  if (frame_header.CanBeReferenced() &&
+      frame_header.save_before_color_transform) {
+    builder.AddStage(GetWriteToImageBundleStage(
+        &frame_storage_for_referencing, output_encoding_info.color_encoding));
+  }
+
+  bool has_alpha = false;
+  size_t alpha_c = 0;  // pipeline channel of the first alpha extra channel
+  for (size_t i = 0; i < decoded->metadata()->extra_channel_info.size(); i++) {
+    if (decoded->metadata()->extra_channel_info[i].type ==
+        ExtraChannel::kAlpha) {
+      has_alpha = true;
+      alpha_c = 3 + i;
+      break;
+    }
+  }
+
+  if (fast_xyb_srgb8_conversion) {
+    JXL_ASSERT(!NeedsBlending(this));
+    JXL_ASSERT(!frame_header.CanBeReferenced() ||
+               frame_header.save_before_color_transform);
+    JXL_ASSERT(!options.render_spotcolors ||
+               !decoded->metadata()->Find(ExtraChannel::kSpotColor));
+    bool is_rgba = (main_output.format.num_channels == 4);
+    uint8_t* rgb_output = reinterpret_cast<uint8_t*>(main_output.buffer);
+    builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, main_output.stride,
+                                            width, height, is_rgba, has_alpha,
+                                            alpha_c));
+  } else {
+    bool linear = false;  // true while a GetFromLinearStage() is still pending
+    if (frame_header.color_transform == ColorTransform::kYCbCr) {
+      builder.AddStage(GetYCbCrStage());
+    } else if (frame_header.color_transform == ColorTransform::kXYB) {
+      builder.AddStage(GetXYBStage(output_encoding_info));
+      if (output_encoding_info.color_encoding.GetColorSpace() !=
+          ColorSpace::kXYB) {
+        linear = true;
+      }
+    }  // Nothing to do for kNone.
+
+    if (options.coalescing && NeedsBlending(this)) {
+      if (linear) {
+        builder.AddStage(GetFromLinearStage(output_encoding_info));
+        linear = false;
+      }
+      builder.AddStage(
+          GetBlendingStage(this, output_encoding_info.color_encoding));
+    }
+
+    if (options.coalescing && frame_header.CanBeReferenced() &&
+        !frame_header.save_before_color_transform) {
+      if (linear) {
+        builder.AddStage(GetFromLinearStage(output_encoding_info));
+        linear = false;
+      }
+      builder.AddStage(GetWriteToImageBundleStage(
+          &frame_storage_for_referencing, output_encoding_info.color_encoding));
+    }
+
+    if (options.render_spotcolors &&
+        frame_header.nonserialized_metadata->m.Find(ExtraChannel::kSpotColor)) {
+      for (size_t i = 0; i < decoded->metadata()->extra_channel_info.size();
+           i++) {
+        // Don't use Find() because there may be multiple spot color channels.
+        const ExtraChannelInfo& eci =
+            decoded->metadata()->extra_channel_info[i];
+        if (eci.type == ExtraChannel::kSpotColor) {
+          builder.AddStage(GetSpotColorStage(3 + i, eci.spot_color));
+        }
+      }
+    }
+
+    auto tone_mapping_stage = GetToneMappingStage(output_encoding_info);
+    if (tone_mapping_stage) {
+      if (!linear) {
+        auto to_linear_stage = GetToLinearStage(output_encoding_info);
+        if (!to_linear_stage) {
+          return JXL_FAILURE(
+              "attempting to perform tone mapping on colorspace not "
+              "convertible to linear");
+        }
+        builder.AddStage(std::move(to_linear_stage));
+        linear = true;
+      }
+      builder.AddStage(std::move(tone_mapping_stage));
+    }
+
+    if (linear) {
+      builder.AddStage(GetFromLinearStage(output_encoding_info));
+      linear = false;
+    }
+
+    if (main_output.callback.IsPresent() || main_output.buffer) {
+      builder.AddStage(GetWriteToOutputStage(main_output, width, height,
+                                             has_alpha, unpremul_alpha, alpha_c,
+                                             undo_orientation, extra_output));
+    } else {
+      builder.AddStage(GetWriteToImageBundleStage(
+          decoded, output_encoding_info.color_encoding));
+    }
+  }
+  render_pipeline = std::move(builder).Finalize(shared->frame_dim);
+  return render_pipeline->IsInitialized();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.h b/third_party/jpeg-xl/lib/jxl/dec_cache.h
new file mode 100644
index 0000000000..7c9fe9a6c3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_cache.h
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_CACHE_H_
+#define LIB_JXL_DEC_CACHE_H_
+
+#include <jxl/decode.h>
+#include <stdint.h>
+
+#include <atomic>
+#include <hwy/base.h>  // HWY_ALIGN_MAX
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+constexpr size_t kSigmaBorder = 1;
+constexpr size_t kSigmaPadding = 2;  // sigma image is 2 * this larger per axis
+
+struct PixelCallback {  // Holds the init/run/destroy pixel callbacks.
+  PixelCallback() = default;
+  PixelCallback(JxlImageOutInitCallback init, JxlImageOutRunCallback run,
+                JxlImageOutDestroyCallback destroy, void* init_opaque)
+      : init(init), run(run), destroy(destroy), init_opaque(init_opaque) {
+#if JXL_ENABLE_ASSERT
+    const bool has_init = init != nullptr;  // all three must be set, or none
+    const bool has_run = run != nullptr;
+    const bool has_destroy = destroy != nullptr;
+    JXL_ASSERT(has_init == has_run && has_run == has_destroy);
+#endif
+  }
+
+  bool IsPresent() const { return run != nullptr; }
+
+  void* Init(size_t num_threads, size_t num_pixels) const {
+    return init(init_opaque, num_threads, num_pixels);  // init must be non-null
+  }
+
+  JxlImageOutInitCallback init = nullptr;
+  JxlImageOutRunCallback run = nullptr;
+  JxlImageOutDestroyCallback destroy = nullptr;
+  void* init_opaque = nullptr;
+};
+
+struct ImageOutput {  // One output target: pixel callback and/or raw buffer.
+  // Pixel format of the output pixels, used for buffer and callback output.
+  JxlPixelFormat format;
+  // Output bit depth for unsigned data types, used for float to int conversion.
+  size_t bits_per_sample;
+  // Callback for line-by-line output.
+  PixelCallback callback;
+  // Pixel buffer for image output.
+  void* buffer;
+  size_t buffer_size;  // presumably the size of `buffer` in bytes; TODO confirm
+  // Length of a row of `buffer` in bytes (based on oriented width).
+  size_t stride;
+};
+
+// Per-frame decoder state. All the images here should be accessed through a
+// group rect (either with block units or pixel units).
+struct PassesDecoderState {
+  PassesSharedState shared_storage;
+  // Allows avoiding copies for encoder loop.
+  const PassesSharedState* JXL_RESTRICT shared = &shared_storage;
+
+  // 8x upsampling stage for DC.
+  std::unique_ptr<RenderPipelineStage> upsampler8x;
+
+  // For ANS decoding.
+  std::vector<ANSCode> code;
+  std::vector<std::vector<uint8_t>> context_map;
+
+  // Multipliers for the quant matrices of the x and b channels, set in Init().
+  float x_dm_multiplier;
+  float b_dm_multiplier;
+
+  // Sigma values for EPF.
+  ImageF sigma;
+
+  // Image dimensions before applying undo_orientation.
+  size_t width;
+  size_t height;
+  ImageOutput main_output;
+  std::vector<ImageOutput> extra_output;
+
+  // Whether to use int16 float-XYB-to-uint8-srgb conversion.
+  bool fast_xyb_srgb8_conversion;
+
+  // If true, the RGBA output will be unpremultiplied before writing to the
+  // output.
+  bool unpremul_alpha;
+
+  // The render pipeline will apply this orientation to bring the image to the
+  // intended display orientation.
+  Orientation undo_orientation;
+
+  // Used for seeding noise.
+  size_t visible_frame_index = 0;
+  size_t nonvisible_frame_index = 0;
+
+  // Keep track of the transform types used.
+  std::atomic<uint32_t> used_acs{0};
+
+  // Storage for coefficients if in "accumulate" mode.
+  std::unique_ptr<ACImage> coefficients = make_unique<ACImageT<int32_t>>(0, 0);
+
+  // Rendering pipeline.
+  std::unique_ptr<RenderPipeline> render_pipeline;
+
+  // Storage for the current frame if it can be referenced by future frames.
+  ImageBundle frame_storage_for_referencing;
+
+  struct PipelineOptions {
+    bool use_slow_render_pipeline;
+    bool coalescing;
+    bool render_spotcolors;
+  };
+
+  Status PreparePipeline(ImageBundle* decoded, PipelineOptions options);
+
+  // Information for colour conversions.
+  OutputEncodingInfo output_encoding_info;
+
+  // Initializes decoder-specific structures using information from *shared.
+  Status Init() {
+    x_dm_multiplier =
+        std::pow(1 / (1.25f), shared->frame_header.x_qm_scale - 2.0f);
+    b_dm_multiplier =
+        std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f);
+
+    main_output.callback = PixelCallback();
+    main_output.buffer = nullptr;
+    extra_output.clear();
+
+    fast_xyb_srgb8_conversion = false;
+    unpremul_alpha = false;
+    undo_orientation = Orientation::kIdentity;
+
+    used_acs = 0;
+
+    upsampler8x = GetUpsamplingStage(shared->metadata->transform_data, 0, 3);
+    if (shared->frame_header.loop_filter.epf_iters > 0) {
+      sigma = ImageF(shared->frame_dim.xsize_blocks + 2 * kSigmaPadding,
+                     shared->frame_dim.ysize_blocks + 2 * kSigmaPadding);
+    }
+    return true;
+  }
+
+  // Initialize the decoder state after all of DC is decoded.
+  Status InitForAC(ThreadPool* pool) {  // `pool` is not used by this body
+    shared_storage.coeff_order_size = 0;
+    for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+      if (((1 << o) & used_acs) == 0) continue;  // strategy o was not used
+      uint8_t ord = kStrategyOrder[o];
+      shared_storage.coeff_order_size =
+          std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize,
+                   shared_storage.coeff_order_size);
+    }
+    size_t sz = shared_storage.frame_header.passes.num_passes *
+                shared_storage.coeff_order_size;
+    if (sz > shared_storage.coeff_orders.size()) {
+      shared_storage.coeff_orders.resize(sz);
+    }
+    return true;
+  }
+
+  // Fills the sigma image (presumably `state->sigma`, as this struct has no
+  // `filter_weights` member) with precomputed values inside `block_rect`. Reads
+  // the AC strategy, quant field and epf_sharpness at the same positions.
+  void ComputeSigma(const Rect& block_rect, PassesDecoderState* state);
+};
+
+// Temp images required for decoding a single group. Reduces memory allocations
+// for large images because we only initialize min(#threads, #groups) instances.
+struct GroupDecCache {
+  void InitOnce(size_t num_passes, size_t used_acs) {  // used_acs is a bitmask
+    PROFILER_FUNC;
+
+    for (size_t i = 0; i < num_passes; i++) {
+      if (num_nzeroes[i].xsize() == 0) {
+        // Allocate enough for a whole group - partial groups on the
+        // right/bottom border just use a subset. The valid size is passed via
+        // Rect.
+
+        num_nzeroes[i] = Image3I(kGroupDimInBlocks, kGroupDimInBlocks);
+      }
+    }
+    size_t max_block_area = 0;
+
+    for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+      AcStrategy acs = AcStrategy::FromRawStrategy(o);
+      if ((used_acs & (1 << o)) == 0) continue;
+      size_t area =
+          acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
+      max_block_area = std::max(area, max_block_area);
+    }
+
+    if (max_block_area > max_block_area_) {  // buffers only ever grow
+      max_block_area_ = max_block_area;
+      // We need 3x float blocks for dequantized coefficients and 1x for scratch
+      // space for transforms.
+      float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 4);
+      // We need 3x int32 or int16 blocks for quantized coefficients.
+      int32_memory_ = hwy::AllocateAligned<int32_t>(max_block_area_ * 3);
+      int16_memory_ = hwy::AllocateAligned<int16_t>(max_block_area_ * 3);
+    }
+
+    dec_group_block = float_memory_.get();
+    scratch_space = dec_group_block + max_block_area_ * 3;  // the 4th block
+    dec_group_qblock = int32_memory_.get();
+    dec_group_qblock16 = int16_memory_.get();
+  }
+
+  void InitDCBufferOnce() {
+    if (dc_buffer.xsize() == 0) {
+      dc_buffer = ImageF(kGroupDimInBlocks + kRenderPipelineXOffset * 2,
+                         kGroupDimInBlocks + 4);
+    }
+  }
+
+  // Scratch space used by DecGroupImpl().
+  float* dec_group_block;
+  int32_t* dec_group_qblock;
+  int16_t* dec_group_qblock16;
+
+  // For TransformToPixels.
+  float* scratch_space;
+  // Note that scratch_space is never used at the same time as dec_group_qblock.
+  // Moreover, only one of dec_group_qblock{,16} is ever used (presumably).
+  // TODO(veluca): figure out if we can save allocations.
+
+  // AC decoding
+  Image3I num_nzeroes[kMaxNumPasses];
+
+  // Buffer for DC upsampling.
+  ImageF dc_buffer;
+
+ private:
+  hwy::AlignedFreeUniquePtr<float[]> float_memory_;
+  hwy::AlignedFreeUniquePtr<int32_t[]> int32_memory_;
+  hwy::AlignedFreeUniquePtr<int16_t[]> int16_memory_;
+  size_t max_block_area_ = 0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_CACHE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_context_map.cc b/third_party/jpeg-xl/lib/jxl/dec_context_map.cc
new file mode 100644
index 0000000000..1b291650d7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_context_map.cc
@@ -0,0 +1,86 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_context_map.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/inverse_mtf-inl.h"
+
+namespace jxl {
+
+namespace {
+
+// Checks that every entry of `context_map` is below `num_htrees` and that each
+// histogram index in [0, num_htrees) is referenced by at least one entry.
+Status VerifyContextMap(const std::vector<uint8_t>& context_map,
+                        const size_t num_htrees) {
+  std::vector<bool> seen(num_htrees);
+  size_t remaining = num_htrees;  // indices not referenced so far
+  for (size_t pos = 0; pos < context_map.size(); ++pos) {
+    const size_t index = context_map[pos];
+    if (index >= num_htrees) {
+      return JXL_FAILURE("Invalid histogram index in context map.");
+    }
+    if (seen[index]) continue;
+    seen[index] = true;
+    --remaining;
+  }
+  if (remaining != 0) return JXL_FAILURE("Incomplete context map.");
+  return true;
+}
+
+}  // namespace
+
+// Decodes a context map of context_map->size() entries from `input` and stores
+// the number of distinct histogram ids in *num_htrees. Fails on malformed input
+// and on an empty `context_map` (which has no largest entry to derive it from).
+Status DecodeContextMap(std::vector<uint8_t>* context_map, size_t* num_htrees,
+                        BitReader* input) {
+  // std::max_element() below returns end() for an empty range; dereferencing
+  // that iterator would be undefined behavior, so reject this case up front.
+  if (context_map->empty()) return JXL_FAILURE("Empty context map");
+  bool is_simple = input->ReadFixedBits<1>();
+  if (is_simple) {
+    int bits_per_entry = input->ReadFixedBits<2>();
+    if (bits_per_entry != 0) {
+      for (size_t i = 0; i < context_map->size(); i++) {
+        (*context_map)[i] = input->ReadBits(bits_per_entry);
+      }
+    } else {
+      std::fill(context_map->begin(), context_map->end(), 0);
+    }
+  } else {
+    bool use_mtf = input->ReadFixedBits<1>();
+    ANSCode code;
+    std::vector<uint8_t> dummy_ctx_map;
+    // Usage of LZ77 is disallowed if decoding only two symbols. This doesn't
+    // make sense in non-malicious bitstreams, and could cause a stack overflow
+    // in malicious bitstreams by making every context map require its own
+    // context map.
+    JXL_RETURN_IF_ERROR(
+        DecodeHistograms(input, 1, &code, &dummy_ctx_map,
+                         /*disallow_lz77=*/context_map->size() <= 2));
+    ANSSymbolReader reader(&code, input);
+    for (size_t i = 0; i < context_map->size(); i++) {
+      uint32_t sym = reader.ReadHybridUint(0, input, dummy_ctx_map);
+      if (sym >= kMaxClusters) return JXL_FAILURE("Invalid cluster ID");
+      (*context_map)[i] = sym;
+    }
+    if (!reader.CheckANSFinalState()) return JXL_FAILURE("Invalid context map");
+    if (use_mtf) {
+      InverseMoveToFrontTransform(context_map->data(), context_map->size());
+    }
+  }
+  *num_htrees = *std::max_element(context_map->begin(), context_map->end()) + 1;
+  return VerifyContextMap(*context_map, *num_htrees);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_context_map.h b/third_party/jpeg-xl/lib/jxl/dec_context_map.h
new file mode 100644
index 0000000000..95b8a0ca92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_context_map.h
@@ -0,0 +1,30 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_CONTEXT_MAP_H_
+#define LIB_JXL_DEC_CONTEXT_MAP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/dec_bit_reader.h"
+
+namespace jxl {
+
+// Context map uses uint8_t.
+constexpr size_t kMaxClusters = 256;
+
+// Reads the context map from `input`. On entry, context_map->size() must be
+// the (non-zero) number of possible context ids. On success, *num_htrees is
+// the number of different histogram ids in *context_map (largest id plus
+// one). Returns an error Status if decoding or validation of the map fails.
+Status DecodeContextMap(std::vector<uint8_t>* context_map, size_t* num_htrees,
+                        BitReader* input);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_CONTEXT_MAP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image.cc b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc
new file mode 100644
index 0000000000..bbf457ba91
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc
@@ -0,0 +1,493 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_external_image.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_external_image.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::NearestInt;
+
+// TODO(jon): check if this can be replaced by a FloatToU16 function
+void FloatToU32(const float* in, uint32_t* out, size_t num, float mul,
+                size_t bits_per_sample) {  // bits_per_sample is unused.
+  const HWY_FULL(float) d;
+  const hwy::HWY_NAMESPACE::Rebind<uint32_t, decltype(d)> du;
+
+  // Unpoison accessing partially-uninitialized vectors with memory sanitizer.
+  // This is because we run NearestInt() on the vector, which triggers msan even
+  // though it is safe to do so since the values are not mixed between lanes.
+  const size_t num_round_up = RoundUpTo(num, Lanes(d));
+  msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num));
+
+  const auto one = Set(d, 1.0f);
+  const auto scale = Set(d, mul);
+  for (size_t x = 0; x < num; x += Lanes(d)) {
+    auto v = Load(d, in + x);
+    // Clamp to [0, 1]. Clamp turns NaN to 'min'.
+    v = Clamp(v, Zero(d), one);
+    auto i = NearestInt(Mul(v, scale));
+    Store(BitCast(du, i), du, out + x);
+  }
+
+  // Re-poison the output elements in [num, num_round_up).
+  msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num));
+}
+
+void FloatToF16(const float* in, hwy::float16_t* out, size_t num) {
+  const HWY_FULL(float) d;
+  const hwy::HWY_NAMESPACE::Rebind<hwy::float16_t, decltype(d)> du;
+
+  // Unpoison accessing partially-uninitialized vectors with memory sanitizer.
+  // This is because we run DemoteTo() on the vector which triggers msan.
+  const size_t num_round_up = RoundUpTo(num, Lanes(d));
+  msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num));
+
+  for (size_t x = 0; x < num; x += Lanes(d)) {
+    auto v = Load(d, in + x);
+    auto v16 = DemoteTo(du, v);  // float -> float16, lane by lane.
+    Store(v16, du, out + x);
+  }
+
+  // Re-poison the output elements in [num, num_round_up).
+  msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+namespace {
+
+// Stores the 32-bit pattern of `value` at p in big endian byte order.
+void StoreBEFloat(float value, uint8_t* p) {
+  uint32_t u;
+  memcpy(&u, &value, 4);  // Bit-exact copy of the float into an integer.
+  StoreBE32(u, p);
+}
+
+// Stores the 32-bit pattern of `value` at p in little endian byte order.
+void StoreLEFloat(float value, uint8_t* p) {
+  uint32_t u;
+  memcpy(&u, &value, 4);  // Bit-exact copy of the float into an integer.
+  StoreLE32(u, p);
+}
+
+// Reorients `image` into `out`; does nothing (out untouched) for kIdentity.
+// TODO(lode): SIMDify where possible
+template <typename T>
+Status UndoOrientation(jxl::Orientation undo_orientation, const Plane<T>& image,
+                       Plane<T>& out, jxl::ThreadPool* pool) {
+  const size_t xsize = image.xsize();
+  const size_t ysize = image.ysize();
+
+  if (undo_orientation == Orientation::kFlipHorizontal) {
+    out = Plane<T>(xsize, ysize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          T* JXL_RESTRICT row_out = out.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[xsize - x - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kRotate180) {
+    out = Plane<T>(xsize, ysize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          T* JXL_RESTRICT row_out = out.Row(ysize - y - 1);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[xsize - x - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kFlipVertical) {
+    out = Plane<T>(xsize, ysize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          T* JXL_RESTRICT row_out = out.Row(ysize - y - 1);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[x] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kTranspose) {
+    out = Plane<T>(ysize, xsize);  // Width and height are swapped.
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(x)[y] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kRotate90) {
+    out = Plane<T>(ysize, xsize);  // Width and height are swapped.
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(x)[ysize - y - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kAntiTranspose) {
+    out = Plane<T>(ysize, xsize);  // Width and height are swapped.
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(xsize - x - 1)[ysize - y - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kRotate270) {
+    out = Plane<T>(ysize, xsize);  // Width and height are swapped.
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(xsize - x - 1)[y] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  }
+  return true;
+}
+}  // namespace
+
+HWY_EXPORT(FloatToU32);
+HWY_EXPORT(FloatToF16);
+
+namespace {
+
+using StoreFuncType = void(uint32_t value, uint8_t* dest);  // Sample writer.
+template <StoreFuncType StoreFunc>
+void StoreUintRow(uint32_t* JXL_RESTRICT* rows_u32, size_t num_channels,
+                  size_t xsize, size_t bytes_per_sample,
+                  uint8_t* JXL_RESTRICT out) {
+  for (size_t x = 0; x < xsize; ++x) {  // Samples of one pixel are adjacent.
+    for (size_t c = 0; c < num_channels; c++) {
+      StoreFunc(rows_u32[c][x],
+                out + (num_channels * x + c) * bytes_per_sample);
+    }
+  }
+}
+
+template <void(StoreFunc)(float, uint8_t*)>
+void StoreFloatRow(const float* JXL_RESTRICT* rows_in, size_t num_channels,
+                   size_t xsize, uint8_t* JXL_RESTRICT out) {
+  for (size_t x = 0; x < xsize; ++x) {  // Samples of one pixel are adjacent.
+    for (size_t c = 0; c < num_channels; c++) {
+      StoreFunc(rows_in[c][x], out + (num_channels * x + c) * sizeof(float));
+    }
+  }
+}
+// Stores the low 8 bits of `value` at dest.
+void JXL_INLINE Store8(uint32_t value, uint8_t* dest) { *dest = value & 0xff; }
+
+// Maximum number of channels for the ConvertChannelsToExternal function.
+const size_t kConvertMaxChannels = 4;
+
+// Converts a list of channels to an interleaved image, applying transformations
+// when needed. Exactly one of out_image and out_callback must be provided.
+// The input channels are given as a (non-const!) array of channel pointers and
+// interleaved in that order; when undo_orientation is not identity, entries of
+// channels[] are redirected to reoriented temporary copies.
+//
+// Note: if a pointer in channels[] is nullptr, a 1.0 value will be used
+// instead (e.g. when a user requests an alpha channel from an image that has
+// none). The first channel may not be nullptr: it determines the image size.
+Status ConvertChannelsToExternal(const ImageF* channels[], size_t num_channels,
+                                 size_t bits_per_sample, bool float_out,
+                                 JxlEndianness endianness, size_t stride,
+                                 jxl::ThreadPool* pool, void* out_image,
+                                 size_t out_size,
+                                 const PixelCallback& out_callback,
+                                 jxl::Orientation undo_orientation) {
+  JXL_DASSERT(num_channels != 0 && num_channels <= kConvertMaxChannels);
+  JXL_DASSERT(channels[0] != nullptr);
+  JXL_CHECK(float_out ? bits_per_sample == 16 || bits_per_sample == 32
+                      : bits_per_sample > 0 && bits_per_sample <= 16);
+  if (!!out_image == out_callback.IsPresent()) {
+    return JXL_FAILURE(
+        "Must provide either an out_image or an out_callback, but not both.");
+  }
+
+  const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
+  const size_t bytes_per_pixel = num_channels * bytes_per_channel;
+
+  std::vector<std::vector<uint8_t>> row_out_callback;
+  const auto FreeCallbackOpaque = [&out_callback](void* p) {
+    out_callback.destroy(p);
+  };
+  std::unique_ptr<void, decltype(FreeCallbackOpaque)> out_run_opaque(
+      nullptr, FreeCallbackOpaque);
+  auto InitOutCallback = [&](size_t num_threads) -> Status {
+    if (out_callback.IsPresent()) {
+      out_run_opaque.reset(out_callback.Init(num_threads, stride));
+      JXL_RETURN_IF_ERROR(out_run_opaque != nullptr);
+      row_out_callback.resize(num_threads);  // One row buffer per thread.
+      for (size_t i = 0; i < num_threads; ++i) {
+        row_out_callback[i].resize(stride);
+      }
+    }
+    return true;
+  };
+
+  // Channels used to store the transformed original channels if needed.
+  ImageF temp_channels[kConvertMaxChannels];
+  if (undo_orientation != Orientation::kIdentity) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      if (channels[c]) {
+        JXL_RETURN_IF_ERROR(UndoOrientation(undo_orientation, *channels[c],
+                                            temp_channels[c], pool));
+        channels[c] = &(temp_channels[c]);
+      }
+    }
+  }
+
+  // First channel may not be nullptr.
+  size_t xsize = channels[0]->xsize();
+  size_t ysize = channels[0]->ysize();
+  if (stride < bytes_per_pixel * xsize) {
+    return JXL_FAILURE("stride is smaller than scanline width in bytes: %" PRIuS
+                       " vs %" PRIuS,
+                       stride, bytes_per_pixel * xsize);
+  }
+  if (!out_callback.IsPresent() &&
+      out_size < (ysize - 1) * stride + bytes_per_pixel * xsize) {
+    return JXL_FAILURE("out_size is too small to store image");
+  }
+
+  const bool little_endian =
+      endianness == JXL_LITTLE_ENDIAN ||
+      (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
+
+  // Handle the case where a channel is nullptr by creating a single row with
+  // ones to use instead.
+  ImageF ones;
+  for (size_t c = 0; c < num_channels; ++c) {
+    if (!channels[c]) {
+      ones = ImageF(xsize, 1);
+      FillImage(1.0f, &ones);
+      break;
+    }
+  }
+
+  if (float_out) {
+    if (bits_per_sample == 16) {
+      bool swap_endianness = little_endian != IsLittleEndian();
+      Plane<hwy::float16_t> f16_cache;
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, static_cast<uint32_t>(ysize),
+          [&](size_t num_threads) {
+            f16_cache =
+                Plane<hwy::float16_t>(xsize, num_channels * num_threads);
+            return InitOutCallback(num_threads);
+          },
+          [&](const uint32_t task, const size_t thread) {
+            const int64_t y = task;
+            const float* JXL_RESTRICT row_in[kConvertMaxChannels];
+            for (size_t c = 0; c < num_channels; c++) {
+              row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0);
+            }
+            hwy::float16_t* JXL_RESTRICT row_f16[kConvertMaxChannels];
+            for (size_t c = 0; c < num_channels; c++) {
+              row_f16[c] = f16_cache.Row(c + thread * num_channels);
+              HWY_DYNAMIC_DISPATCH(FloatToF16)
+              (row_in[c], row_f16[c], xsize);
+            }
+            uint8_t* row_out =
+                out_callback.IsPresent()
+                    ? row_out_callback[thread].data()
+                    : &(reinterpret_cast<uint8_t*>(out_image))[stride * y];
+            // Interleave the channels of this one scanline.
+            hwy::float16_t* row_f16_out =
+                reinterpret_cast<hwy::float16_t*>(row_out);
+            for (size_t x = 0; x < xsize; x++) {
+              for (size_t c = 0; c < num_channels; c++) {
+                row_f16_out[x * num_channels + c] = row_f16[c][x];
+              }
+            }
+            if (swap_endianness) {
+              size_t size = xsize * num_channels * 2;
+              for (size_t i = 0; i < size; i += 2) {
+                std::swap(row_out[i + 0], row_out[i + 1]);
+              }
+            }
+            if (out_callback.IsPresent()) {
+              out_callback.run(out_run_opaque.get(), thread, 0, y, xsize,
+                               row_out);
+            }
+          },
+          "ConvertF16"));
+    } else if (bits_per_sample == 32) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, static_cast<uint32_t>(ysize),
+          [&](size_t num_threads) { return InitOutCallback(num_threads); },
+          [&](const uint32_t task, const size_t thread) {
+            const int64_t y = task;
+            uint8_t* row_out =
+                out_callback.IsPresent()
+                    ? row_out_callback[thread].data()
+                    : &(reinterpret_cast<uint8_t*>(out_image))[stride * y];
+            const float* JXL_RESTRICT row_in[kConvertMaxChannels];
+            for (size_t c = 0; c < num_channels; c++) {
+              row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0);
+            }
+            if (little_endian) {
+              StoreFloatRow<StoreLEFloat>(row_in, num_channels, xsize, row_out);
+            } else {
+              StoreFloatRow<StoreBEFloat>(row_in, num_channels, xsize, row_out);
+            }
+            if (out_callback.IsPresent()) {
+              out_callback.run(out_run_opaque.get(), thread, 0, y, xsize,
+                               row_out);
+            }
+          },
+          "ConvertFloat"));
+    } else {
+      return JXL_FAILURE("float other than 16-bit and 32-bit not supported");
+    }
+  } else {
+    // Multiplier to convert from floating point 0-1 range to the integer
+    // range.
+    float mul = (1ull << bits_per_sample) - 1;  // Largest integer sample.
+    Plane<uint32_t> u32_cache;
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize),
+        [&](size_t num_threads) {
+          u32_cache = Plane<uint32_t>(xsize, num_channels * num_threads);
+          return InitOutCallback(num_threads);
+        },
+        [&](const uint32_t task, const size_t thread) {
+          const int64_t y = task;
+          uint8_t* row_out =
+              out_callback.IsPresent()
+                  ? row_out_callback[thread].data()
+                  : &(reinterpret_cast<uint8_t*>(out_image))[stride * y];
+          const float* JXL_RESTRICT row_in[kConvertMaxChannels];
+          for (size_t c = 0; c < num_channels; c++) {
+            row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0);
+          }
+          uint32_t* JXL_RESTRICT row_u32[kConvertMaxChannels];
+          for (size_t c = 0; c < num_channels; c++) {
+            row_u32[c] = u32_cache.Row(c + thread * num_channels);
+            // row_u32[] is a per-thread temporary row storage, this isn't
+            // intended to be initialized on a previous run.
+            msan::PoisonMemory(row_u32[c], xsize * sizeof(row_u32[c][0]));
+            HWY_DYNAMIC_DISPATCH(FloatToU32)
+            (row_in[c], row_u32[c], xsize, mul, bits_per_sample);
+          }
+          if (bits_per_sample <= 8) {  // One byte per sample.
+            StoreUintRow<Store8>(row_u32, num_channels, xsize, 1, row_out);
+          } else {  // Two bytes per sample, in the requested byte order.
+            if (little_endian) {
+              StoreUintRow<StoreLE16>(row_u32, num_channels, xsize, 2, row_out);
+            } else {
+              StoreUintRow<StoreBE16>(row_u32, num_channels, xsize, 2, row_out);
+            }
+          }
+          if (out_callback.IsPresent()) {
+            out_callback.run(out_run_opaque.get(), thread, 0, y, xsize,
+                             row_out);
+          }
+        },
+        "ConvertUint"));
+  }
+  return true;
+}
+
+}  // namespace
+
+Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
+                         bool float_out, size_t num_channels,
+                         JxlEndianness endianness, size_t stride,
+                         jxl::ThreadPool* pool, void* out_image,
+                         size_t out_size, const PixelCallback& out_callback,
+                         jxl::Orientation undo_orientation,
+                         bool unpremul_alpha) {
+  bool want_alpha = num_channels == 2 || num_channels == 4;  // GA or RGBA.
+  size_t color_channels = num_channels <= 2 ? 1 : 3;
+
+  const Image3F* color = &ib.color();
+  // Undo premultiplied alpha on a copy of the color planes, if requested.
+  Image3F unpremul;
+  if (ib.AlphaIsPremultiplied() && ib.HasAlpha() && unpremul_alpha) {
+    unpremul = Image3F(color->xsize(), color->ysize());
+    CopyImageTo(*color, &unpremul);
+    for (size_t y = 0; y < unpremul.ysize(); y++) {
+      UnpremultiplyAlpha(unpremul.PlaneRow(0, y), unpremul.PlaneRow(1, y),
+                         unpremul.PlaneRow(2, y), ib.alpha().Row(y),
+                         unpremul.xsize());
+    }
+    color = &unpremul;
+  }
+
+  const ImageF* channels[kConvertMaxChannels];
+  size_t c = 0;
+  for (; c < color_channels; c++) {
+    channels[c] = &color->Plane(c);
+  }
+  if (want_alpha) {  // A missing alpha plane is passed as nullptr.
+    channels[c++] = ib.HasAlpha() ? &ib.alpha() : nullptr;
+  }
+  JXL_ASSERT(num_channels == c);
+
+  return ConvertChannelsToExternal(
+      channels, num_channels, bits_per_sample, float_out, endianness, stride,
+      pool, out_image, out_size, out_callback, undo_orientation);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image.h b/third_party/jpeg-xl/lib/jxl/dec_external_image.h
new file mode 100644
index 0000000000..6ca7abff62
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.h
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_EXTERNAL_IMAGE_H_
+#define LIB_JXL_DEC_EXTERNAL_IMAGE_H_
+
+// Interleaved image for color transforms and Codec.
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Converts ib to an interleaved pixel buffer with the given format, written
+// either to out_image (out_size bytes) or row by row through out_callback;
+// exactly one of the two must be provided.
+// bits_per_sample: must be 16 or 32 if float_out is true, and between 1 and
+// 16 if it is false. No bit packing is done.
+// num_channels: must be 1, 2, 3 or 4 for gray, gray+alpha, RGB, RGB+alpha.
+// This supports the features needed for the C API and does not perform
+// color space conversion.
+// TODO(lode): support rectangle crop.
+// stride_out: output scanline bytes, >= output_xsize * output_bytes_per_pixel.
+// undo_orientation: EXIF orientation to undo; for some orientations the
+// output xsize and ysize are swapped compared to the input.
+Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
+                         bool float_out, size_t num_channels,
+                         JxlEndianness endianness, size_t stride_out,
+                         jxl::ThreadPool* thread_pool, void* out_image,
+                         size_t out_size, const PixelCallback& out_callback,
+                         jxl::Orientation undo_orientation,
+                         bool unpremul_alpha = false);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_EXTERNAL_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc b/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc
new file mode 100644
index 0000000000..c87a4d5f36
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc
@@ -0,0 +1,56 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+// Decoder case: interleaves a zero-filled float RGB + alpha image to 8 bits.
+void BM_DecExternalImage_ConvertImageRGBA(benchmark::State& state) {
+  const size_t kNumIter = 5;  // Conversions per benchmark iteration.
+  size_t xsize = state.range();
+  size_t ysize = state.range();  // Same value as xsize: square image.
+  size_t num_channels = 4;
+
+  ImageMetadata im;
+  im.SetAlphaBits(8);
+  ImageBundle ib(&im);
+  Image3F color(xsize, ysize);
+  ZeroFillImage(&color);
+  ib.SetFromImage(std::move(color), ColorEncoding::SRGB());
+  ImageF alpha(xsize, ysize);
+  ZeroFillImage(&alpha);
+  ib.SetAlpha(std::move(alpha));
+
+  const size_t bytes_per_row = xsize * num_channels;  // One byte per sample.
+  std::vector<uint8_t> interleaved(bytes_per_row * ysize);
+
+  for (auto _ : state) {
+    for (size_t i = 0; i < kNumIter; ++i) {
+      JXL_CHECK(ConvertToExternal(
+          ib,
+          /*bits_per_sample=*/8,
+          /*float_out=*/false, num_channels, JXL_NATIVE_ENDIAN,
+          /*stride*/ bytes_per_row,
+          /*thread_pool=*/nullptr, interleaved.data(), interleaved.size(),
+          /*out_callback=*/{},
+          /*undo_orientation=*/jxl::Orientation::kIdentity));
+    }
+  }
+
+  // Pixels per second.
+  state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize);
+  state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size());
+}
+
+BENCHMARK(BM_DecExternalImage_ConvertImageRGBA)
+    ->RangeMultiplier(2)
+    ->Range(256, 2048);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.cc b/third_party/jpeg-xl/lib/jxl/dec_frame.cc
new file mode 100644
index 0000000000..98508e431b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_frame.cc
@@ -0,0 +1,878 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_frame.h"
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <atomic>
+#include <hwy/aligned_allocator.h>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_group.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/splines.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+
+namespace {
+Status DecodeGlobalDCInfo(BitReader* reader, bool is_jpeg,
+                          PassesDecoderState* state, ThreadPool* pool) {
+  PROFILER_FUNC;  // NOTE(review): `pool` is not used in this function.
+  JXL_RETURN_IF_ERROR(state->shared_storage.quantizer.Decode(reader));
+
+  JXL_RETURN_IF_ERROR(
+      DecodeBlockCtxMap(reader, &state->shared_storage.block_ctx_map));
+
+  JXL_RETURN_IF_ERROR(state->shared_storage.cmap.DecodeDC(reader));
+
+  // Pre-compute info for decoding a group.
+  if (is_jpeg) {
+    state->shared_storage.quantizer.ClearDCMul();  // Don't dequant DC
+  }
+
+  state->shared_storage.ac_strategy.FillInvalid();
+  return true;
+}
+}  // namespace
+
+Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
+                   const uint8_t* next_in, size_t avail_in,
+                   ImageBundle* decoded, const CodecMetadata& metadata,
+                   bool use_slow_rendering_pipeline) {
+  FrameDecoder frame_decoder(dec_state, metadata, pool,
+                             use_slow_rendering_pipeline);
+
+  BitReader reader(Span<const uint8_t>(next_in, avail_in));
+  JXL_RETURN_IF_ERROR(frame_decoder.InitFrame(&reader, decoded,
+                                              /*is_preview=*/false));
+  JXL_RETURN_IF_ERROR(frame_decoder.InitFrameOutput());
+
+  JXL_RETURN_IF_ERROR(reader.AllReadsWithinBounds());
+  size_t header_bytes = reader.TotalBitsConsumed() / kBitsPerByte;
+  JXL_RETURN_IF_ERROR(reader.Close());
+
+  size_t processed_bytes = header_bytes;
+  Status close_ok = true;  // Passed to the BitReaderScopedClosers below.
+  std::vector<std::unique_ptr<BitReader>> section_readers;
+  {  // Scope so the section closers are destroyed before close_ok is checked.
+    std::vector<std::unique_ptr<BitReaderScopedCloser>> section_closers;
+    std::vector<FrameDecoder::SectionInfo> section_info;
+    std::vector<FrameDecoder::SectionStatus> section_status;
+    size_t pos = header_bytes;  // Byte offset of the next section in next_in.
+    size_t index = 0;
+    for (auto toc_entry : frame_decoder.Toc()) {
+      JXL_RETURN_IF_ERROR(pos + toc_entry.size <= avail_in);  // In bounds.
+      auto br = make_unique<BitReader>(
+          Span<const uint8_t>(next_in + pos, toc_entry.size));
+      section_info.emplace_back(
+          FrameDecoder::SectionInfo{br.get(), toc_entry.id, index++});
+      section_closers.emplace_back(
+          make_unique<BitReaderScopedCloser>(br.get(), &close_ok));
+      section_readers.emplace_back(std::move(br));
+      pos += toc_entry.size;
+    }
+    section_status.resize(section_info.size());
+    JXL_RETURN_IF_ERROR(frame_decoder.ProcessSections(
+        section_info.data(), section_info.size(), section_status.data()));
+    for (size_t i = 0; i < section_status.size(); i++) {
+      JXL_RETURN_IF_ERROR(section_status[i] == FrameDecoder::kDone);
+      processed_bytes += frame_decoder.Toc()[i].size;
+    }
+  }
+  JXL_RETURN_IF_ERROR(close_ok);
+  JXL_RETURN_IF_ERROR(frame_decoder.FinalizeFrame());
+  decoded->SetDecodedBytes(processed_bytes);  // Header plus all sections.
+  return true;
+}
+
+Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
+                               bool is_preview) {
+  PROFILER_FUNC;
+  decoded_ = decoded;
+  JXL_ASSERT(is_finalized_);  // presumably: previous frame was finalized.
+
+  // Reset the dequantization matrices to their default values.
+  dec_state_->shared_storage.matrices = DequantMatrices();
+
+  frame_header_.nonserialized_is_preview = is_preview;
+  JXL_ASSERT(frame_header_.nonserialized_metadata != nullptr);
+  JXL_RETURN_IF_ERROR(ReadFrameHeader(br, &frame_header_));
+  frame_dim_ = frame_header_.ToFrameDimensions();
+  JXL_DEBUG_V(2, "FrameHeader: %s", frame_header_.DebugString().c_str());
+
+  const size_t num_passes = frame_header_.passes.num_passes;
+  const size_t num_groups = frame_dim_.num_groups;
+
+  // If the previous frame was not a kRegularFrame, `decoded` may have different
+  // dimensions; must reset to avoid errors.
+  decoded->RemoveColor();
+  decoded->ClearExtraChannels();
+
+  decoded->duration = frame_header_.animation_frame.duration;
+
+  if (!frame_header_.nonserialized_is_preview &&
+      (frame_header_.is_last || frame_header_.animation_frame.duration > 0) &&
+      (frame_header_.frame_type == kRegularFrame ||
+       frame_header_.frame_type == kSkipProgressive)) {
+    ++dec_state_->visible_frame_index;
+    dec_state_->nonvisible_frame_index = 0;
+  } else {
+    ++dec_state_->nonvisible_frame_index;
+  }
+
+  // Read TOC.
+  const bool has_ac_global = true;
+  const size_t toc_entries = NumTocEntries(num_groups, frame_dim_.num_dc_groups,
+                                           num_passes, has_ac_global);
+  std::vector<uint32_t> sizes;
+  std::vector<coeff_order_t> permutation;
+  JXL_RETURN_IF_ERROR(ReadToc(toc_entries, br, &sizes, &permutation));
+  bool have_permutation = !permutation.empty();
+  toc_.resize(toc_entries);
+  section_sizes_sum_ = 0;
+  for (size_t i = 0; i < toc_entries; ++i) {
+    toc_[i].size = sizes[i];
+    size_t index = have_permutation ? permutation[i] : i;
+    toc_[index].id = i;
+    if (section_sizes_sum_ + toc_[i].size < section_sizes_sum_) {  // Wrapped.
+      return JXL_FAILURE("group offset overflow");
+    }
+    section_sizes_sum_ += toc_[i].size;
+  }
+
+  JXL_DASSERT((br->TotalBitsConsumed() % kBitsPerByte) == 0);
+  const size_t group_codes_begin = br->TotalBitsConsumed() / kBitsPerByte;
+  JXL_DASSERT(!toc_.empty());
+
+  // Overflow check: header bytes plus all section sizes must not wrap around.
+  if (group_codes_begin + section_sizes_sum_ < group_codes_begin) {
+    return JXL_FAILURE("Invalid group codes");
+  }
+
+  if (!frame_header_.chroma_subsampling.Is444() &&
+      !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) &&
+      frame_header_.encoding == FrameEncoding::kVarDCT) {
+    return JXL_FAILURE(
+        "Non-444 chroma subsampling is not allowed when adaptive DC "
+        "smoothing is enabled");
+  }
+  return true;
+}
+
+Status FrameDecoder::InitFrameOutput() {  // Prepares state for a new frame.
+  JXL_RETURN_IF_ERROR(
+      InitializePassesSharedState(frame_header_, &dec_state_->shared_storage));
+  JXL_RETURN_IF_ERROR(dec_state_->Init());
+  modular_frame_decoder_.Init(frame_dim_);
+  // JPEG output: validate the setup and size each component's buffers.
+  if (decoded_->IsJPEG()) {
+    if (frame_header_.encoding == FrameEncoding::kModular) {
+      return JXL_FAILURE("Cannot output JPEG from Modular");
+    }
+    jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get();
+    size_t num_components = jpeg_data->components.size();
+    if (num_components != 1 && num_components != 3) {  // Only 1 or 3 allowed.
+      return JXL_FAILURE("Invalid number of components");
+    }
+    if (frame_header_.nonserialized_metadata->m.xyb_encoded) {
+      return JXL_FAILURE("Cannot decode to JPEG an XYB image");
+    }
+    auto jpeg_c_map = JpegOrder(ColorTransform::kYCbCr, num_components == 1);
+    decoded_->jpeg_data->width = frame_dim_.xsize;
+    decoded_->jpeg_data->height = frame_dim_.ysize;
+    for (size_t c = 0; c < num_components; c++) {
+      auto& component = jpeg_data->components[jpeg_c_map[c]];
+      component.width_in_blocks =
+          frame_dim_.xsize_blocks >> frame_header_.chroma_subsampling.HShift(c);
+      component.height_in_blocks =
+          frame_dim_.ysize_blocks >> frame_header_.chroma_subsampling.VShift(c);
+      component.h_samp_factor =
+          1 << frame_header_.chroma_subsampling.RawHShift(c);
+      component.v_samp_factor =
+          1 << frame_header_.chroma_subsampling.RawVShift(c);
+      component.coeffs.resize(component.width_in_blocks *
+                              component.height_in_blocks * jxl::kDCTBlockSize);
+    }
+  }
+
+  // Clear the per-frame progress-tracking state.
+  decoded_dc_global_ = false;
+  decoded_ac_global_ = false;
+  is_finalized_ = false;
+  finalized_dc_ = false;
+  num_sections_done_ = 0;
+  decoded_dc_groups_.clear();
+  decoded_dc_groups_.resize(frame_dim_.num_dc_groups);
+  decoded_passes_per_ac_group_.clear();
+  decoded_passes_per_ac_group_.resize(frame_dim_.num_groups, 0);
+  processed_section_.clear();
+  processed_section_.resize(toc_.size());  // One flag per TOC entry.
+  allocated_ = false;  // Makes the next AllocateOutput() call do the work.
+  return true;
+}
+
+Status FrameDecoder::ProcessDCGlobal(BitReader* br) {
+  PROFILER_FUNC;
+  PassesSharedState& shared = dec_state_->shared_storage;
+  if (shared.frame_header.flags & FrameHeader::kPatches) {
+    bool uses_extra_channels = false;
+    JXL_RETURN_IF_ERROR(shared.image_features.patches.Decode(
+        br, frame_dim_.xsize_padded, frame_dim_.ysize_padded,
+        &uses_extra_channels));
+    if (uses_extra_channels && frame_header_.upsampling != 1) {
+      for (size_t ecups : frame_header_.extra_channel_upsampling) {
+        if (ecups != frame_header_.upsampling) {  // Factors must all match.
+          return JXL_FAILURE(
+              "Cannot use extra channels in patches if color channels are "
+              "subsampled differently from extra channels");
+        }
+      }
+    }
+  } else {
+    shared.image_features.patches.Clear();  // Frame has no patches.
+  }
+  shared.image_features.splines.Clear();  // Always reset before decoding.
+  if (shared.frame_header.flags & FrameHeader::kSplines) {
+    JXL_RETURN_IF_ERROR(shared.image_features.splines.Decode(
+        br, frame_dim_.xsize * frame_dim_.ysize));
+  }
+  if (shared.frame_header.flags & FrameHeader::kNoise) {
+    JXL_RETURN_IF_ERROR(DecodeNoise(br, &shared.image_features.noise_params));
+  }
+  JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br));
+  // Global DC info is only decoded for VarDCT frames.
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(
+        jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_));
+  }
+  // Splines' draw cache uses the color correlation map.
+  if (shared.frame_header.flags & FrameHeader::kSplines) {
+    JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache(
+        frame_dim_.xsize_upsampled, frame_dim_.ysize_upsampled,
+        dec_state_->shared->cmap));
+  }
+  Status dec_status = modular_frame_decoder_.DecodeGlobalInfo(
+      br, frame_header_, /*allow_truncated_group=*/false);
+  if (dec_status.IsFatalError()) return dec_status;
+  if (dec_status) {  // Mark DC global as decoded only on full success.
+    decoded_dc_global_ = true;
+  }
+  return dec_status;  // May be a non-fatal, non-OK status.
+}
+
+Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) {
+  PROFILER_FUNC;
+  const size_t gx = dc_group_id % frame_dim_.xsize_dc_groups;  // Group column.
+  const size_t gy = dc_group_id / frame_dim_.xsize_dc_groups;  // Group row.
+  const LoopFilter& lf = dec_state_->shared->frame_header.loop_filter;
+  if (frame_header_.encoding == FrameEncoding::kVarDCT &&
+      !(frame_header_.flags & FrameHeader::kUseDcFrame)) {  // Skipped if set.
+    JXL_RETURN_IF_ERROR(
+        modular_frame_decoder_.DecodeVarDCTDC(dc_group_id, br, dec_state_));
+  }
+  const Rect mrect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim,
+                   frame_dim_.dc_group_dim, frame_dim_.dc_group_dim);
+  JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
+      mrect, br, 3, 1000, ModularStreamId::ModularDC(dc_group_id),
+      /*zerofill=*/false, nullptr, nullptr,
+      /*allow_truncated=*/false));
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(
+        modular_frame_decoder_.DecodeAcMetadata(dc_group_id, br, dec_state_));
+  } else if (lf.epf_iters > 0) {  // Non-VarDCT frame with EPF: constant sigma.
+    FillImage(kInvSigmaNum / lf.epf_sigma_for_modular, &dec_state_->sigma);
+  }
+  decoded_dc_groups_[dc_group_id] = uint8_t{true};  // This DC group is done.
+  return true;
+}
+
+void FrameDecoder::FinalizeDC() {
+  // Must run after all ProcessDCGroup calls and before any ProcessACGroup.
+  const auto flags = frame_header_.flags;
+  const bool smoothing_off =
+      (flags & FrameHeader::kSkipAdaptiveDCSmoothing) != 0 ||
+      (flags & FrameHeader::kUseDcFrame) != 0;
+  if (frame_header_.encoding == FrameEncoding::kVarDCT && !smoothing_off) {
+    AdaptiveDCSmoothing(dec_state_->shared->quantizer.MulDC(),
+                        &dec_state_->shared_storage.dc_storage, pool_);
+  }
+  finalized_dc_ = true;
+}
+
+Status FrameDecoder::AllocateOutput() {
+  if (allocated_) return true;  // Work below runs once until the flag resets.
+  modular_frame_decoder_.MaybeDropFullImage();
+  decoded_->origin = dec_state_->shared->frame_header.frame_origin;
+  JXL_RETURN_IF_ERROR(dec_state_->InitForAC(nullptr));
+  allocated_ = true;  // Only reached if InitForAC() succeeded.
+  return true;
+}
+
+Status FrameDecoder::ProcessACGlobal(BitReader* br) {
+  JXL_CHECK(finalized_dc_);  // AC global requires DC to be finalized first.
+
+  // Decode the AC global data (only read for VarDCT frames).
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.Decode(
+        br, &modular_frame_decoder_));
+    JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.EnsureComputed(
+        dec_state_->used_acs));
+
+    size_t num_histo_bits =
+        CeilLog2Nonzero(dec_state_->shared->frame_dim.num_groups);
+    dec_state_->shared_storage.num_histograms =
+        1 + br->ReadBits(num_histo_bits);  // Coded as (count - 1).
+
+    dec_state_->code.resize(kMaxNumPasses);
+    dec_state_->context_map.resize(kMaxNumPasses);
+    // Read coefficient orders and histograms.
+    size_t max_num_bits_ac = 0;
+    for (size_t i = 0;
+         i < dec_state_->shared_storage.frame_header.passes.num_passes; i++) {
+      uint16_t used_orders = U32Coder::Read(kOrderEnc, br);
+      JXL_RETURN_IF_ERROR(DecodeCoeffOrders(
+          used_orders, dec_state_->used_acs,
+          &dec_state_->shared_storage
+               .coeff_orders[i * dec_state_->shared_storage.coeff_order_size],
+          br));
+      size_t num_contexts =
+          dec_state_->shared->num_histograms *
+          dec_state_->shared_storage.block_ctx_map.NumACContexts();
+      JXL_RETURN_IF_ERROR(DecodeHistograms(
+          br, num_contexts, &dec_state_->code[i], &dec_state_->context_map[i]));
+      // Add extra values to enable the cheat in hot loop of DecodeACVarBlock.
+      dec_state_->context_map[i].resize(
+          num_contexts + kZeroDensityContextLimit - kZeroDensityContextCount);
+      max_num_bits_ac =
+          std::max(max_num_bits_ac, dec_state_->code[i].max_num_bits);
+    }
+    max_num_bits_ac += CeilLog2Nonzero(
+        dec_state_->shared_storage.frame_header.passes.num_passes);
+    // 16-bit buffer for decoding to JPEG are not implemented.
+    // TODO(veluca): figure out the exact limit - 16 should still work with
+    // 16-bit buffers, but we are excluding it for safety.
+    bool use_16_bit = max_num_bits_ac < 16 && !decoded_->IsJPEG();
+    bool store = frame_header_.passes.num_passes > 1;  // Else buffer is 0x0.
+    size_t xs = store ? kGroupDim * kGroupDim : 0;
+    size_t ys = store ? frame_dim_.num_groups : 0;
+    if (use_16_bit) {
+      dec_state_->coefficients = make_unique<ACImageT<int16_t>>(xs, ys);
+    } else {
+      dec_state_->coefficients = make_unique<ACImageT<int32_t>>(xs, ys);
+    }
+    if (store) {
+      dec_state_->coefficients->ZeroFill();
+    }
+  }
+
+  // Set JPEG decoding data.
+  if (decoded_->IsJPEG()) {
+    decoded_->color_transform = frame_header_.color_transform;
+    decoded_->chroma_subsampling = frame_header_.chroma_subsampling;
+    const std::vector<QuantEncoding>& qe =
+        dec_state_->shared_storage.matrices.encodings();
+    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
+        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
+      return JXL_FAILURE(
+          "Quantization table is not a JPEG quantization table.");
+    }
+    jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get();
+    size_t num_components = jpeg_data->components.size();
+    bool is_gray = (num_components == 1);
+    auto jpeg_c_map = JpegOrder(frame_header_.color_transform, is_gray);
+    size_t qt_set = 0;  // Bit i is set when quant table i is in use.
+    for (size_t c = 0; c < num_components; c++) {
+      // TODO(eustas): why 1-st quant table for gray?
+      size_t quant_c = is_gray ? 1 : c;
+      size_t qpos = jpeg_data->components[jpeg_c_map[c]].quant_idx;
+      JXL_CHECK(qpos < jpeg_data->quant.size());  // Reject any OOB index.
+      qt_set |= 1 << qpos;
+      for (size_t x = 0; x < 8; x++) {  // Copy with x and y swapped.
+        for (size_t y = 0; y < 8; y++) {
+          jpeg_data->quant[qpos].values[x * 8 + y] =
+              (*qe[0].qraw.qtable)[quant_c * 64 + y * 8 + x];
+        }
+      }
+    }
+    for (size_t i = 0; i < jpeg_data->quant.size(); i++) {
+      if (qt_set & (1 << i)) continue;
+      if (i == 0) return JXL_FAILURE("First quant table unused.");
+      // Unused quant table is set to copy of previous quant table
+      for (size_t j = 0; j < 64; j++) {
+        jpeg_data->quant[i].values[j] = jpeg_data->quant[i - 1].values[j];
+      }
+    }
+  }
+  decoded_ac_global_ = true;
+  return true;
+}
+
+Status FrameDecoder::ProcessACGroup(size_t ac_group_id,
+                                    BitReader* JXL_RESTRICT* br,
+                                    size_t num_passes, size_t thread,
+                                    bool force_draw, bool dc_only) {
+  PROFILER_ZONE("process_group");
+  size_t group_dim = frame_dim_.group_dim;
+  const size_t gx = ac_group_id % frame_dim_.xsize_groups;  // Group column.
+  const size_t gy = ac_group_id / frame_dim_.xsize_groups;  // Group row.
+  const size_t x = gx * group_dim;
+  const size_t y = gy * group_dim;
+  JXL_DEBUG_V(3,
+              "Processing AC group %" PRIuS "(%" PRIuS ",%" PRIuS
+              ") group_dim: %" PRIuS " decoded passes: %u new passes: %" PRIuS,
+              ac_group_id, gx, gy, group_dim,
+              decoded_passes_per_ac_group_[ac_group_id], num_passes);
+
+  RenderPipelineInput render_pipeline_input =
+      dec_state_->render_pipeline->GetInputBuffers(ac_group_id, thread);
+
+  bool should_run_pipeline = true;  // DecodeGroup() below may change this.
+
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    group_dec_caches_[thread].InitOnce(frame_header_.passes.num_passes,
+                                       dec_state_->used_acs);
+    JXL_RETURN_IF_ERROR(DecodeGroup(br, num_passes, ac_group_id, dec_state_,
+                                    &group_dec_caches_[thread], thread,
+                                    render_pipeline_input, decoded_,
+                                    decoded_passes_per_ac_group_[ac_group_id],
+                                    force_draw, dc_only, &should_run_pipeline));
+  }
+
+  // don't limit to image dimensions here (is done in DecodeGroup)
+  const Rect mrect(x, y, group_dim, group_dim);
+  bool modular_ready = false;
+  size_t pass0 = decoded_passes_per_ac_group_[ac_group_id];  // First new pass.
+  size_t pass1 =
+      force_draw ? frame_header_.passes.num_passes : pass0 + num_passes;
+  for (size_t i = pass0; i < pass1; ++i) {
+    int minShift, maxShift;
+    frame_header_.passes.GetDownsamplingBracket(i, minShift, maxShift);
+    bool modular_pass_ready = true;
+    if (i < pass0 + num_passes) {  // A reader was supplied for this pass.
+      JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
+          mrect, br[i - pass0], minShift, maxShift,
+          ModularStreamId::ModularAC(ac_group_id, i),
+          /*zerofill=*/false, dec_state_, &render_pipeline_input,
+          /*allow_truncated=*/false, &modular_pass_ready));
+    } else {  // force_draw only: no data for this pass, so zero-fill it.
+      JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
+          mrect, nullptr, minShift, maxShift,
+          ModularStreamId::ModularAC(ac_group_id, i), /*zerofill=*/true,
+          dec_state_, &render_pipeline_input,
+          /*allow_truncated=*/false, &modular_pass_ready));
+    }
+    if (modular_pass_ready) modular_ready = true;
+  }
+  decoded_passes_per_ac_group_[ac_group_id] += num_passes;  // Real passes only.
+
+  if ((frame_header_.flags & FrameHeader::kNoise) != 0) {
+    PROFILER_ZONE("GenerateNoise");
+    size_t noise_c_start =
+        3 + frame_header_.nonserialized_metadata->m.num_extra_channels;
+    // When the color channels are downsampled, we need to generate more noise
+    // input for the current group than just the group dimensions.
+    std::pair<ImageF*, Rect> rects[3];
+    for (size_t iy = 0; iy < frame_header_.upsampling; iy++) {
+      for (size_t ix = 0; ix < frame_header_.upsampling; ix++) {
+        for (size_t c = 0; c < 3; c++) {
+          auto r = render_pipeline_input.GetBuffer(noise_c_start + c);
+          rects[c].first = r.first;
+          size_t x1 = r.second.x0() + r.second.xsize();
+          size_t y1 = r.second.y0() + r.second.ysize();
+          rects[c].second = Rect(r.second.x0() + ix * group_dim,
+                                 r.second.y0() + iy * group_dim, group_dim,
+                                 group_dim, x1, y1);
+        }
+        Random3Planes(dec_state_->visible_frame_index,
+                      dec_state_->nonvisible_frame_index,
+                      (gx * frame_header_.upsampling + ix) * group_dim,
+                      (gy * frame_header_.upsampling + iy) * group_dim,
+                      rects[0], rects[1], rects[2]);
+      }
+    }
+  }
+
+  if (!modular_frame_decoder_.UsesFullImage() && !decoded_->IsJPEG()) {
+    if (should_run_pipeline && modular_ready) {
+      render_pipeline_input.Done();
+    } else if (force_draw) {  // A forced draw must always produce output.
+      return JXL_FAILURE("Modular group decoding failed.");
+    }
+  }
+  return true;
+}
+
+void FrameDecoder::MarkSections(const SectionInfo* sections, size_t num,
+                                SectionStatus* section_status) {
+  // Count only the sections that finished; every other section has its
+  // "processed" flag cleared.
+  for (size_t s = 0; s < num; ++s) {
+    const bool done = section_status[s] == SectionStatus::kDone;
+    if (done) ++num_sections_done_;
+    if (!done) processed_section_[sections[s].id] = false;
+  }
+}
+
+Status FrameDecoder::ProcessSections(const SectionInfo* sections, size_t num,
+                                     SectionStatus* section_status) {
+  if (num == 0) return true;  // Nothing to process
+  std::fill(section_status, section_status + num, SectionStatus::kSkipped);
+  size_t dc_global_sec = num;  // `num` serves as the "not provided" value.
+  size_t ac_global_sec = num;
+  std::vector<size_t> dc_group_sec(frame_dim_.num_dc_groups, num);
+  std::vector<std::vector<size_t>> ac_group_sec(
+      frame_dim_.num_groups,
+      std::vector<size_t>(frame_header_.passes.num_passes, num));
+  // This keeps track of the number of ac passes we want to process during this
+  // call of ProcessSections.
+  std::vector<size_t> desired_num_ac_passes(frame_dim_.num_groups);
+  bool single_section =
+      frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1;
+  if (single_section) {  // Section 0 then carries all four kinds of data.
+    JXL_ASSERT(num == 1);
+    JXL_ASSERT(sections[0].id == 0);
+    if (processed_section_[0] == false) {
+      processed_section_[0] = true;
+      ac_group_sec[0].resize(1);
+      dc_global_sec = ac_global_sec = dc_group_sec[0] = ac_group_sec[0][0] = 0;
+      desired_num_ac_passes[0] = 1;
+    } else {
+      section_status[0] = SectionStatus::kDuplicate;
+    }
+  } else {
+    size_t ac_global_index = frame_dim_.num_dc_groups + 1;  // After DC groups.
+    for (size_t i = 0; i < num; i++) {
+      JXL_ASSERT(sections[i].id < processed_section_.size());
+      if (processed_section_[sections[i].id]) {
+        section_status[i] = SectionStatus::kDuplicate;
+        continue;
+      }
+      if (sections[i].id == 0) {
+        dc_global_sec = i;
+      } else if (sections[i].id < ac_global_index) {
+        dc_group_sec[sections[i].id - 1] = i;
+      } else if (sections[i].id == ac_global_index) {
+        ac_global_sec = i;
+      } else {
+        size_t ac_idx = sections[i].id - ac_global_index - 1;
+        size_t acg = ac_idx % frame_dim_.num_groups;  // AC group index.
+        size_t acp = ac_idx / frame_dim_.num_groups;  // Pass index.
+        if (acp >= frame_header_.passes.num_passes) {
+          return JXL_FAILURE("Invalid section ID");
+        }
+        ac_group_sec[acg][acp] = i;
+      }
+      processed_section_[sections[i].id] = true;
+    }
+    // Count number of new passes per group.
+    for (size_t g = 0; g < ac_group_sec.size(); g++) {
+      size_t j = 0;
+      for (; j + decoded_passes_per_ac_group_[g] <
+             frame_header_.passes.num_passes;
+           j++) {
+        if (ac_group_sec[g][j + decoded_passes_per_ac_group_[g]] == num) {
+          break;  // Stop at the first pass that was not provided.
+        }
+      }
+      desired_num_ac_passes[g] = j;
+    }
+  }
+  if (dc_global_sec != num) {
+    Status dc_global_status = ProcessDCGlobal(sections[dc_global_sec].br);
+    if (dc_global_status.IsFatalError()) return dc_global_status;
+    if (dc_global_status) {
+      section_status[dc_global_sec] = SectionStatus::kDone;
+    } else {
+      section_status[dc_global_sec] = SectionStatus::kPartial;
+    }
+  }
+  // DC groups are only decoded once DC global has been decoded.
+  std::atomic<bool> has_error{false};  // Set from the RunOnPool callbacks.
+  if (decoded_dc_global_) {
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool_, 0, dc_group_sec.size(), ThreadPool::NoInit,
+        [this, &dc_group_sec, &num, &sections, &section_status, &has_error](
+            size_t i, size_t thread) {
+          if (dc_group_sec[i] != num) {
+            if (!ProcessDCGroup(i, sections[dc_group_sec[i]].br)) {
+              has_error = true;
+            } else {
+              section_status[dc_group_sec[i]] = SectionStatus::kDone;
+            }
+          }
+        },
+        "DecodeDCGroup"));
+  }
+  if (has_error) return JXL_FAILURE("Error in DC group");
+  // Once every DC group is decoded, prepare the pipeline and finalize DC.
+  if (*std::min_element(decoded_dc_groups_.begin(), decoded_dc_groups_.end()) &&
+      !finalized_dc_) {
+    PassesDecoderState::PipelineOptions pipeline_options;
+    pipeline_options.use_slow_render_pipeline = use_slow_rendering_pipeline_;
+    pipeline_options.coalescing = coalescing_;
+    pipeline_options.render_spotcolors = render_spotcolors_;
+    JXL_RETURN_IF_ERROR(
+        dec_state_->PreparePipeline(decoded_, pipeline_options));
+    FinalizeDC();
+    JXL_RETURN_IF_ERROR(AllocateOutput());
+    if (progressive_detail_ >= JxlProgressiveDetail::kDC) {
+      MarkSections(sections, num, section_status);
+      return true;  // Return right after DC; AC is left for a later call.
+    }
+  }
+  // AC global is only processed after DC has been finalized.
+  if (finalized_dc_ && ac_global_sec != num && !decoded_ac_global_) {
+    JXL_RETURN_IF_ERROR(ProcessACGlobal(sections[ac_global_sec].br));
+    section_status[ac_global_sec] = SectionStatus::kDone;
+  }
+
+  if (progressive_detail_ >= JxlProgressiveDetail::kLastPasses) {
+    // Mark that we only want the next progression pass.
+    size_t target_complete_passes = NextNumPassesToPause();
+    for (size_t i = 0; i < ac_group_sec.size(); i++) {
+      desired_num_ac_passes[i] =
+          std::min(desired_num_ac_passes[i],
+                   target_complete_passes - decoded_passes_per_ac_group_[i]);
+    }
+  }
+
+  if (decoded_ac_global_) {
+    // Mark all the AC groups that we received as not complete yet.
+    for (size_t i = 0; i < ac_group_sec.size(); i++) {
+      if (desired_num_ac_passes[i] != 0) {
+        dec_state_->render_pipeline->ClearDone(i);
+      }
+    }
+
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool_, 0, ac_group_sec.size(),
+        [this](size_t num_threads) {
+          return PrepareStorage(num_threads,
+                                decoded_passes_per_ac_group_.size());
+        },
+        [this, &ac_group_sec, &desired_num_ac_passes, &num, &sections,
+         &section_status, &has_error](size_t g, size_t thread) {
+          if (desired_num_ac_passes[g] == 0) {
+            // no new AC pass, nothing to do
+            return;
+          }
+          (void)num;  // Presumably for builds where JXL_ASSERT is a no-op.
+          size_t first_pass = decoded_passes_per_ac_group_[g];
+          BitReader* JXL_RESTRICT readers[kMaxNumPasses];
+          for (size_t i = 0; i < desired_num_ac_passes[g]; i++) {
+            JXL_ASSERT(ac_group_sec[g][first_pass + i] != num);
+            readers[i] = sections[ac_group_sec[g][first_pass + i]].br;
+          }
+          if (!ProcessACGroup(g, readers, desired_num_ac_passes[g],
+                              GetStorageLocation(thread, g),
+                              /*force_draw=*/false, /*dc_only=*/false)) {
+            has_error = true;
+          } else {
+            for (size_t i = 0; i < desired_num_ac_passes[g]; i++) {
+              section_status[ac_group_sec[g][first_pass + i]] =
+                  SectionStatus::kDone;
+            }
+          }
+        },
+        "DecodeGroup"));
+  }
+  if (has_error) return JXL_FAILURE("Error in AC group");
+
+  MarkSections(sections, num, section_status);
+  return true;
+}
+
+Status FrameDecoder::Flush() {
+  bool has_blending = frame_header_.blending_info.mode != BlendMode::kReplace ||
+                      frame_header_.custom_size_or_origin;
+  for (const auto& blending_info_ec :
+       frame_header_.extra_channel_blending_info) {
+    if (blending_info_ec.mode != BlendMode::kReplace) has_blending = true;
+  }
+  // No early Flush() if blending is enabled.
+  if (has_blending && !is_finalized_) {
+    return false;  // Reported to the caller as a non-OK status.
+  }
+  // No early Flush() - nothing to do - if the frame is a kSkipProgressive
+  // frame.
+  if (frame_header_.frame_type == FrameType::kSkipProgressive &&
+      !is_finalized_) {
+    return true;
+  }
+  if (decoded_->IsJPEG()) {
+    // Nothing to do.
+    return true;
+  }
+  JXL_RETURN_IF_ERROR(AllocateOutput());
+  // Smallest number of decoded passes over all AC groups.
+  uint32_t completely_decoded_ac_pass = *std::min_element(
+      decoded_passes_per_ac_group_.begin(), decoded_passes_per_ac_group_.end());
+  if (completely_decoded_ac_pass < frame_header_.passes.num_passes) {
+    // We don't have all AC yet: force a draw of all the missing areas.
+    // Mark all sections as not complete.
+    for (size_t i = 0; i < decoded_passes_per_ac_group_.size(); i++) {
+      if (decoded_passes_per_ac_group_[i] < frame_header_.passes.num_passes) {
+        dec_state_->render_pipeline->ClearDone(i);
+      }
+    }
+    std::atomic<bool> has_error{false};  // Set from the RunOnPool callback.
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool_, 0, decoded_passes_per_ac_group_.size(),
+        [this](const size_t num_threads) {
+          return PrepareStorage(num_threads,
+                                decoded_passes_per_ac_group_.size());
+        },
+        [this, &has_error](const uint32_t g, size_t thread) {
+          if (decoded_passes_per_ac_group_[g] ==
+              frame_header_.passes.num_passes) {
+            // This group was drawn already, nothing to do.
+            return;
+          }
+          BitReader* JXL_RESTRICT readers[kMaxNumPasses] = {};  // All null.
+          bool ok = ProcessACGroup(
+              g, readers, /*num_passes=*/0, GetStorageLocation(thread, g),
+              /*force_draw=*/true, /*dc_only=*/!decoded_ac_global_);
+          if (!ok) has_error = true;
+        },
+        "ForceDrawGroup"));
+    if (has_error) {
+      return JXL_FAILURE("Drawing groups failed");
+    }
+  }
+
+  // undo global modular transforms and copy int pixel buffers to float ones
+  JXL_RETURN_IF_ERROR(modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_,
+                                                              is_finalized_));
+
+  return true;
+}
+
+int FrameDecoder::SavedAs(const FrameHeader& header) {
+  // Maps the header to a single-bit mask, or 0 if neither case applies:
+  // bits 16, 32, 64 and 128 encode the DC level, bits 1, 2, 4 and 8 the
+  // reference index (save_as_reference).
+  if (header.frame_type == FrameType::kDCFrame) {
+    return 16 << (header.dc_level - 1);
+  }
+  if (!header.CanBeReferenced()) return 0;
+  const int reference_bit = 1 << header.save_as_reference;
+  return reference_bit;
+}
+
+bool FrameDecoder::HasEverything() const {
+  // True once both globals, all DC groups and all AC passes are decoded.
+  if (!(decoded_dc_global_ && decoded_ac_global_)) return false;
+  for (size_t i = 0; i < decoded_dc_groups_.size(); ++i) {
+    if (!decoded_dc_groups_[i]) return false;
+  }
+  for (const auto& done_passes : decoded_passes_per_ac_group_) {
+    if (done_passes < frame_header_.passes.num_passes) return false;
+  }
+  return true;
+}
+
+int FrameDecoder::References() const {
+  if (is_finalized_) {
+    return 0;
+  }
+  if (!HasEverything()) return 0;  // Only reported once fully decoded.
+  // Bitmask accumulated below; 0 means no bits were set.
+  int result = 0;
+
+  // Blending
+  if (frame_header_.frame_type == FrameType::kRegularFrame ||
+      frame_header_.frame_type == FrameType::kSkipProgressive) {
+    bool cropped = frame_header_.custom_size_or_origin;
+    if (cropped || frame_header_.blending_info.mode != BlendMode::kReplace) {
+      result |= (1 << frame_header_.blending_info.source);
+    }
+    const auto& extra = frame_header_.extra_channel_blending_info;
+    for (size_t i = 0; i < extra.size(); ++i) {
+      if (cropped || extra[i].mode != BlendMode::kReplace) {
+        result |= (1 << extra[i].source);
+      }
+    }
+  }
+
+  // Patches
+  if (frame_header_.flags & FrameHeader::kPatches) {
+    result |= dec_state_->shared->image_features.patches.GetReferences();
+  }
+
+  // DC Level
+  if (frame_header_.flags & FrameHeader::kUseDcFrame) {
+    // Reads from the next dc level
+    int dc_level = frame_header_.dc_level + 1;
+    // bits 16, 32, 64, 128 for DC level
+    result |= (16 << (dc_level - 1));
+  }
+
+  return result;
+}
+
+Status FrameDecoder::FinalizeFrame() {
+  if (is_finalized_) {
+    return JXL_FAILURE("FinalizeFrame called multiple times");
+  }
+  is_finalized_ = true;  // Set up front, so it stays set if a step below fails.
+  if (decoded_->IsJPEG()) {
+    // Nothing to do.
+    return true;
+  }
+
+  // undo global modular transforms and copy int pixel buffers to float ones
+  JXL_RETURN_IF_ERROR(
+      modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_,
+                                              /*inplace=*/true));
+  // Move the frame into its reference slot if it can be referenced.
+  if (frame_header_.CanBeReferenced()) {
+    auto& info = dec_state_->shared_storage
+                     .reference_frames[frame_header_.save_as_reference];
+    info.frame = std::move(dec_state_->frame_storage_for_referencing);
+    info.ib_is_in_xyb = frame_header_.save_before_color_transform;
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.h b/third_party/jpeg-xl/lib/jxl/dec_frame.h
new file mode 100644
index 0000000000..6b54ac631f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_frame.h
@@ -0,0 +1,329 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_FRAME_H_
+#define LIB_JXL_DEC_FRAME_H_
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/blending.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Decodes a frame. Groups may be processed in parallel by `pool`.
+// `metadata` is the metadata that applies to all frames of the codestream;
+// `decoded->metadata` must already be set and must match `metadata.m`.
+// Used in the encoder to model decoder behaviour, and in tests.
+Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
+                   const uint8_t* next_in, size_t avail_in,
+                   ImageBundle* decoded, const CodecMetadata& metadata,
+                   bool use_slow_rendering_pipeline = false);
+
+// TODO(veluca): implement "forced drawing".
+class FrameDecoder {
+ public:
+  // All parameters must outlive the FrameDecoder.
+  FrameDecoder(PassesDecoderState* dec_state, const CodecMetadata& metadata,
+               ThreadPool* pool, bool use_slow_rendering_pipeline)
+      : dec_state_(dec_state),
+        pool_(pool),
+        frame_header_(&metadata),
+        use_slow_rendering_pipeline_(use_slow_rendering_pipeline) {}
+
+  void SetRenderSpotcolors(bool rsc) { render_spotcolors_ = rsc; }
+  void SetCoalescing(bool c) { coalescing_ = c; }
+
+  // Read FrameHeader and table of contents from the given BitReader.
+  Status InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
+                   bool is_preview);
+
+  // Checks frame dimensions for their limits, and sets the output
+  // image buffer.
+  Status InitFrameOutput();
+
+  struct SectionInfo {
+    BitReader* JXL_RESTRICT br;
+    // Logical index of the section, regardless of any permutation that may be
+    // applied in the table of contents or of the physical position in the file.
+    size_t id;
+    // Index of the section in the order of the bytes inside the frame.
+    size_t index;
+  };
+
+  struct TocEntry {
+    size_t size;
+    size_t id;
+  };
+
+  enum SectionStatus {
+    // Processed correctly.
+    kDone = 0,
+    // Skipped because other required sections were not yet processed.
+    kSkipped = 1,
+    // Skipped because the section was already processed.
+    kDuplicate = 2,
+    // Only partially decoded: the section will need to be processed again.
+    kPartial = 3,
+  };
+
+  // Processes `num` sections; each SectionInfo contains the index
+  // of the section and a BitReader that only contains the data of the section.
+  // `section_status` should point to `num` elements, and will be filled with
+  // information about whether each section was processed or not.
+  // A section is a part of the encoded file that is indexed by the TOC.
+  Status ProcessSections(const SectionInfo* sections, size_t num,
+                         SectionStatus* section_status);
+
+  // Flushes all the data decoded so far to pixels.
+  Status Flush();
+
+  // Runs final operations once a frame data is decoded.
+  // Must be called exactly once per frame, after all calls to ProcessSections.
+  Status FinalizeFrame();
+
+  // Returns dependencies of this frame on reference ids as a bit mask: bits 0-3
+  // indicate reference frame 0-3 for patches and blending, bits 4-7 indicate DC
+  // frames this frame depends on. Only returns a valid result after all calls
+  // to ProcessSections are finished and before FinalizeFrame.
+  int References() const;
+
+  // Returns reference id of storage location where this frame is stored as a
+  // bit flag, or 0 if not stored.
+  // Matches the bit mask used for GetReferences: bits 0-3 indicate it is stored
+  // for patching or blending, bits 4-7 indicate DC frame.
+  // Unlike References, can be run at any time as
+  // soon as the frame header is known.
+  static int SavedAs(const FrameHeader& header);
+
+  uint64_t SumSectionSizes() const { return section_sizes_sum_; }
+  const std::vector<TocEntry>& Toc() const { return toc_; }
+
+  const FrameHeader& GetFrameHeader() const { return frame_header_; }
+
+  // Returns whether a DC image has been decoded, accessible at low resolution
+  // at passes.shared_storage.dc_storage
+  bool HasDecodedDC() const { return finalized_dc_; }
+  bool HasDecodedAll() const { return toc_.size() == num_sections_done_; }
+
+  size_t NumCompletePasses() const {
+    const auto& p = decoded_passes_per_ac_group_;  // may still be empty
+    return p.empty() ? 0 : *std::min_element(p.begin(), p.end());
+  }
+
+  // If enabled, ProcessSections will stop and return true when the DC
+  // sections have been processed, instead of starting the AC sections. This
+  // will only occur if supported (that is, flushing will produce a valid
+  // 1/8th*1/8th resolution image). The return value of true then does not mean
+  // all sections have been processed, use HasDecodedDC and HasDecodedAll
+  // to check the true finished state.
+  // Returns the progressive detail that will be effective for the frame.
+  JxlProgressiveDetail SetPauseAtProgressive(JxlProgressiveDetail prog_detail) {
+    bool single_section =
+        frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1;
+    if (frame_header_.frame_type != kSkipProgressive &&
+        // If there's only one group and one pass, there is no separate section
+        // for DC and the entire full resolution image is available at once.
+        !single_section &&
+        // If extra channels are encoded with modular without squeeze, they
+        // don't support DC. If they are encoded with squeeze, DC works in
+        // theory but the implementation may not yet correctly support this for Flush.
+        // Therefore, can't correctly pause for a progressive step if there is
+        // an extra channel (including alpha channel)
+        // TODO(firsching): Check if this is still the case.
+        decoded_->metadata()->extra_channel_info.empty() &&
+        // DC is not guaranteed to be available in modular mode and may be a
+        // black image. If squeeze is used, it may be available depending on the
+        // current implementation.
+        // TODO(lode): do return DC if it's known that flushing at this point
+        // will produce a valid 1/8th downscaled image with modular encoding.
+        frame_header_.encoding == FrameEncoding::kVarDCT) {
+      progressive_detail_ = prog_detail;
+    } else {
+      progressive_detail_ = JxlProgressiveDetail::kFrames;
+    }
+    if (progressive_detail_ >= JxlProgressiveDetail::kPasses) {
+      for (size_t i = 1; i < frame_header_.passes.num_passes; ++i) {
+        passes_to_pause_.push_back(i);
+      }
+    } else if (progressive_detail_ >= JxlProgressiveDetail::kLastPasses) {
+      for (size_t i = 0; i < frame_header_.passes.num_downsample; ++i) {
+        passes_to_pause_.push_back(frame_header_.passes.last_pass[i] + 1);
+      }
+      // The format does not guarantee that these values are sorted.
+      std::sort(passes_to_pause_.begin(), passes_to_pause_.end());
+    }
+    return progressive_detail_;
+  }
+
+  size_t NextNumPassesToPause() const {
+    auto it = std::upper_bound(passes_to_pause_.begin(), passes_to_pause_.end(),
+                               NumCompletePasses());
+    return (it != passes_to_pause_.end() ? *it
+                                         : std::numeric_limits<size_t>::max());
+  }
+
+  // Sets the pixel callback or image buffer where the pixels will be decoded.
+  //
+  // @param undo_orientation: if true, indicates the frame decoder should apply
+  // the exif orientation to bring the image to the intended display
+  // orientation.
+  void SetImageOutput(const PixelCallback& pixel_callback, void* image_buffer,
+                      size_t image_buffer_size, size_t xsize, size_t ysize,
+                      JxlPixelFormat format, size_t bits_per_sample,
+                      bool unpremul_alpha, bool undo_orientation) const {
+    dec_state_->width = xsize;
+    dec_state_->height = ysize;
+    dec_state_->main_output.format = format;
+    dec_state_->main_output.bits_per_sample = bits_per_sample;
+    dec_state_->main_output.callback = pixel_callback;
+    dec_state_->main_output.buffer = image_buffer;
+    dec_state_->main_output.buffer_size = image_buffer_size;
+    dec_state_->main_output.stride = GetStride(xsize, format);
+    const jxl::ExtraChannelInfo* alpha =
+        decoded_->metadata()->Find(jxl::ExtraChannel::kAlpha);
+    if (alpha && alpha->alpha_associated && unpremul_alpha) {
+      dec_state_->unpremul_alpha = true;
+    }
+    if (undo_orientation) {
+      dec_state_->undo_orientation = decoded_->metadata()->GetOrientation();
+      if (static_cast<int>(dec_state_->undo_orientation) > 4) {
+        std::swap(dec_state_->width, dec_state_->height);
+      }
+    }
+    dec_state_->extra_output.clear();
+#if !JXL_HIGH_PRECISION
+    if (dec_state_->main_output.buffer &&
+        (format.data_type == JXL_TYPE_UINT8) && (format.num_channels >= 3) &&
+        !dec_state_->unpremul_alpha &&
+        (dec_state_->undo_orientation == Orientation::kIdentity) &&
+        decoded_->metadata()->xyb_encoded &&
+        dec_state_->output_encoding_info.color_encoding.IsSRGB() &&
+        dec_state_->output_encoding_info.all_default_opsin &&
+        (dec_state_->output_encoding_info.desired_intensity_target ==
+         dec_state_->output_encoding_info.orig_intensity_target) &&
+        HasFastXYBTosRGB8() && frame_header_.needs_color_transform()) {
+      dec_state_->fast_xyb_srgb8_conversion = true;
+    }
+#endif
+  }
+
+  void AddExtraChannelOutput(void* buffer, size_t buffer_size, size_t xsize,
+                             JxlPixelFormat format, size_t bits_per_sample) {
+    ImageOutput out;
+    out.format = format;
+    out.bits_per_sample = bits_per_sample;
+    out.buffer = buffer;
+    out.buffer_size = buffer_size;
+    out.stride = GetStride(xsize, format);
+    dec_state_->extra_output.push_back(out);
+  }
+
+ private:
+  Status ProcessDCGlobal(BitReader* br);
+  Status ProcessDCGroup(size_t dc_group_id, BitReader* br);
+  void FinalizeDC();
+  Status AllocateOutput();
+  Status ProcessACGlobal(BitReader* br);
+  Status ProcessACGroup(size_t ac_group_id, BitReader* JXL_RESTRICT* br,
+                        size_t num_passes, size_t thread, bool force_draw,
+                        bool dc_only);
+  void MarkSections(const SectionInfo* sections, size_t num,
+                    SectionStatus* section_status);
+
+  // Allocates storage for parallel decoding using up to `num_threads` threads
+  // of up to `num_tasks` tasks. The value of `thread` passed to
+  // `GetStorageLocation` must be smaller than the `num_threads` value passed
+  // here. The value of `task` passed to `GetStorageLocation` must be smaller
+  // than the value of `num_tasks` passed here.
+  Status PrepareStorage(size_t num_threads, size_t num_tasks) {
+    size_t storage_size = std::min(num_threads, num_tasks);
+    if (storage_size > group_dec_caches_.size()) {
+      group_dec_caches_.resize(storage_size);
+    }
+    use_task_id_ = num_threads > num_tasks;
+    bool use_group_ids = (modular_frame_decoder_.UsesFullImage() &&
+                          (frame_header_.encoding == FrameEncoding::kVarDCT ||
+                           (frame_header_.flags & FrameHeader::kNoise)));
+    if (dec_state_->render_pipeline) {
+      JXL_RETURN_IF_ERROR(dec_state_->render_pipeline->PrepareForThreads(
+          storage_size, use_group_ids));
+    }
+    return true;
+  }
+
+  size_t GetStorageLocation(size_t thread, size_t task) {
+    if (use_task_id_) return task;
+    return thread;
+  }
+
+  static size_t BytesPerChannel(JxlDataType data_type) {
+    return (data_type == JXL_TYPE_UINT8   ? 1u
+            : data_type == JXL_TYPE_FLOAT ? 4u
+                                          : 2u);
+  }
+
+  static size_t GetStride(const size_t xsize, JxlPixelFormat format) {
+    size_t stride =
+        (xsize * BytesPerChannel(format.data_type) * format.num_channels);
+    if (format.align > 1) {
+      stride = (jxl::DivCeil(stride, format.align) * format.align);
+    }
+    return stride;
+  }
+
+  PassesDecoderState* dec_state_;
+  ThreadPool* pool_;
+  std::vector<TocEntry> toc_;
+  uint64_t section_sizes_sum_ = 0;
+  // TODO(veluca): figure out the duplication between these and dec_state_.
+  FrameHeader frame_header_;
+  FrameDimensions frame_dim_;
+  ImageBundle* decoded_ = nullptr;
+  ModularFrameDecoder modular_frame_decoder_;
+  bool render_spotcolors_ = true;
+  bool coalescing_ = true;
+
+  std::vector<uint8_t> processed_section_;
+  std::vector<uint8_t> decoded_passes_per_ac_group_;
+  std::vector<uint8_t> decoded_dc_groups_;
+  bool decoded_dc_global_ = false;
+  bool decoded_ac_global_ = false;
+  bool HasEverything() const;
+  bool finalized_dc_ = true;  // NOTE(review): presumably reset by InitFrame
+  size_t num_sections_done_ = 0;
+  bool is_finalized_ = true;  // NOTE(review): presumably reset by InitFrame
+  bool allocated_ = false;
+
+  std::vector<GroupDecCache> group_dec_caches_;
+
+  // Whether or not the task id should be used for storage indexing, instead of
+  // the thread id.
+  bool use_task_id_ = false;
+
+  // Testing setting: whether or not to use the slow rendering pipeline.
+  bool use_slow_rendering_pipeline_;
+
+  JxlProgressiveDetail progressive_detail_ = kFrames;
+  // Number of completed passes where section decoding should pause.
+  // Used for progressive details at least kLastPasses.
+  std::vector<int> passes_to_pause_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_FRAME_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group.cc b/third_party/jpeg-xl/lib/jxl/dec_group.cc
new file mode 100644
index 0000000000..be8df9b062
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group.cc
@@ -0,0 +1,801 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_group.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "lib/jxl/frame_header.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer-inl.h"
+#include "lib/jxl/quantizer.h"
+
+#ifndef LIB_JXL_DEC_GROUP_CC
+#define LIB_JXL_DEC_GROUP_CC
+namespace jxl {
+
+struct AuxOut;
+
+// Interface through which DecodeGroupImpl obtains each block's coefficients.
+class GetBlock {
+ public:
+  virtual void StartRow(size_t by) = 0;  // called before each block row `by`
+  virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs,
+                           size_t size, size_t log2_covered_blocks,
+                           ACPtr block[3], ACType ac_type) = 0;
+  virtual ~GetBlock() {}
+};
+
+// Controls whether DecodeGroupImpl renders to pixels or not.
+enum DrawMode {
+  // Render to pixels.
+  kDraw = 0,
+  // Blocks are still loaded via GetBlock, but nothing is rendered to pixels.
+  kDontDraw = 1,
+};
+
+}  // namespace jxl
+#endif  // LIB_JXL_DEC_GROUP_CC
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftRight;
+
+using D = HWY_FULL(float);      // tag type for float vectors
+using DU = HWY_FULL(uint32_t);  // tag type for uint32_t vectors
+using DI = HWY_FULL(int32_t);   // tag type for int32_t vectors
+using DI16 = Rebind<int16_t, DI>;  // int16_t tag rebound from DI
+constexpr D d;  // tag instances passed to the Highway ops in this file
+constexpr DI di;
+constexpr DI16 di16;
+
+// Transposes a row-major 8x8 block in place. TODO(veluca): consider SIMDfying.
+void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
+  for (size_t row = 0; row + 1 < 8; ++row) {
+    for (size_t col = row + 1; col < 8; ++col) {
+      std::swap(block[row * 8 + col], block[col * 8 + row]);
+    }
+  }
+}
+
+template <ACType ac_type>
+void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
+                 Vec<D> scaled_dequant_b,
+                 const float* JXL_RESTRICT dequant_matrices, size_t size,
+                 size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
+                 const float* JXL_RESTRICT biases, ACPtr qblock[3],
+                 float* JXL_RESTRICT block) {
+  // Load one vector of quantized X/Y/B coefficients starting at index `k`,
+  // promoting 16-bit storage to 32-bit lanes when needed.
+  Vec<DI> qx;
+  Vec<DI> qy;
+  Vec<DI> qb;
+  if (ac_type != ACType::k16) {
+    qx = Load(di, qblock[0].ptr32 + k);
+    qy = Load(di, qblock[1].ptr32 + k);
+    qb = Load(di, qblock[2].ptr32 + k);
+  } else {
+    Rebind<int16_t, DI> narrow;
+    qx = PromoteTo(di, Load(narrow, qblock[0].ptr16 + k));
+    qy = PromoteTo(di, Load(narrow, qblock[1].ptr16 + k));
+    qb = PromoteTo(di, Load(narrow, qblock[2].ptr16 + k));
+  }
+
+  // Per-coefficient multipliers; channel c's weights start at c * size.
+  const float* weights = dequant_matrices + k;
+  const auto mul_x = Mul(Load(d, weights), scaled_dequant_x);
+  const auto mul_y = Mul(Load(d, weights + size), scaled_dequant_y);
+  const auto mul_b = Mul(Load(d, weights + 2 * size), scaled_dequant_b);
+
+  // Bias-adjust each channel, then scale it by its multiplier.
+  const auto out_y = Mul(AdjustQuantBias(di, 1, qy, biases), mul_y);
+  const auto part_x = Mul(AdjustQuantBias(di, 0, qx, biases), mul_x);
+  const auto part_b = Mul(AdjustQuantBias(di, 2, qb, biases), mul_b);
+
+  // X and B additionally receive Y times x_cc_mul / b_cc_mul.
+  float* out = block + k;
+  Store(MulAdd(x_cc_mul, out_y, part_x), d, out);
+  Store(out_y, d, out + size);
+  Store(MulAdd(b_cc_mul, out_y, part_b), d, out + 2 * size);
+}
+
+template <ACType ac_type>
+void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
+                  float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul,
+                  Vec<D> b_cc_mul, size_t kind, size_t size,
+                  const Quantizer& quantizer, size_t covered_blocks,
+                  const size_t* sbx,
+                  const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
+                  size_t dc_stride, const float* JXL_RESTRICT biases,
+                  ACPtr qblock[3], float* JXL_RESTRICT block) {
+  PROFILER_FUNC;
+  // Shared scale for the block; X and B also take their dm multiplier.
+  const float base_scale = inv_global_scale / quant;
+  const auto scale_y = Set(d, base_scale);
+  const auto scale_x = Set(d, base_scale * x_dm_multiplier);
+  const auto scale_b = Set(d, base_scale * b_dm_multiplier);
+
+  // Dequantize the coefficients one vector at a time.
+  const float* matrices = quantizer.DequantMatrix(kind, 0);
+  const size_t num_coeffs = covered_blocks * kDCTBlockSize;
+  for (size_t pos = 0; pos < num_coeffs; pos += Lanes(d)) {
+    DequantLane<ac_type>(scale_x, scale_y, scale_b, matrices, size, pos,
+                         x_cc_mul, b_cc_mul, biases, qblock, block);
+  }
+  // Derive each channel's lowest frequencies from its DC values.
+  for (size_t ch = 0; ch < 3; ++ch) {
+    LowestFrequenciesFromDC(acs.Strategy(), dc_row[ch] + sbx[ch], dc_stride,
+                            block + ch * size);
+  }
+}
+
+Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
+                       GroupDecCache* JXL_RESTRICT group_dec_cache,
+                       PassesDecoderState* JXL_RESTRICT dec_state,
+                       size_t thread, size_t group_idx,
+                       RenderPipelineInput& render_pipeline_input,
+                       ImageBundle* decoded, DrawMode draw) {
+  // Decodes one group's varblocks. TODO(veluca): investigate cache usage here.
+  PROFILER_FUNC;
+  const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx);
+  const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;
+
+  const size_t xsize_blocks = block_rect.xsize();
+  const size_t ysize_blocks = block_rect.ysize();
+
+  const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();
+
+  const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
+
+  const YCbCrChromaSubsampling& cs =
+      dec_state->shared->frame_header.chroma_subsampling;
+
+  size_t idct_stride[3];  // PixelsPerRow() of each channel's output buffer
+  for (size_t c = 0; c < 3; c++) {
+    idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow();
+  }
+
+  HWY_ALIGN int32_t scaled_qtable[64 * 3];  // only used on the JPEG path
+
+  ACType ac_type = dec_state->coefficients->Type();
+  auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16>
+                                              : DequantBlock<ACType::k32>;
+  // Whether or not coefficients should be stored for future usage, and/or read
+  // from past usage.
+  bool accumulate = !dec_state->coefficients->IsEmpty();
+  // Offset of the current block in the group.
+  size_t offset = 0;
+
+  std::array<int, 3> jpeg_c_map;
+  bool jpeg_is_gray = false;
+  std::array<int, 3> dcoff = {};
+
+  // TODO(veluca): all of this should be done only once per image.
+  if (decoded->IsJPEG()) {
+    if (!dec_state->shared->cmap.IsJPEGCompatible()) {
+      return JXL_FAILURE("The CfL map is not JPEG-compatible");
+    }
+    jpeg_is_gray = (decoded->jpeg_data->components.size() == 1);
+    jpeg_c_map = JpegOrder(dec_state->shared->frame_header.color_transform,
+                           jpeg_is_gray);
+    const std::vector<QuantEncoding>& qe =
+        dec_state->shared->matrices.encodings();
+    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
+        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
+      return JXL_FAILURE(
+          "Quantization table is not a JPEG quantization table.");
+    }
+    for (size_t c = 0; c < 3; c++) {
+      if (dec_state->shared->frame_header.color_transform ==
+          ColorTransform::kNone) {
+        dcoff[c] = 1024 / (*qe[0].qraw.qtable)[64 * c];
+      }
+      for (size_t i = 0; i < 64; i++) {
+        // Transpose the matrix, as it will be used on the transposed block.
+        int n = qe[0].qraw.qtable->at(64 + i);
+        int d = qe[0].qraw.qtable->at(64 * c + i);
+        if (n <= 0 || d <= 0 || n >= 65536 || d >= 65536) {
+          return JXL_FAILURE("Invalid JPEG quantization table");
+        }
+        scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] =
+            (1 << kCFLFixedPointPrecision) * n / d;
+      }
+    }
+  }
+
+  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
+  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
+  Rect r[3];  // block_rect shifted by each channel's subsampling shifts
+  for (size_t i = 0; i < 3; i++) {
+    r[i] =
+        Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i],
+             block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]);
+    if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(),
+                        dec_state->shared->dc->Plane(i).ysize()})) {
+      return JXL_FAILURE("Frame dimensions are too big for the image.");
+    }
+  }
+
+  for (size_t by = 0; by < ysize_blocks; ++by) {
+    get_block->StartRow(by);
+    size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]};
+
+    const int32_t* JXL_RESTRICT row_quant =
+        block_rect.ConstRow(dec_state->shared->raw_quant_field, by);
+
+    const float* JXL_RESTRICT dc_rows[3] = {
+        r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]),
+        r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]),
+        r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]),
+    };
+
+    const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks;
+    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
+
+    const int8_t* JXL_RESTRICT row_cmap[3] = {
+        dec_state->shared->cmap.ytox_map.ConstRow(ty),
+        nullptr,
+        dec_state->shared->cmap.ytob_map.ConstRow(ty),
+    };
+
+    float* JXL_RESTRICT idct_row[3];
+    int16_t* JXL_RESTRICT jpeg_row[3];
+    for (size_t c = 0; c < 3; c++) {
+      idct_row[c] = render_pipeline_input.GetBuffer(c).second.Row(
+          render_pipeline_input.GetBuffer(c).first, sby[c] * kBlockDim);
+      if (decoded->IsJPEG()) {
+        auto& component = decoded->jpeg_data->components[jpeg_c_map[c]];
+        jpeg_row[c] =
+            component.coeffs.data() +
+            (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) *
+                kDCTBlockSize;
+      }
+    }
+
+    size_t bx = 0;
+    for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
+         tx++) {
+      size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks;
+      auto x_cc_mul =
+          Set(d, dec_state->shared->cmap.YtoXRatio(row_cmap[0][abs_tx]));
+      auto b_cc_mul =
+          Set(d, dec_state->shared->cmap.YtoBRatio(row_cmap[2][abs_tx]));
+      // Increment bx by llf_x because those iterations would otherwise
+      // immediately continue (!IsFirstBlock). Reduces mispredictions.
+      for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) {
+        size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]};
+        AcStrategy acs = acs_row[bx];
+        const size_t llf_x = acs.covered_blocks_x();
+
+        // Can only happen in the second or lower rows of a varblock.
+        if (JXL_UNLIKELY(!acs.IsFirstBlock())) {
+          bx += llf_x;
+          continue;
+        }
+        PROFILER_ZONE("DecodeGroupImpl inner");
+        const size_t log2_covered_blocks = acs.log2_covered_blocks();
+
+        const size_t covered_blocks = 1 << log2_covered_blocks;
+        const size_t size = covered_blocks * kDCTBlockSize;
+
+        ACPtr qblock[3];
+        if (accumulate) {
+          for (size_t c = 0; c < 3; c++) {
+            qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset);
+          }
+        } else {
+          // No point in reading from bitstream without accumulating and not
+          // drawing.
+          JXL_ASSERT(draw == kDraw);
+          if (ac_type == ACType::k16) {
+            memset(group_dec_cache->dec_group_qblock16, 0,
+                   size * 3 * sizeof(int16_t));
+            for (size_t c = 0; c < 3; c++) {
+              qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size;
+            }
+          } else {
+            memset(group_dec_cache->dec_group_qblock, 0,
+                   size * 3 * sizeof(int32_t));
+            for (size_t c = 0; c < 3; c++) {
+              qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size;
+            }
+          }
+        }
+        JXL_RETURN_IF_ERROR(get_block->LoadBlock(
+            bx, by, acs, size, log2_covered_blocks, qblock, ac_type));
+        offset += size;
+        if (draw == kDontDraw) {
+          bx += llf_x;
+          continue;
+        }
+
+        if (JXL_UNLIKELY(decoded->IsJPEG())) {
+          if (acs.Strategy() != AcStrategy::Type::DCT) {
+            return JXL_FAILURE(
+                "Can only decode to JPEG if only DCT-8 is used.");
+          }
+
+          HWY_ALIGN int32_t transposed_dct_y[64];
+          for (size_t c : {1, 0, 2}) {  // Y first; X/B reuse the saved Y block
+            // Propagate only Y for grayscale.
+            if (jpeg_is_gray && c != 1) {
+              continue;
+            }
+            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
+              continue;
+            }
+            int16_t* JXL_RESTRICT jpeg_pos =
+                jpeg_row[c] + sbx[c] * kDCTBlockSize;
+            // JPEG XL is transposed, JPEG is not (assumes k32 - TODO confirm).
+            auto transposed_dct = qblock[c].ptr32;
+            Transpose8x8InPlace(transposed_dct);
+            // No CfL - no need to store the y block converted to integers.
+            if (!cs.Is444() ||
+                (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) {
+              for (size_t i = 0; i < 64; i += Lanes(d)) {
+                const auto ini = Load(di, transposed_dct + i);
+                const auto ini16 = DemoteTo(di16, ini);
+                StoreU(ini16, di16, jpeg_pos + i);
+              }
+            } else if (c == 1) {
+              // Y channel: save for restoring X/B, but nothing else to do.
+              for (size_t i = 0; i < 64; i += Lanes(d)) {
+                const auto ini = Load(di, transposed_dct + i);
+                Store(ini, di, transposed_dct_y + i);
+                const auto ini16 = DemoteTo(di16, ini);
+                StoreU(ini16, di16, jpeg_pos + i);
+              }
+            } else {
+              // transposed_dct_y contains the y channel block, transposed.
+              const auto scale = Set(
+                  di, dec_state->shared->cmap.RatioJPEG(row_cmap[c][abs_tx]));
+              const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1));
+              for (int i = 0; i < 64; i += Lanes(d)) {
+                auto in = Load(di, transposed_dct + i);
+                auto in_y = Load(di, transposed_dct_y + i);
+                auto qt = Load(di, scaled_qtable + c * size + i);
+                auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>(
+                    Add(Mul(qt, scale), round));
+                auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>(
+                    Add(Mul(in_y, coeff_scale), round));
+                StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i);
+              }
+            }
+            jpeg_pos[0] =
+                Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047);
+          }
+        } else {
+          HWY_ALIGN float* const block = group_dec_cache->dec_group_block;
+          // Dequantize and add predictions.
+          dequant_block(
+              acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
+              dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(),
+              size, dec_state->shared->quantizer,
+              acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
+              dc_stride,
+              dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
+              block);
+
+          for (size_t c : {1, 0, 2}) {
+            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
+              continue;
+            }
+            // IDCT
+            float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim;
+            TransformToPixels(acs.Strategy(), block + c * size, idct_pos,
+                              idct_stride[c], group_dec_cache->scratch_space);
+          }
+        }
+        bx += llf_x;
+      }
+    }
+  }
+  if (draw == kDontDraw) {  // NOTE(review): redundant, both paths return true
+    return true;
+  }
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+// Decodes the quantized AC coefficients of one varblock of channel `c` and
+// accumulates them into `block`. LLF components in `block` are not modified.
+template <ACType ac_type>
+Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
+                        int32_t* JXL_RESTRICT row_nzeros,
+                        const int32_t* JXL_RESTRICT row_nzeros_top,
+                        size_t nzeros_stride, size_t c, size_t bx, size_t by,
+                        size_t lbx, AcStrategy acs,
+                        const coeff_order_t* JXL_RESTRICT coeff_order,
+                        BitReader* JXL_RESTRICT br,
+                        ANSSymbolReader* JXL_RESTRICT decoder,
+                        const std::vector<uint8_t>& context_map,
+                        const uint8_t* qdc_row, const int32_t* qf_row,
+                        const BlockCtxMap& block_ctx_map, ACPtr block,
+                        size_t shift = 0) {
+  PROFILER_FUNC;
+  // Equal to number of LLF coefficients.
+  const size_t covered_blocks = 1 << log2_covered_blocks;
+  const size_t size = covered_blocks * kDCTBlockSize;
+  int32_t predicted_nzeros =
+      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
+  // Pick the coefficient order for this strategy and channel.
+  size_t ord = kStrategyOrder[acs.RawStrategy()];
+  const coeff_order_t* JXL_RESTRICT order =
+      &coeff_order[CoeffOrderOffset(ord, c)];
+
+  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
+  const int32_t nzero_ctx =
+      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
+  // Read the non-zero count; it cannot exceed the number of non-LLF positions.
+  size_t nzeros = decoder->ReadHybridUint(nzero_ctx, br, context_map);
+  if (nzeros + covered_blocks > size) {
+    return JXL_FAILURE("Invalid AC: nzeros too large");
+  }
+  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+      row_nzeros[bx + x + y * nzeros_stride] =
+          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
+    }  // stores ceil(nzeros / covered_blocks) for each covered block
+  }
+  // Base of the context indices used for the coefficient reads below.
+  const size_t histo_offset =
+      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
+
+  // Skip LLF
+  {
+    PROFILER_ZONE("AcDecSkipLLF, reader");
+    size_t prev = (nzeros > size / 16 ? 0 : 1);
+    for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
+      const size_t ctx =
+          histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
+                                            log2_covered_blocks, prev);
+      const size_t u_coeff = decoder->ReadHybridUint(ctx, br, context_map);
+      // Hand-rolled version of UnpackSigned, shifting before the conversion to
+      // signed integer to avoid undefined behavior of shifting negative
+      // numbers.
+      const size_t magnitude = u_coeff >> 1;
+      const size_t neg_sign = (~u_coeff) & 1;
+      const intptr_t coeff =
+          static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
+      if (ac_type == ACType::k16) {
+        block.ptr16[order[k]] += coeff;
+      } else {
+        block.ptr32[order[k]] += coeff;
+      }
+      prev = static_cast<size_t>(u_coeff != 0);
+      nzeros -= prev;  // one fewer non-zero coefficient left to read
+    }
+    if (JXL_UNLIKELY(nzeros != 0)) {
+      return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS
+                         "), channel %" PRIuS,
+                         bx, by, c);
+    }
+  }
+  return true;
+}
+
+// Structs used by DecodeGroupImpl to get a quantized block.
+// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row
+// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient
+// image provided by the encoder.
+
+struct GetBlockFromBitstream : public GetBlock {
+  void StartRow(size_t by) override {  // caches row pointers for row `by`
+    qf_row = rect.ConstRow(*qf, by);
+    quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0();
+    for (size_t c = 0; c < 3; c++) {  // only the nzeros rows depend on c
+      size_t sby = by >> vshift[c];
+      for (size_t i = 0; i < num_passes; i++) {
+        row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby);
+        row_nzeros_top[i][c] =
+            sby == 0
+                ? nullptr
+                : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1);
+      }
+    }
+  }
+  // Decodes block (bx, by) for every pass; channels whose subsampled position
+  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
+                   size_t log2_covered_blocks, ACPtr block[3],
+                   ACType ac_type) override {  // does not map back are skipped.
+    auto decode_ac_varblock = ac_type == ACType::k16
+                                  ? DecodeACVarBlock<ACType::k16>
+                                  : DecodeACVarBlock<ACType::k32>;
+    for (size_t c : {1, 0, 2}) {
+      size_t sbx = bx >> hshift[c];
+      size_t sby = by >> vshift[c];
+      if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) {
+        continue;
+      }
+
+      for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) {
+        JXL_RETURN_IF_ERROR(decode_ac_varblock(
+            ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c],
+            row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs,
+            &coeff_orders[pass * coeff_order_size], readers[pass],
+            &decoders[pass], context_map[pass], quant_dc_row, qf_row,
+            *block_ctx_map, block[c], shift_for_pass[pass]));
+      }
+    }
+    return true;
+  }
+  // Binds decoder state and per-pass ANS readers; fails on a bad selector.
+  Status Init(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes,
+              size_t group_idx, size_t histo_selector_bits, const Rect& rect,
+              GroupDecCache* JXL_RESTRICT group_dec_cache,
+              PassesDecoderState* dec_state, size_t first_pass) {
+    for (size_t i = 0; i < 3; i++) {
+      hshift[i] = dec_state->shared->frame_header.chroma_subsampling.HShift(i);
+      vshift[i] = dec_state->shared->frame_header.chroma_subsampling.VShift(i);
+    }
+    this->coeff_order_size = dec_state->shared->coeff_order_size;
+    this->coeff_orders =
+        dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size;
+    this->context_map = dec_state->context_map.data() + first_pass;
+    this->readers = readers;
+    this->num_passes = num_passes;
+    this->shift_for_pass =
+        dec_state->shared->frame_header.passes.shift + first_pass;
+    this->group_dec_cache = group_dec_cache;
+    this->rect = rect;
+    block_ctx_map = &dec_state->shared->block_ctx_map;
+    qf = &dec_state->shared->raw_quant_field;
+    quant_dc = &dec_state->shared->quant_dc;
+
+    for (size_t pass = 0; pass < num_passes; pass++) {
+      // Select which histogram set to use among those of the current pass.
+      size_t cur_histogram = 0;
+      if (histo_selector_bits != 0) {
+        cur_histogram = readers[pass]->ReadBits(histo_selector_bits);
+      }
+      if (cur_histogram >= dec_state->shared->num_histograms) {
+        return JXL_FAILURE("Invalid histogram selector");
+      }
+      ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts();
+
+      decoders[pass] =
+          ANSSymbolReader(&dec_state->code[pass + first_pass], readers[pass]);
+    }
+    nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow();
+    for (size_t i = 0; i < num_passes; i++) {
+      JXL_ASSERT(
+          nzeros_stride ==
+          static_cast<size_t>(group_dec_cache->num_nzeroes[i].PixelsPerRow()));
+    }
+    return true;
+  }
+  // Filled in by Init(); the *_row pointers are refreshed by StartRow().
+  const uint32_t* shift_for_pass = nullptr;  // not owned
+  const coeff_order_t* JXL_RESTRICT coeff_orders;
+  size_t coeff_order_size;
+  const std::vector<uint8_t>* JXL_RESTRICT context_map;
+  ANSSymbolReader decoders[kMaxNumPasses];
+  BitReader* JXL_RESTRICT* JXL_RESTRICT readers;
+  size_t num_passes;
+  size_t ctx_offset[kMaxNumPasses];
+  size_t nzeros_stride;
+  int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3];
+  const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3];
+  GroupDecCache* JXL_RESTRICT group_dec_cache;
+  const BlockCtxMap* block_ctx_map;
+  const ImageI* qf;
+  const ImageB* quant_dc;
+  const int32_t* qf_row;
+  const uint8_t* quant_dc_row;
+  Rect rect;
+  size_t hshift[3], vshift[3];
+};
+// GetBlock that accumulates coefficients from encoder-side ACImages (k32 only).
+struct GetBlockFromEncoder : public GetBlock {
+  void StartRow(size_t by) override {}  // no per-row state to refresh
+  // Adds `size` coefficients per channel, from every pass, at running `offset`.
+  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
+                   size_t log2_covered_blocks, ACPtr block[3],
+                   ACType ac_type) override {
+    JXL_DASSERT(ac_type == ACType::k32);
+    for (size_t c = 0; c < 3; c++) {
+      // for each pass
+      for (size_t i = 0; i < quantized_ac->size(); i++) {
+        for (size_t k = 0; k < size; k++) {
+          // TODO(veluca): SIMD.
+          block[c].ptr32[k] +=
+              rows[i][c][offset + k] * (1 << shift_for_pass[i]);
+        }
+      }
+    }
+    offset += size;
+    return true;
+  }
+  // Caches the k32 row pointers of group `group_idx` for each pass and channel.
+  GetBlockFromEncoder(const std::vector<std::unique_ptr<ACImage>>& ac,
+                      size_t group_idx, const uint32_t* shift_for_pass)
+      : quantized_ac(&ac), shift_for_pass(shift_for_pass) {
+    // TODO(veluca): not supported with chroma subsampling.
+    for (size_t i = 0; i < quantized_ac->size(); i++) {
+      JXL_CHECK((*quantized_ac)[i]->Type() == ACType::k32);
+      for (size_t c = 0; c < 3; c++) {
+        rows[i][c] = (*quantized_ac)[i]->PlaneRow(c, group_idx, 0).ptr32;
+      }
+    }
+  }
+
+  const std::vector<std::unique_ptr<ACImage>>* JXL_RESTRICT quantized_ac;
+  size_t offset = 0;  // coefficients consumed so far; advanced by LoadBlock
+  const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3];
+  const uint32_t* shift_for_pass = nullptr;  // not owned
+};
+
+HWY_EXPORT(DecodeGroupImpl);
+
+}  // namespace
+// Decodes passes [first_pass, first_pass + num_passes) of group `group_idx`.
+Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
+                   size_t num_passes, size_t group_idx,
+                   PassesDecoderState* JXL_RESTRICT dec_state,
+                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
+                   RenderPipelineInput& render_pipeline_input,
+                   ImageBundle* JXL_RESTRICT decoded, size_t first_pass,
+                   bool force_draw, bool dc_only, bool* should_run_pipeline) {
+  PROFILER_FUNC;
+  // Draw only once the final pass is included, or when explicitly forced.
+  DrawMode draw = (num_passes + first_pass ==
+                   dec_state->shared->frame_header.passes.num_passes) ||
+                          force_draw
+                      ? kDraw
+                      : kDontDraw;
+
+  if (should_run_pipeline) {
+    *should_run_pipeline = draw != kDontDraw;
+  }
+  // No AC passes at all: fill the pipeline input by upsampling the DC image.
+  if (draw == kDraw && num_passes == 0 && first_pass == 0) {
+    group_dec_cache->InitDCBufferOnce();
+    const YCbCrChromaSubsampling& cs =
+        dec_state->shared->frame_header.chroma_subsampling;
+    for (size_t c : {0, 1, 2}) {
+      size_t hs = cs.HShift(c);
+      size_t vs = cs.VShift(c);
+      // NOTE(review): says filter_input_storage, but dc_buffer is used below.
+      const Rect src_rect_precs = dec_state->shared->BlockGroupRect(group_idx);
+      const Rect src_rect =
+          Rect(src_rect_precs.x0() >> hs, src_rect_precs.y0() >> vs,
+               src_rect_precs.xsize() >> hs, src_rect_precs.ysize() >> vs);
+      const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(),
+                           src_rect.ysize());
+      CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2,
+                             copy_rect, &group_dec_cache->dc_buffer);
+      // Mirrorpad. Interleaving left and right padding ensures that padding
+      // works out correctly even for images with DC size of 1.
+      for (size_t y = 0; y < src_rect.ysize() + 4; y++) {
+        size_t xend = kRenderPipelineXOffset +
+                      (dec_state->shared->dc->Plane(c).xsize() >> hs) -
+                      src_rect.x0();
+        for (size_t ix = 0; ix < 2; ix++) {
+          if (src_rect.x0() == 0) {
+            group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset - ix - 1] =
+                group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset + ix];
+          }
+          if (src_rect.x0() + src_rect.xsize() + 2 >=
+              (dec_state->shared->dc->xsize() >> hs)) {
+            group_dec_cache->dc_buffer.Row(y)[xend + ix] =
+                group_dec_cache->dc_buffer.Row(y)[xend - ix - 1];
+          }
+        }
+      }
+      Rect dst_rect = render_pipeline_input.GetBuffer(c).second;
+      ImageF* upsampling_dst = render_pipeline_input.GetBuffer(c).first;
+      JXL_ASSERT(dst_rect.IsInside(*upsampling_dst));
+
+      RenderPipelineStage::RowInfo input_rows(1, std::vector<float*>(5));
+      RenderPipelineStage::RowInfo output_rows(1, std::vector<float*>(8));
+      for (size_t y = src_rect.y0(); y < src_rect.y0() + src_rect.ysize();
+           y++) {
+        for (ssize_t iy = 0; iy < 5; iy++) {  // rows y-2 .. y+2, mirrored
+          input_rows[0][iy] = group_dec_cache->dc_buffer.Row(
+              Mirror(ssize_t(y) + iy - 2,
+                     dec_state->shared->dc->Plane(c).ysize() >> vs) +
+              2 - src_rect.y0());
+        }
+        for (size_t iy = 0; iy < 8; iy++) {  // 8 output rows per DC row
+          output_rows[0][iy] =
+              dst_rect.Row(upsampling_dst, ((y - src_rect.y0()) << 3) + iy) -
+              kRenderPipelineXOffset;
+        }
+        // Arguments set to 0/nullptr are not used.
+        dec_state->upsampler8x->ProcessRow(input_rows, output_rows,
+                                           /*xextra=*/0, src_rect.xsize(), 0, 0,
+                                           thread);
+      }
+    }
+    return true;
+  }
+
+  size_t histo_selector_bits = 0;
+  if (dc_only) {
+    JXL_ASSERT(num_passes == 0);
+  } else {
+    JXL_ASSERT(dec_state->shared->num_histograms > 0);
+    histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms);
+  }
+
+  GetBlockFromBitstream get_block;
+  JXL_RETURN_IF_ERROR(
+      get_block.Init(readers, num_passes, group_idx, histo_selector_bits,
+                     dec_state->shared->BlockGroupRect(group_idx),
+                     group_dec_cache, dec_state, first_pass));
+
+  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
+      &get_block, group_dec_cache, dec_state, thread, group_idx,
+      render_pipeline_input, decoded, draw));
+  // Every pass' ANS reader must pass its final-state check.
+  for (size_t pass = 0; pass < num_passes; pass++) {
+    if (!get_block.decoders[pass].CheckANSFinalState()) {
+      return JXL_FAILURE("ANS checksum failure.");
+    }
+  }
+  return true;
+}
+// Encoder-side variant: feeds DecodeGroupImpl from `ac` and always draws.
+Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac,
+                               size_t group_idx,
+                               PassesDecoderState* JXL_RESTRICT dec_state,
+                               GroupDecCache* JXL_RESTRICT group_dec_cache,
+                               size_t thread,
+                               RenderPipelineInput& render_pipeline_input,
+                               ImageBundle* JXL_RESTRICT decoded,
+                               AuxOut* aux_out) {  // aux_out is unused here
+  PROFILER_FUNC;
+
+  GetBlockFromEncoder get_block(ac, group_idx,
+                                dec_state->shared->frame_header.passes.shift);
+  group_dec_cache->InitOnce(
+      /*num_passes=*/0,
+      /*used_acs=*/(1u << AcStrategy::kNumValidStrategies) - 1);
+
+  return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
+      &get_block, group_dec_cache, dec_state, thread, group_idx,
+      render_pipeline_input, decoded, kDraw);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group.h b/third_party/jpeg-xl/lib/jxl/dec_group.h
new file mode 100644
index 0000000000..e32ea67b5f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_GROUP_H_
+#define LIB_JXL_DEC_GROUP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+// Decodes AC passes of one group; `should_run_pipeline` may be null.
+Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
+                   size_t num_passes, size_t group_idx,
+                   PassesDecoderState* JXL_RESTRICT dec_state,
+                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
+                   RenderPipelineInput& render_pipeline_input,
+                   ImageBundle* JXL_RESTRICT decoded, size_t first_pass,
+                   bool force_draw, bool dc_only, bool* should_run_pipeline);
+// Roundtrip variant that takes the quantized coefficients from `ac`.
+Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac,
+                               size_t group_idx,
+                               PassesDecoderState* JXL_RESTRICT dec_state,
+                               GroupDecCache* JXL_RESTRICT group_dec_cache,
+                               size_t thread,
+                               RenderPipelineInput& render_pipeline_input,
+                               ImageBundle* JXL_RESTRICT decoded,
+                               AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_GROUP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group_border.cc b/third_party/jpeg-xl/lib/jxl/dec_group_border.cc
new file mode 100644
index 0000000000..4bee3ae6ef
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group_border.cc
@@ -0,0 +1,184 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_group_border.h"
+
+#include <atomic>
+
+namespace jxl {
+// Allocates one counter per group corner and pre-fills the frame-border bits.
+void GroupBorderAssigner::Init(const FrameDimensions& frame_dim) {
+  frame_dim_ = frame_dim;
+  size_t num_corners =
+      (frame_dim_.xsize_groups + 1) * (frame_dim_.ysize_groups + 1);
+  counters_.reset(new std::atomic<uint8_t>[num_corners]);
+  // Initialize counters.
+  for (size_t y = 0; y < frame_dim_.ysize_groups + 1; y++) {
+    for (size_t x = 0; x < frame_dim_.xsize_groups + 1; x++) {
+      // Counters at image borders don't have anything on the other side, we
+      // pre-fill their value to have more uniform handling afterwards.
+      uint8_t init_value = 0;
+      if (x == 0) {
+        init_value |= kTopLeft | kBottomLeft;
+      }
+      if (x == frame_dim_.xsize_groups) {
+        init_value |= kTopRight | kBottomRight;
+      }
+      if (y == 0) {
+        init_value |= kTopLeft | kTopRight;
+      }
+      if (y == frame_dim_.ysize_groups) {
+        init_value |= kBottomLeft | kBottomRight;
+      }
+      counters_[y * (frame_dim_.xsize_groups + 1) + x] = init_value;
+    }
+  }
+}
+// Clears the bit belonging to `group_id` on each of its four corner counters.
+void GroupBorderAssigner::ClearDone(size_t group_id) {
+  size_t x = group_id % frame_dim_.xsize_groups;
+  size_t y = group_id / frame_dim_.xsize_groups;
+  size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x;
+  size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x;
+  counters_[top_left_idx].fetch_and(~kBottomRight);  // group is BR of its TL
+  counters_[top_right_idx].fetch_and(~kBottomLeft);
+  counters_[bottom_left_idx].fetch_and(~kTopRight);
+  counters_[bottom_right_idx].fetch_and(~kTopLeft);
+}
+
+// Looking at each corner between groups, we can guarantee that the four
+// involved groups will agree between each other regarding the order in which
+// each of the four groups terminated. Thus, the last of the four groups
+// gets the responsibility of handling the corner. For borders, every border
+// is assigned to its top corner (for vertical borders) or to its left corner
+// (for horizontal borders): the order as seen on those corners will decide who
+// handles that border.
+// GroupDone sets this group's bit on its four corners and outputs its rects.
+void GroupBorderAssigner::GroupDone(size_t group_id, size_t padx, size_t pady,
+                                    Rect* rects_to_finalize,
+                                    size_t* num_to_finalize) {
+  size_t x = group_id % frame_dim_.xsize_groups;
+  size_t y = group_id / frame_dim_.xsize_groups;
+  Rect block_rect(x * frame_dim_.group_dim / kBlockDim,
+                  y * frame_dim_.group_dim / kBlockDim,
+                  frame_dim_.group_dim / kBlockDim,
+                  frame_dim_.group_dim / kBlockDim, frame_dim_.xsize_blocks,
+                  frame_dim_.ysize_blocks);
+  // Indices of the four corner counters surrounding this group.
+  size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x;
+  size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x;
+
+  auto fetch_status = [this](size_t idx, uint8_t bit) {
+    // Note that the acq-rel semantics of this fetch are actually needed to
+    // ensure that the pixel data of the group is already written to memory.
+    size_t status = counters_[idx].fetch_or(bit);
+    JXL_DASSERT((bit & status) == 0);
+    return bit | status;
+  };
+
+  size_t top_left_status = fetch_status(top_left_idx, kBottomRight);
+  size_t top_right_status = fetch_status(top_right_idx, kBottomLeft);
+  size_t bottom_right_status = fetch_status(bottom_right_idx, kTopLeft);
+  size_t bottom_left_status = fetch_status(bottom_left_idx, kTopRight);
+  // Exclusive end of this group's rect, in blocks.
+  size_t x1 = block_rect.x0() + block_rect.xsize();
+  size_t y1 = block_rect.y0() + block_rect.ysize();
+
+  bool is_last_group_x = frame_dim_.xsize_groups == x + 1;
+  bool is_last_group_y = frame_dim_.ysize_groups == y + 1;
+
+  // Start of border of neighbouring group, end of border of this group, start
+  // of border of this group (on the other side), end of border of next group.
+  size_t xpos[4] = {
+      block_rect.x0() == 0 ? 0 : block_rect.x0() * kBlockDim - padx,
+      block_rect.x0() == 0
+          ? 0
+          : std::min(frame_dim_.xsize, block_rect.x0() * kBlockDim + padx),
+      is_last_group_x ? frame_dim_.xsize : x1 * kBlockDim - padx,
+      std::min(frame_dim_.xsize, x1 * kBlockDim + padx)};
+  size_t ypos[4] = {
+      block_rect.y0() == 0 ? 0 : block_rect.y0() * kBlockDim - pady,
+      block_rect.y0() == 0
+          ? 0
+          : std::min(frame_dim_.ysize, block_rect.y0() * kBlockDim + pady),
+      is_last_group_y ? frame_dim_.ysize : y1 * kBlockDim - pady,
+      std::min(frame_dim_.ysize, y1 * kBlockDim + pady)};
+
+  *num_to_finalize = 0;
+  auto append_rect = [&](size_t x0, size_t x1, size_t y0, size_t y1) {
+    Rect rect(xpos[x0], ypos[y0], xpos[x1] - xpos[x0], ypos[y1] - ypos[y0]);
+    if (rect.xsize() == 0 || rect.ysize() == 0) return;  // skip empty rects
+    JXL_DASSERT(*num_to_finalize < kMaxToFinalize);
+    rects_to_finalize[(*num_to_finalize)++] = rect;
+  };
+
+  // Because of how group borders are assigned, it is impossible that we need to
+  // process the left and right side of some area but not the center area. Thus,
+  // we compute the first/last part to process in every horizontal strip and
+  // merge them together. We first collect a mask of what parts should be
+  // processed.
+  // We do this horizontally rather than vertically because horizontal borders
+  // are larger.
+  bool available_parts_mask[3][3] = {};  // [x][y]
+  // Center
+  available_parts_mask[1][1] = true;
+  // Corners
+  if (top_left_status == 0xF) available_parts_mask[0][0] = true;
+  if (top_right_status == 0xF) available_parts_mask[2][0] = true;
+  if (bottom_right_status == 0xF) available_parts_mask[2][2] = true;
+  if (bottom_left_status == 0xF) available_parts_mask[0][2] = true;
+  // Other borders
+  if (top_left_status & kTopRight) available_parts_mask[1][0] = true;
+  if (top_left_status & kBottomLeft) available_parts_mask[0][1] = true;
+  if (top_right_status & kBottomRight) available_parts_mask[2][1] = true;
+  if (bottom_left_status & kBottomRight) available_parts_mask[1][2] = true;
+
+  // Collect horizontal ranges.
+  constexpr size_t kNoSegment = 3;
+  std::pair<size_t, size_t> horizontal_segments[3] = {{kNoSegment, kNoSegment},
+                                                      {kNoSegment, kNoSegment},
+                                                      {kNoSegment, kNoSegment}};
+  for (size_t y = 0; y < 3; y++) {
+    for (size_t x = 0; x < 3; x++) {
+      if (!available_parts_mask[x][y]) continue;
+      JXL_DASSERT(horizontal_segments[y].second == kNoSegment ||
+                  horizontal_segments[y].second == x);
+      JXL_DASSERT((horizontal_segments[y].first == kNoSegment) ==
+                  (horizontal_segments[y].second == kNoSegment));
+      if (horizontal_segments[y].first == kNoSegment) {
+        horizontal_segments[y].first = x;
+      }
+      horizontal_segments[y].second = x + 1;
+    }
+  }
+  if (horizontal_segments[0] == horizontal_segments[1] &&
+      horizontal_segments[0] == horizontal_segments[2]) {  // one merged rect
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                3);
+  } else if (horizontal_segments[0] == horizontal_segments[1]) {
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                2);
+    append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2,
+                3);
+  } else if (horizontal_segments[1] == horizontal_segments[2]) {
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                1);
+    append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1,
+                3);
+  } else {  // no two adjacent strips share a range: one rect per strip
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                1);
+    append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1,
+                2);
+    append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2,
+                3);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group_border.h b/third_party/jpeg-xl/lib/jxl/dec_group_border.h
new file mode 100644
index 0000000000..2d974c9987
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group_border.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_GROUP_BORDER_H_
+#define LIB_JXL_DEC_GROUP_BORDER_H_
+
+#include <stddef.h>
+
+#include <atomic>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+class GroupBorderAssigner {
+ public:
+  // Prepare the GroupBorderAssigner to handle a given frame.
+  void Init(const FrameDimensions& frame_dim);
+  // Marks a group as done, and stores the (at most kMaxToFinalize) rects to
+  // run FinalizeImageRect on in `rects_to_finalize`, and their count in
+  // `*num_to_finalize`. `padx`/`pady` are the border extents around a group.
+  void GroupDone(size_t group_id, size_t padx, size_t pady,
+                 Rect* rects_to_finalize, size_t* num_to_finalize);
+  // Marks a group as not-done, for running re-paints.
+  void ClearDone(size_t group_id);
+
+  static constexpr size_t kMaxToFinalize = 3;
+
+ private:
+  FrameDimensions frame_dim_;
+  std::unique_ptr<std::atomic<uint8_t>[]> counters_;  // one per group corner
+
+  // Constants to identify group positions relative to the corners.
+  static constexpr uint8_t kTopLeft = 0x01;
+  static constexpr uint8_t kTopRight = 0x02;
+  static constexpr uint8_t kBottomRight = 0x04;
+  static constexpr uint8_t kBottomLeft = 0x08;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_GROUP_BORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.cc b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc
new file mode 100644
index 0000000000..05b275773a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc
@@ -0,0 +1,255 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_huffman.h"
+
+#include <string.h> /* for memset */
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/huffman_table.h"
+
+namespace jxl {
+
+static const int kCodeLengthCodes = 18;
+static const uint8_t kCodeLengthCodeOrder[kCodeLengthCodes] = {
+    1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+static const uint8_t kDefaultCodeLength = 8;
+static const uint8_t kCodeLengthRepeatCode = 16;
+// Reads num_symbols code lengths into code_lengths. Returns 1 if ok, else 0.
+int ReadHuffmanCodeLengths(const uint8_t* code_length_code_lengths,
+                           int num_symbols, uint8_t* code_lengths,
+                           BitReader* br) {
+  int symbol = 0;
+  uint8_t prev_code_len = kDefaultCodeLength;
+  int repeat = 0;
+  uint8_t repeat_code_len = 0;
+  int space = 32768;
+  HuffmanCode table[32];
+  // Histogram the code-length code lengths and build their 5-bit lookup table.
+  uint16_t counts[16] = {0};
+  for (int i = 0; i < kCodeLengthCodes; ++i) {
+    ++counts[code_length_code_lengths[i]];
+  }
+  if (!BuildHuffmanTable(table, 5, code_length_code_lengths, kCodeLengthCodes,
+                         &counts[0])) {
+    return 0;
+  }
+  // `space` is the code space still unassigned; it must end at exactly 0.
+  while (symbol < num_symbols && space > 0) {
+    const HuffmanCode* p = table;
+    uint8_t code_len;
+    br->Refill();
+    p += br->PeekFixedBits<5>();
+    br->Consume(p->bits);
+    code_len = (uint8_t)p->value;
+    if (code_len < kCodeLengthRepeatCode) {  // literal code length
+      repeat = 0;
+      code_lengths[symbol++] = code_len;
+      if (code_len != 0) {
+        prev_code_len = code_len;
+        space -= 32768u >> code_len;
+      }
+    } else {  // repeat code: 16 repeats the previous length, otherwise zeros
+      const int extra_bits = code_len - 14;
+      int old_repeat;
+      int repeat_delta;
+      uint8_t new_len = 0;
+      if (code_len == kCodeLengthRepeatCode) {
+        new_len = prev_code_len;
+      }
+      if (repeat_code_len != new_len) {
+        repeat = 0;
+        repeat_code_len = new_len;
+      }
+      old_repeat = repeat;
+      if (repeat > 0) {
+        repeat -= 2;
+        repeat <<= extra_bits;
+      }
+      repeat += (int)br->ReadBits(extra_bits) + 3;
+      repeat_delta = repeat - old_repeat;
+      if (symbol + repeat_delta > num_symbols) {
+        return 0;
+      }
+      memset(&code_lengths[symbol], repeat_code_len, (size_t)repeat_delta);
+      symbol += repeat_delta;
+      if (repeat_code_len != 0) {
+        space -= repeat_delta << (15 - repeat_code_len);
+      }
+    }
+  }
+  if (space != 0) {
+    return 0;
+  }
+  memset(&code_lengths[symbol], 0, (size_t)(num_symbols - symbol));
+  return 1;  // was `return true`; the function reports status as an int
+}
+// Reads a code of 1-4 explicitly listed symbols and fills the lookup `table`.
+static JXL_INLINE bool ReadSimpleCode(size_t alphabet_size, BitReader* br,
+                                      HuffmanCode* table) {
+  size_t max_bits =
+      (alphabet_size > 1u) ? FloorLog2Nonzero(alphabet_size - 1u) + 1 : 0;
+
+  size_t num_symbols = br->ReadFixedBits<2>() + 1;
+
+  uint16_t symbols[4] = {0};
+  for (size_t i = 0; i < num_symbols; ++i) {
+    uint16_t symbol = br->ReadBits(max_bits);
+    if (symbol >= alphabet_size) {
+      return false;
+    }
+    symbols[i] = symbol;
+  }
+  // Reject duplicated symbols.
+  for (size_t i = 0; i < num_symbols - 1; ++i) {
+    for (size_t j = i + 1; j < num_symbols; ++j) {
+      if (symbols[i] == symbols[j]) return false;
+    }
+  }
+
+  // With 4 symbols, one extra bit selects between two code-length layouts.
+  if (num_symbols == 4) num_symbols += br->ReadFixedBits<1>();
+
+  const auto swap_symbols = [&symbols](size_t i, size_t j) {
+    uint16_t t = symbols[j];
+    symbols[j] = symbols[i];
+    symbols[i] = t;
+  };
+
+  size_t table_size = 1;
+  switch (num_symbols) {
+    case 1:
+      table[0] = {0, symbols[0]};
+      break;
+    case 2:
+      if (symbols[0] > symbols[1]) swap_symbols(0, 1);
+      table[0] = {1, symbols[0]};
+      table[1] = {1, symbols[1]};
+      table_size = 2;
+      break;
+    case 3:
+      if (symbols[1] > symbols[2]) swap_symbols(1, 2);
+      table[0] = {1, symbols[0]};
+      table[2] = {1, symbols[0]};
+      table[1] = {2, symbols[1]};
+      table[3] = {2, symbols[2]};
+      table_size = 4;
+      break;
+    case 4: {  // four symbols, all with 2-bit codes
+      for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = i + 1; j < 4; ++j) {
+          if (symbols[i] > symbols[j]) swap_symbols(i, j);
+        }
+      }
+      table[0] = {2, symbols[0]};
+      table[2] = {2, symbols[1]};
+      table[1] = {2, symbols[2]};
+      table[3] = {2, symbols[3]};
+      table_size = 4;
+      break;
+    }
+    case 5: {  // four symbols with code lengths 1, 2, 3, 3
+      if (symbols[2] > symbols[3]) swap_symbols(2, 3);
+      table[0] = {1, symbols[0]};
+      table[1] = {2, symbols[1]};
+      table[2] = {1, symbols[0]};
+      table[3] = {3, symbols[2]};
+      table[4] = {1, symbols[0]};
+      table[5] = {2, symbols[1]};
+      table[6] = {1, symbols[0]};
+      table[7] = {3, symbols[3]};
+      table_size = 8;
+      break;
+    }
+    default: {
+      // Unreachable.
+      return false;
+    }
+  }
+  // Replicate the filled prefix until the table has 1 << kHuffmanTableBits.
+  const uint32_t goal_size = 1u << kHuffmanTableBits;
+  while (table_size != goal_size) {
+    memcpy(&table[table_size], &table[0],
+           (size_t)table_size * sizeof(table[0]));
+    table_size <<= 1;
+  }
+
+  return true;
+}
+
+bool HuffmanDecodingData::ReadFromBitStream(size_t alphabet_size,
+                                            BitReader* br) {
+  if (alphabet_size > (1 << PREFIX_MAX_BITS)) return false;
+
+  /* simple_code_or_skip is used as follows:
+     1 for simple code;
+     0 for no skipping, 2 skips 2 code lengths, 3 skips 3 code lengths */
+  uint32_t simple_code_or_skip = br->ReadFixedBits<2>();
+  if (simple_code_or_skip == 1u) {
+    table_.resize(1u << kHuffmanTableBits);
+    return ReadSimpleCode(alphabet_size, br, table_.data());
+  }
+
+  std::vector<uint8_t> code_lengths(alphabet_size, 0);
+  uint8_t code_length_code_lengths[kCodeLengthCodes] = {0};
+  int space = 32;
+  int num_codes = 0;
+  /* Static Huffman code for the code length code lengths */
+  static const HuffmanCode huff[16] = {
+      {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1},
+      {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 5},
+  };
+  for (size_t i = simple_code_or_skip; i < kCodeLengthCodes && space > 0; ++i) {
+    const int code_len_idx = kCodeLengthCodeOrder[i];
+    const HuffmanCode* p = huff;
+    uint8_t v;
+    br->Refill();
+    p += br->PeekFixedBits<4>();
+    br->Consume(p->bits);
+    v = (uint8_t)p->value;
+    code_length_code_lengths[code_len_idx] = v;
+    if (v != 0) {
+      space -= (32u >> v);
+      ++num_codes;
+    }
+  }
+  bool ok = (num_codes == 1 || space == 0) &&
+            ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size,
+                                   &code_lengths[0], br);
+
+  if (!ok) return false;
+  uint16_t counts[16] = {0};
+  for (size_t i = 0; i < alphabet_size; ++i) {
+    ++counts[code_lengths[i]];
+  }
+  table_.resize(alphabet_size + 376);
+  uint32_t table_size =
+      BuildHuffmanTable(table_.data(), kHuffmanTableBits, &code_lengths[0],
+                        alphabet_size, &counts[0]);
+  table_.resize(table_size);
+  return (table_size > 0);
+}
+
+// Decodes the next Huffman coded symbol from the bit-stream.
+uint16_t HuffmanDecodingData::ReadSymbol(BitReader* br) const {
+  size_t n_bits;
+  const HuffmanCode* table = table_.data();
+  table += br->PeekBits(kHuffmanTableBits);
+  n_bits = table->bits;
+  if (n_bits > kHuffmanTableBits) {  // long code: follow into 2nd-level table
+    br->Consume(kHuffmanTableBits);
+    n_bits -= kHuffmanTableBits;
+    table += table->value;
+    table += br->PeekBits(n_bits);
+  }
+  br->Consume(table->bits);
+  return table->value;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.h b/third_party/jpeg-xl/lib/jxl/dec_huffman.h
new file mode 100644
index 0000000000..162c3e309c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_HUFFMAN_H_
+#define LIB_JXL_DEC_HUFFMAN_H_
+
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/huffman_table.h"
+
+namespace jxl {
+
+static constexpr size_t kHuffmanTableBits = 8u;  // bits peeked for the first-level table lookup in ReadSymbol
+
+struct HuffmanDecodingData {
+  // Decodes the Huffman code lengths from the bit-stream and fills in the
+  // pre-allocated table with the corresponding 2-level Huffman decoding table.
+  // Returns false if the Huffman code lengths cannot be decoded.
+  bool ReadFromBitStream(size_t alphabet_size, BitReader* br);
+
+  uint16_t ReadSymbol(BitReader* br) const;  // decodes one symbol by looking it up in table_
+
+  std::vector<HuffmanCode> table_;  // decoding table filled by ReadFromBitStream and read by ReadSymbol
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_HUFFMAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.cc b/third_party/jpeg-xl/lib/jxl/dec_modular.cc
new file mode 100644
index 0000000000..bf85eaa05c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_modular.cc
@@ -0,0 +1,774 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_modular.h"
+
+#include <stdint.h>
+
+#include <atomic>
+#include <sstream>
+#include <vector>
+
+#include "lib/jxl/frame_header.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_modular.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+
+void MultiplySum(const size_t xsize,  // computes row_out[x] = (row_in[x] + row_in_Y[x]) * factor
+                 const pixel_type* const JXL_RESTRICT row_in,
+                 const pixel_type* const JXL_RESTRICT row_in_Y,
+                 const float factor, float* const JXL_RESTRICT row_out) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+  const auto factor_v = Set(df, factor);
+  for (size_t x = 0; x < xsize; x += Lanes(di)) {  // whole vectors only: the last step may touch indices >= xsize; NOTE(review): presumably rows are padded - confirm
+    const auto in = Add(Load(di, row_in + x), Load(di, row_in_Y + x));  // sum is formed in pixel_type lanes, before conversion to float
+    const auto out = Mul(ConvertTo(df, in), factor_v);
+    Store(out, df, row_out + x);
+  }
+}
+
+void RgbFromSingle(const size_t xsize,  // writes row_in[x] * factor to all three of out_r, out_g and out_b
+                   const pixel_type* const JXL_RESTRICT row_in,
+                   const float factor, float* out_r, float* out_g,
+                   float* out_b) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+
+  const auto factor_v = Set(df, factor);
+  for (size_t x = 0; x < xsize; x += Lanes(di)) {  // whole vectors only: the last step may touch indices >= xsize; NOTE(review): presumably rows are padded - confirm
+    const auto in = Load(di, row_in + x);
+    const auto out = Mul(ConvertTo(df, in), factor_v);
+    Store(out, df, out_r + x);  // the same converted vector is stored to each output row
+    Store(out, df, out_g + x);
+    Store(out, df, out_b + x);
+  }
+}
+
+void SingleFromSingle(const size_t xsize,  // computes row_out[x] = row_in[x] * factor in single precision
+                      const pixel_type* const JXL_RESTRICT row_in,
+                      const float factor, float* row_out) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+
+  const auto factor_v = Set(df, factor);
+  for (size_t x = 0; x < xsize; x += Lanes(di)) {  // whole vectors only: the last step may touch indices >= xsize; NOTE(review): presumably rows are padded - confirm
+    const auto in = Load(di, row_in + x);
+    const auto out = Mul(ConvertTo(df, in), factor_v);
+    Store(out, df, row_out + x);
+  }
+}
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(MultiplySum);       // Local function
+HWY_EXPORT(RgbFromSingle);     // Local function
+HWY_EXPORT(SingleFromSingle);  // Local function
+
+// Slow conversion using double precision multiplication, only
+// needed when the bit depth is too high for single precision (callers in this file use it when bitdepth >= 23)
+void SingleFromSingleAccurate(const size_t xsize,
+                              const pixel_type* const JXL_RESTRICT row_in,
+                              const double factor, float* row_out) {
+  for (size_t x = 0; x < xsize; x++) {  // scalar loop: touches exactly xsize elements
+    row_out[x] = row_in[x] * factor;  // product is computed in double, then narrowed to float on assignment
+  }
+}
+
+// convert custom [bits]-bit float (with [exp_bits] exponent bits) stored as int
+// back to binary32 float; reads xsize values from row_in and writes xsize floats to row_out
+void int_to_float(const pixel_type* const JXL_RESTRICT row_in,
+                  float* const JXL_RESTRICT row_out, const size_t xsize,
+                  const int bits, const int exp_bits) {
+  if (bits == 32) {  // input already has the binary32 layout: copy the bit patterns unchanged
+    JXL_ASSERT(sizeof(pixel_type) == sizeof(float));
+    JXL_ASSERT(exp_bits == 8);
+    memcpy(row_out, row_in, xsize * sizeof(float));
+    return;
+  }
+  int exp_bias = (1 << (exp_bits - 1)) - 1;
+  int sign_shift = bits - 1;  // the sign is the top bit of the [bits]-bit value
+  int mant_bits = bits - exp_bits - 1;
+  int mant_shift = 23 - mant_bits;  // left-aligns the source mantissa in a 23-bit mantissa field
+  for (size_t x = 0; x < xsize; ++x) {
+    uint32_t f;
+    memcpy(&f, &row_in[x], 4);  // copies 4 bytes, i.e. treats pixel_type as 32 bits wide
+    int signbit = (f >> sign_shift);
+    f &= (1 << sign_shift) - 1;  // keep only the bits below the sign bit
+    if (f == 0) {  // zero keeps its sign
+      row_out[x] = (signbit ? -0.f : 0.f);
+      continue;
+    }
+    int exp = (f >> mant_bits);
+    int mantissa = (f & ((1 << mant_bits) - 1));
+    mantissa <<= mant_shift;
+    // Try to normalize only if there is space for maneuver.
+    if (exp == 0 && exp_bits < 8) {
+      // subnormal number
+      while ((mantissa & 0x800000) == 0) {  // shift left until a 1 reaches bit 23, lowering exp for each step
+        mantissa <<= 1;
+        exp--;
+      }
+      exp++;
+      // remove leading 1 because it is implicit now
+      mantissa &= 0x7fffff;
+    }
+    exp -= exp_bias;
+    // broke up the arbitrary float into its parts, now reassemble into
+    // binary32
+    exp += 127;  // 127 is the binary32 exponent bias; NOTE(review): an all-ones source exponent is not special-cased here - confirm intended
+    JXL_ASSERT(exp >= 0);
+    f = (signbit ? 0x80000000 : 0);
+    f |= (exp << 23);
+    f |= mantissa;
+    memcpy(&row_out[x], &f, 4);
+  }
+}
+
+std::string ModularStreamId::DebugString() const {  // builds a label: the kind's name followed by the ids relevant to that kind
+  std::ostringstream os;
+  os << (kind == kGlobalData   ? "ModularGlobal"
+         : kind == kVarDCTDC   ? "VarDCTDC"
+         : kind == kModularDC  ? "ModularDC"
+         : kind == kACMetadata ? "ACMeta"
+         : kind == kQuantTable ? "QuantTable"
+         : kind == kModularAC  ? "ModularAC"
+                               : "");  // any other kind value yields an empty name
+  if (kind == kVarDCTDC || kind == kModularDC || kind == kACMetadata ||
+      kind == kModularAC) {  // these four kinds append their group id
+    os << " group " << group_id;
+  }
+  if (kind == kModularAC) {  // only kModularAC also appends the pass id
+    os << " pass " << pass_id;
+  }
+  if (kind == kQuantTable) {
+    os << " " << quant_table_id;
+  }
+  return os.str();
+}
+
+Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader,  // reads the optional global tree/histograms and the global modular stream, then stores the result in full_image
+                                             const FrameHeader& frame_header,
+                                             bool allow_truncated_group) {
+  bool decode_color = frame_header.encoding == FrameEncoding::kModular;
+  const auto& metadata = frame_header.nonserialized_metadata->m;
+  bool is_gray = metadata.color_encoding.IsGray();
+  size_t nb_chans = 3;
+  if (is_gray && frame_header.color_transform == ColorTransform::kNone) {  // gray without a color transform uses a single color channel
+    nb_chans = 1;
+  }
+  do_color = decode_color;
+  size_t nb_extra = metadata.extra_channel_info.size();
+  bool has_tree = reader->ReadBits(1);
+  if (!allow_truncated_group ||
+      reader->TotalBitsConsumed() < reader->TotalBytes() * kBitsPerByte) {  // in truncated mode the tree is only read while unread input remains
+    if (has_tree) {
+      size_t tree_size_limit =  // min(2^22, 1024 + xsize * ysize * channel count / 16)
+          std::min(static_cast<size_t>(1 << 22),
+                   1024 + frame_dim.xsize * frame_dim.ysize *
+                              (nb_chans + nb_extra) / 16);
+      JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit));
+      JXL_RETURN_IF_ERROR(
+          DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map));
+    }
+  }
+  if (!do_color) nb_chans = 0;  // note: the tree size limit above was computed with the color channel count
+
+  bool fp = metadata.bit_depth.floating_point_sample;
+
+  // bits_per_sample is just metadata for XYB images.
+  if (metadata.bit_depth.bits_per_sample >= 32 && do_color &&
+      frame_header.color_transform != ColorTransform::kXYB) {
+    if (metadata.bit_depth.bits_per_sample == 32 && fp == false) {
+      return JXL_FAILURE("uint32_t not supported in dec_modular");
+    } else if (metadata.bit_depth.bits_per_sample > 32) {
+      return JXL_FAILURE("bits_per_sample > 32 not supported");
+    }
+  }
+
+  Image gi(frame_dim.xsize, frame_dim.ysize, metadata.bit_depth.bits_per_sample,
+           nb_chans + nb_extra);
+
+  all_same_shift = true;
+  if (frame_header.color_transform == ColorTransform::kYCbCr) {  // only the YCbCr branch applies chroma subsampling shifts to the color channels
+    for (size_t c = 0; c < nb_chans; c++) {
+      gi.channel[c].hshift = frame_header.chroma_subsampling.HShift(c);
+      gi.channel[c].vshift = frame_header.chroma_subsampling.VShift(c);
+      size_t xsize_shifted =
+          DivCeil(frame_dim.xsize, 1 << gi.channel[c].hshift);
+      size_t ysize_shifted =
+          DivCeil(frame_dim.ysize, 1 << gi.channel[c].vshift);
+      gi.channel[c].shrink(xsize_shifted, ysize_shifted);
+      if (gi.channel[c].hshift != gi.channel[0].hshift ||
+          gi.channel[c].vshift != gi.channel[0].vshift)
+        all_same_shift = false;
+    }
+  }
+
+  for (size_t ec = 0, c = nb_chans; ec < nb_extra; ec++, c++) {  // extra channels are placed after the color channels in gi
+    size_t ecups = frame_header.extra_channel_upsampling[ec];
+    gi.channel[c].shrink(DivCeil(frame_dim.xsize_upsampled, ecups),
+                         DivCeil(frame_dim.ysize_upsampled, ecups));
+    gi.channel[c].hshift = gi.channel[c].vshift =
+        CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling);
+    if (gi.channel[c].hshift != gi.channel[0].hshift ||  // compared against channel 0, which is the first extra channel when nb_chans == 0
+        gi.channel[c].vshift != gi.channel[0].vshift)
+      all_same_shift = false;
+  }
+
+  JXL_DEBUG_V(6, "DecodeGlobalInfo: full_image (w/o transforms) %s",
+              gi.DebugString().c_str());
+  ModularOptions options;
+  options.max_chan_size = frame_dim.group_dim;
+  options.group_dim = frame_dim.group_dim;
+  Status dec_status = ModularGenericDecompress(
+      reader, gi, &global_header, ModularStreamId::Global().ID(frame_dim),
+      &options,
+      /*undo_transforms=*/false, &tree, &code, &context_map,
+      allow_truncated_group);
+  if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status);  // strict mode: any error is returned immediately
+  if (dec_status.IsFatalError()) {  // truncated mode: only fatal errors abort here
+    return JXL_FAILURE("Failed to decode global modular info");
+  }
+
+  // TODO(eustas): are we sure this can be done after partial decode?
+  have_something = false;
+  for (size_t c = 0; c < gi.channel.size(); c++) {  // have_something: some non-meta channel fits within group_dim in both dimensions
+    Channel& gic = gi.channel[c];
+    if (c >= gi.nb_meta_channels && gic.w <= frame_dim.group_dim &&
+        gic.h <= frame_dim.group_dim)
+      have_something = true;
+  }
+  // move global transforms to groups if possible
+  if (!have_something && all_same_shift) {
+    if (gi.transform.size() == 1 && gi.transform[0].id == TransformId::kRCT) {  // only a single RCT transform is moved
+      global_transform = gi.transform;
+      gi.transform.clear();
+      // TODO(jon): also move no-delta-palette out (trickier though)
+    }
+  }
+  full_image = std::move(gi);
+  JXL_DEBUG_V(6, "DecodeGlobalInfo: full_image (with transforms) %s",
+              full_image.DebugString().c_str());
+  return dec_status;  // in truncated mode this may still carry a non-fatal error status
+}
+
+void ModularFrameDecoder::MaybeDropFullImage() {  // releases full_image's pixel planes when the three conditions below hold
+  if (full_image.transform.empty() && !have_something && all_same_shift) {
+    use_full_image = false;  // DecodeGroup checks this flag and then writes groups to the render pipeline input instead of full_image
+    JXL_DEBUG_V(6, "Dropping full image");
+    for (auto& ch : full_image.channel) {
+      // keep metadata on channels around, but dealloc their planes
+      ch.plane = Plane<pixel_type>();
+    }
+  }
+}
+
+Status ModularFrameDecoder::DecodeGroup(  // decodes (or zero-fills) the channels of one group whose shift lies in [minShift, maxShift]
+    const Rect& rect, BitReader* reader, int minShift, int maxShift,
+    const ModularStreamId& stream, bool zerofill, PassesDecoderState* dec_state,
+    RenderPipelineInput* render_pipeline_input, bool allow_truncated,
+    bool* should_run_pipeline) {
+  JXL_DEBUG_V(6, "Decoding %s with rect %s and shift bracket %d..%d %s",
+              stream.DebugString().c_str(), Description(rect).c_str(), minShift,
+              maxShift, zerofill ? "using zerofill" : "");
+  JXL_DASSERT(stream.kind == ModularStreamId::kModularDC ||
+              stream.kind == ModularStreamId::kModularAC);
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  Image gi(xsize, ysize, full_image.bitdepth, 0);  // temporary image, created with 0 channels; channels are appended below
+  // start at the first bigger-than-groupsize non-metachannel
+  size_t c = full_image.nb_meta_channels;
+  for (; c < full_image.channel.size(); c++) {
+    Channel& fc = full_image.channel[c];
+    if (fc.w > frame_dim.group_dim || fc.h > frame_dim.group_dim) break;
+  }
+  size_t beginc = c;  // remembered so the copy-back loop at the end visits the same channels
+  for (; c < full_image.channel.size(); c++) {
+    Channel& fc = full_image.channel[c];
+    int shift = std::min(fc.hshift, fc.vshift);
+    if (shift > maxShift) continue;
+    if (shift < minShift) continue;
+    Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,
+           rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
+    if (r.xsize() == 0 || r.ysize() == 0) continue;
+    if (zerofill && use_full_image) {  // zero the group's rect directly in full_image; no temporary channel is created
+      for (size_t y = 0; y < r.ysize(); ++y) {
+        pixel_type* const JXL_RESTRICT row_out = r.Row(&fc.plane, y);
+        memset(row_out, 0, r.xsize() * sizeof(*row_out));
+      }
+    } else {
+      Channel gc(r.xsize(), r.ysize());
+      if (zerofill) ZeroFillImage(&gc.plane);
+      gc.hshift = fc.hshift;
+      gc.vshift = fc.vshift;
+      gi.channel.emplace_back(std::move(gc));
+    }
+  }
+  if (zerofill && use_full_image) return true;  // everything was already zeroed in place above
+  // Return early if there's nothing to decode. Otherwise there might be
+  // problems later (in ModularImageToDecodedRect).
+  if (gi.channel.empty()) {
+    if (dec_state && should_run_pipeline) {
+      const auto& frame_header = dec_state->shared->frame_header;
+      const auto* metadata = frame_header.nonserialized_metadata;
+      if (do_color || metadata->m.num_extra_channels > 0) {
+        // Signal to FrameDecoder that we do not have some of the required input
+        // for the render pipeline.
+        *should_run_pipeline = false;
+      }
+    }
+    JXL_DEBUG_V(6, "Nothing to decode, returning early.");
+    return true;
+  }
+  ModularOptions options;
+  if (!zerofill) {  // with zerofill the stream is not read at all
+    auto status = ModularGenericDecompress(
+        reader, gi, /*header=*/nullptr, stream.ID(frame_dim), &options,
+        /*undo_transforms=*/true, &tree, &code, &context_map, allow_truncated);
+    if (!allow_truncated) JXL_RETURN_IF_ERROR(status);
+    if (status.IsFatalError()) return status;  // a non-fatal error in truncated mode falls through and the data is still used
+  }
+  // Undo global transforms that have been pushed to the group level
+  if (!use_full_image) {
+    JXL_ASSERT(render_pipeline_input);
+    for (auto t : global_transform) {  // t is a copy of each stored transform
+      JXL_RETURN_IF_ERROR(t.Inverse(gi, global_header.wp_header));
+    }
+    JXL_RETURN_IF_ERROR(ModularImageToDecodedRect(gi, dec_state, nullptr,
+                                                  *render_pipeline_input,
+                                                  Rect(0, 0, gi.w, gi.h)));
+    return true;
+  }
+  int gic = 0;  // index into gi.channel; advances in the order channels were appended above
+  for (c = beginc; c < full_image.channel.size(); c++) {
+    Channel& fc = full_image.channel[c];
+    int shift = std::min(fc.hshift, fc.vshift);
+    if (shift > maxShift) continue;
+    if (shift < minShift) continue;
+    Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,
+           rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
+    if (r.xsize() == 0 || r.ysize() == 0) continue;
+    JXL_ASSERT(use_full_image);
+    CopyImageTo(/*rect_from=*/Rect(0, 0, r.xsize(), r.ysize()),
+                /*from=*/gi.channel[gic].plane,
+                /*rect_to=*/r, /*to=*/&fc.plane);
+    gic++;
+  }
+  return true;
+}
+
+Status ModularFrameDecoder::DecodeVarDCTDC(size_t group_id, BitReader* reader,  // decodes a 3-channel modular image for one DC group and passes it to DequantDC
+                                           PassesDecoderState* dec_state) {
+  const Rect r = dec_state->shared->DCGroupRect(group_id);
+  // TODO(eustas): investigate if we could reduce the impact of
+  //               EvalRationalPolynomial; generally speaking, the limit is
+  //               2**(128/(3*magic)), where 128 comes from IEEE 754 exponent,
+  //               3 comes from XybToRgb that cubes the values, and "magic" is
+  //               the sum of all other contributions. 2**18 is known to lead
+  //               to NaN on input found by fuzzing (see commit message).
+  Image image(r.xsize(), r.ysize(), full_image.bitdepth, 3);
+  size_t stream_id = ModularStreamId::VarDCTDC(group_id).ID(frame_dim);
+  reader->Refill();
+  size_t extra_precision = reader->ReadFixedBits<2>();
+  float mul = 1.0f / (1 << extra_precision);  // extra_precision is 2 bits, so mul is 1, 1/2, 1/4 or 1/8
+  ModularOptions options;
+  for (size_t c = 0; c < 3; c++) {
+    Channel& ch = image.channel[c < 2 ? c ^ 1 : c];  // indices 0 and 1 are swapped: the shift for c is applied to channel c ^ 1
+    ch.w >>= dec_state->shared->frame_header.chroma_subsampling.HShift(c);
+    ch.h >>= dec_state->shared->frame_header.chroma_subsampling.VShift(c);
+    ch.shrink();
+  }
+  if (!ModularGenericDecompress(
+          reader, image, /*header=*/nullptr, stream_id, &options,
+          /*undo_transforms=*/true, &tree, &code, &context_map)) {
+    return JXL_FAILURE("Failed to decode modular DC group");
+  }
+  DequantDC(r, &dec_state->shared_storage.dc_storage,
+            &dec_state->shared_storage.quant_dc, image,
+            dec_state->shared->quantizer.MulDC(), mul,
+            dec_state->shared->cmap.DCFactors(),
+            dec_state->shared->frame_header.chroma_subsampling,
+            dec_state->shared->block_ctx_map);
+  return true;
+}
+
+Status ModularFrameDecoder::DecodeAcMetadata(size_t group_id, BitReader* reader,  // decodes one DC group's color maps, AC strategies, quant field and EPF sharpness
+                                             PassesDecoderState* dec_state) {
+  const Rect r = dec_state->shared->DCGroupRect(group_id);
+  size_t upper_bound = r.xsize() * r.ysize();
+  reader->Refill();
+  size_t count = reader->ReadBits(CeilLog2Nonzero(upper_bound)) + 1;  // stored as count - 1; count becomes the width of channel 2 below
+  size_t stream_id = ModularStreamId::ACMetadata(group_id).ID(frame_dim);
+  // YToX, YToB, ACS + QF, EPF
+  Image image(r.xsize(), r.ysize(), full_image.bitdepth, 4);
+  static_assert(kColorTileDimInBlocks == 8, "Color tile size changed");
+  Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3);  // r divided by 8 (origin rounded down, size rounded up); the static_assert above ties 8 to kColorTileDimInBlocks
+  image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[2] = Channel(count, 2, 0, 0);  // two rows of count entries: row 0 is read as raw AC strategy, row 1 as quant field value
+  ModularOptions options;
+  if (!ModularGenericDecompress(
+          reader, image, /*header=*/nullptr, stream_id, &options,
+          /*undo_transforms=*/true, &tree, &code, &context_map)) {
+    return JXL_FAILURE("Failed to decode AC metadata");
+  }
+  ConvertPlaneAndClamp(Rect(image.channel[0].plane), image.channel[0].plane, cr,
+                       &dec_state->shared_storage.cmap.ytox_map);
+  ConvertPlaneAndClamp(Rect(image.channel[1].plane), image.channel[1].plane, cr,
+                       &dec_state->shared_storage.cmap.ytob_map);
+  size_t num = 0;  // index of the next unused entry in channel 2
+  bool is444 = dec_state->shared->frame_header.chroma_subsampling.Is444();
+  auto& ac_strategy = dec_state->shared_storage.ac_strategy;
+  size_t xlim = std::min(ac_strategy.xsize(), r.x0() + r.xsize());
+  size_t ylim = std::min(ac_strategy.ysize(), r.y0() + r.ysize());
+  uint32_t local_used_acs = 0;  // bitmask with one bit per raw strategy value seen in this group
+  for (size_t iy = 0; iy < r.ysize(); iy++) {
+    size_t y = r.y0() + iy;
+    int32_t* row_qf = r.Row(&dec_state->shared_storage.raw_quant_field, iy);
+    uint8_t* row_epf = r.Row(&dec_state->shared_storage.epf_sharpness, iy);
+    int32_t* row_in_1 = image.channel[2].plane.Row(0);
+    int32_t* row_in_2 = image.channel[2].plane.Row(1);
+    int32_t* row_in_3 = image.channel[3].plane.Row(iy);
+    for (size_t ix = 0; ix < r.xsize(); ix++) {
+      size_t x = r.x0() + ix;
+      int sharpness = row_in_3[ix];
+      if (sharpness < 0 || sharpness >= LoopFilter::kEpfSharpEntries) {
+        return JXL_FAILURE("Corrupted sharpness field");
+      }
+      row_epf[ix] = sharpness;
+      if (ac_strategy.IsValid(x, y)) {  // position already has a strategy; presumably set by an earlier multi-block strategy - confirm
+        continue;
+      }
+
+      if (num >= count) return JXL_FAILURE("Corrupted stream");
+
+      if (!AcStrategy::IsRawStrategyValid(row_in_1[num])) {
+        return JXL_FAILURE("Invalid AC strategy");
+      }
+      local_used_acs |= 1u << row_in_1[num];
+      AcStrategy acs = AcStrategy::FromRawStrategy(row_in_1[num]);
+      if ((acs.covered_blocks_x() > 1 || acs.covered_blocks_y() > 1) &&
+          !is444) {
+        return JXL_FAILURE(
+            "AC strategy not compatible with chroma subsampling");
+      }
+      // Ensure that blocks do not overflow *AC* groups.
+      size_t next_x_ac_block = (x / kGroupDimInBlocks + 1) * kGroupDimInBlocks;
+      size_t next_y_ac_block = (y / kGroupDimInBlocks + 1) * kGroupDimInBlocks;
+      size_t next_x_dct_block = x + acs.covered_blocks_x();
+      size_t next_y_dct_block = y + acs.covered_blocks_y();
+      if (next_x_dct_block > next_x_ac_block || next_x_dct_block > xlim) {
+        return JXL_FAILURE("Invalid AC strategy, x overflow");
+      }
+      if (next_y_dct_block > next_y_ac_block || next_y_dct_block > ylim) {
+        return JXL_FAILURE("Invalid AC strategy, y overflow");
+      }
+      JXL_RETURN_IF_ERROR(
+          ac_strategy.SetNoBoundsCheck(x, y, AcStrategy::Type(row_in_1[num])));
+      row_qf[ix] = 1 + std::max<int32_t>(0, std::min(Quantizer::kQuantMax - 1,  // stored value is clamped to [1, Quantizer::kQuantMax]
+                                                     row_in_2[num]));
+      num++;
+    }
+  }
+  dec_state->used_acs |= local_used_acs;
+  if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) {
+    ComputeSigma(r, dec_state);
+  }
+  return true;
+}
+
+Status ModularFrameDecoder::ModularImageToDecodedRect(  // converts the integer channels of gi inside modular_rect to float rows of the render pipeline input
+    Image& gi, PassesDecoderState* dec_state, jxl::ThreadPool* pool,
+    RenderPipelineInput& render_pipeline_input, Rect modular_rect) {
+  const auto& frame_header = dec_state->shared->frame_header;
+  const auto* metadata = frame_header.nonserialized_metadata;
+  JXL_CHECK(gi.transform.empty());
+
+  auto get_row = [&](size_t c, size_t y) {  // row y of render pipeline buffer c, offset by that buffer's rect
+    const auto& buffer = render_pipeline_input.GetBuffer(c);
+    return buffer.second.Row(buffer.first, y);
+  };
+
+  size_t c = 0;
+  if (do_color) {
+    const bool rgb_from_gray =
+        metadata->m.color_encoding.IsGray() &&
+        frame_header.color_transform == ColorTransform::kNone;
+    const bool fp = metadata->m.bit_depth.floating_point_sample &&
+                    frame_header.color_transform != ColorTransform::kXYB;
+    for (; c < 3; c++) {
+      double factor = full_image.bitdepth < 32  // scales the maximum bitdepth-bit integer to 1.0; 0 when bitdepth >= 32
+                          ? 1.0 / ((1u << full_image.bitdepth) - 1)
+                          : 0;
+      size_t c_in = c;
+      if (frame_header.color_transform == ColorTransform::kXYB) {
+        factor = dec_state->shared->matrices.DCQuants()[c];
+        // XYB is encoded as YX(B-Y)
+        if (c < 2) c_in = 1 - c;
+      } else if (rgb_from_gray) {
+        c_in = 0;
+      }
+      JXL_ASSERT(c_in < gi.channel.size());
+      Channel& ch_in = gi.channel[c_in];
+      // TODO(eustas): could we detect it on earlier stage?
+      if (ch_in.w == 0 || ch_in.h == 0) {
+        return JXL_FAILURE("Empty image");
+      }
+      JXL_CHECK(ch_in.hshift <= 3 && ch_in.vshift <= 3);
+      Rect r = render_pipeline_input.GetBuffer(c).second;
+      Rect mr(modular_rect.x0() >> ch_in.hshift,  // modular_rect scaled down by this channel's shifts
+              modular_rect.y0() >> ch_in.vshift,
+              DivCeil(modular_rect.xsize(), 1 << ch_in.hshift),
+              DivCeil(modular_rect.ysize(), 1 << ch_in.vshift));
+      mr = mr.Crop(ch_in.plane);
+      size_t xsize_shifted = r.xsize();
+      size_t ysize_shifted = r.ysize();
+      if (r.ysize() != mr.ysize() || r.xsize() != mr.xsize()) {
+        return JXL_FAILURE("Dimension mismatch: trying to fit a %" PRIuS
+                           "x%" PRIuS
+                           " modular channel into "
+                           "a %" PRIuS "x%" PRIuS " rect",
+                           mr.xsize(), mr.ysize(), r.xsize(), r.ysize());
+      }
+      if (frame_header.color_transform == ColorTransform::kXYB && c == 2) {  // third XYB channel is stored as B-Y, so channel 0 (Y) is added back by MultiplySum
+        JXL_ASSERT(!fp);
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const uint32_t task, size_t /* thread */) {
+              const size_t y = task;
+              const pixel_type* const JXL_RESTRICT row_in =
+                  mr.Row(&ch_in.plane, y);
+              const pixel_type* const JXL_RESTRICT row_in_Y =
+                  mr.Row(&gi.channel[0].plane, y);
+              float* const JXL_RESTRICT row_out = get_row(c, y);
+              HWY_DYNAMIC_DISPATCH(MultiplySum)
+              (xsize_shifted, row_in, row_in_Y, factor, row_out);
+            },
+            "ModularIntToFloat"));
+      } else if (fp) {  // float samples: the stored bit patterns are converted with int_to_float, factor is unused
+        int bits = metadata->m.bit_depth.bits_per_sample;
+        int exp_bits = metadata->m.bit_depth.exponent_bits_per_sample;
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const uint32_t task, size_t /* thread */) {
+              const size_t y = task;
+              const pixel_type* const JXL_RESTRICT row_in =
+                  mr.Row(&ch_in.plane, y);
+              if (rgb_from_gray) {
+                for (size_t cc = 0; cc < 3; cc++) {
+                  float* const JXL_RESTRICT row_out = get_row(cc, y);
+                  int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits);
+                }
+              } else {
+                float* const JXL_RESTRICT row_out = get_row(c, y);
+                int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits);
+              }
+            },
+            "ModularIntToFloat_losslessfloat"));
+      } else {  // integer samples: multiply by factor
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const uint32_t task, size_t /* thread */) {
+              const size_t y = task;
+              const pixel_type* const JXL_RESTRICT row_in =
+                  mr.Row(&ch_in.plane, y);
+              if (rgb_from_gray) {
+                if (full_image.bitdepth < 23) {  // below 23 bits the single-precision SIMD path is used, otherwise the double-precision scalar path
+                  HWY_DYNAMIC_DISPATCH(RgbFromSingle)
+                  (xsize_shifted, row_in, factor, get_row(0, y), get_row(1, y),
+                   get_row(2, y));
+                } else {
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           get_row(0, y));
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           get_row(1, y));
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           get_row(2, y));
+                }
+              } else {
+                float* const JXL_RESTRICT row_out = get_row(c, y);
+                if (full_image.bitdepth < 23) {
+                  HWY_DYNAMIC_DISPATCH(SingleFromSingle)
+                  (xsize_shifted, row_in, factor, row_out);
+                } else {
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           row_out);
+                }
+              }
+            },
+            "ModularIntToFloat"));
+      }
+      if (rgb_from_gray) {  // one pass already filled all three output buffers from channel 0
+        break;
+      }
+    }
+    if (rgb_from_gray) {
+      c = 1;  // only one color channel exists in gi, so extra channels start at index 1
+    }
+  }
+  size_t num_extra_channels = metadata->m.num_extra_channels;
+  for (size_t ec = 0; ec < num_extra_channels; ec++, c++) {  // c continues after the color channels; output goes to buffer 3 + ec
+    const ExtraChannelInfo& eci = metadata->m.extra_channel_info[ec];
+    int bits = eci.bit_depth.bits_per_sample;
+    int exp_bits = eci.bit_depth.exponent_bits_per_sample;
+    bool fp = eci.bit_depth.floating_point_sample;
+    JXL_ASSERT(fp || bits < 32);
+    const double factor = fp ? 0 : (1.0 / ((1u << bits) - 1));
+    JXL_ASSERT(c < gi.channel.size());
+    Channel& ch_in = gi.channel[c];
+    Rect r = render_pipeline_input.GetBuffer(3 + ec).second;
+    Rect mr(modular_rect.x0() >> ch_in.hshift,
+            modular_rect.y0() >> ch_in.vshift,
+            DivCeil(modular_rect.xsize(), 1 << ch_in.hshift),
+            DivCeil(modular_rect.ysize(), 1 << ch_in.vshift));
+    mr = mr.Crop(ch_in.plane);
+    if (r.ysize() != mr.ysize() || r.xsize() != mr.xsize()) {
+      return JXL_FAILURE("Dimension mismatch: trying to fit a %" PRIuS
+                         "x%" PRIuS
+                         " modular channel into "
+                         "a %" PRIuS "x%" PRIuS " rect",
+                         mr.xsize(), mr.ysize(), r.xsize(), r.ysize());
+    }
+    for (size_t y = 0; y < r.ysize(); ++y) {  // extra channels are converted on the calling thread; pool is not used here
+      float* const JXL_RESTRICT row_out =
+          r.Row(render_pipeline_input.GetBuffer(3 + ec).first, y);
+      const pixel_type* const JXL_RESTRICT row_in = mr.Row(&ch_in.plane, y);
+      if (fp) {
+        int_to_float(row_in, row_out, r.xsize(), bits, exp_bits);
+      } else {
+        if (full_image.bitdepth < 23) {  // NOTE(review): path is chosen by full_image.bitdepth while factor uses this channel's own bits - confirm intended
+          HWY_DYNAMIC_DISPATCH(SingleFromSingle)
+          (r.xsize(), row_in, factor, row_out);
+        } else {
+          SingleFromSingleAccurate(r.xsize(), row_in, factor, row_out);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+Status ModularFrameDecoder::FinalizeDecoding(PassesDecoderState* dec_state,  // undoes the global transforms on full_image and feeds every group to the render pipeline
+                                             jxl::ThreadPool* pool,
+                                             bool inplace) {
+  if (!use_full_image) return true;  // nothing to do: DecodeGroup already sent groups to the pipeline in this mode
+  Image gi = (inplace ? std::move(full_image) : full_image.clone());  // inplace moves full_image out; otherwise work on a copy
+  size_t xsize = gi.w;
+  size_t ysize = gi.h;
+
+  JXL_DEBUG_V(3, "Finalizing decoding for modular image: %s",
+              gi.DebugString().c_str());
+
+  // Don't use threads if total image size is smaller than a group
+  if (xsize * ysize < frame_dim.group_dim * frame_dim.group_dim) pool = nullptr;
+
+  // Undo the global transforms
+  gi.undo_transforms(global_header.wp_header, pool);
+  JXL_DASSERT(global_transform.empty());
+  if (gi.error) return JXL_FAILURE("Undoing transforms failed");
+
+  for (size_t i = 0; i < dec_state->shared->frame_dim.num_groups; i++) {
+    dec_state->render_pipeline->ClearDone(i);
+  }
+  std::atomic<bool> has_error{false};  // set by any worker whose conversion fails; checked after RunOnPool returns
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, dec_state->shared->frame_dim.num_groups,
+      [&](size_t num_threads) {
+        const auto& frame_header = dec_state->shared->frame_header;
+        bool use_group_ids = (frame_header.encoding == FrameEncoding::kVarDCT ||
+                              (frame_header.flags & FrameHeader::kNoise));
+        return dec_state->render_pipeline->PrepareForThreads(num_threads,
+                                                             use_group_ids);
+      },
+      [&](const uint32_t group, size_t thread_id) {
+        RenderPipelineInput input =
+            dec_state->render_pipeline->GetInputBuffers(group, thread_id);
+        if (!ModularImageToDecodedRect(gi, dec_state, nullptr, input,  // nullptr pool: no nested parallelism inside each group task
+                                       dec_state->shared->GroupRect(group))) {
+          has_error = true;
+          return;  // input.Done() is skipped for a failed group
+        }
+        input.Done();
+      },
+      "ModularToRect"));
+  if (has_error) {
+    return JXL_FAILURE("Error producing input to render pipeline");
+  }
+  return true;
+}
+
+static constexpr const float kAlmostZero = 1e-8f;  // DecodeQuantTable rejects a qtable_den below this value
+
+Status ModularFrameDecoder::DecodeQuantTable(  // reads qtable_den plus a 3-channel modular image and stores its samples in encoding->qraw.qtable
+    size_t required_size_x, size_t required_size_y, BitReader* br,
+    QuantEncoding* encoding, size_t idx,
+    ModularFrameDecoder* modular_frame_decoder) {
+  JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->qraw.qtable_den));
+  if (encoding->qraw.qtable_den < kAlmostZero) {
+    // qtable[] values are already checked for <= 0 so the denominator may not
+    // be negative.
+    return JXL_FAILURE("Invalid qtable_den: value too small");
+  }
+  Image image(required_size_x, required_size_y, 8, 3);
+  ModularOptions options;
+  if (modular_frame_decoder) {  // with a frame decoder, its tree/code/context_map and a QuantTable stream id are used
+    JXL_RETURN_IF_ERROR(ModularGenericDecompress(
+        br, image, /*header=*/nullptr,
+        ModularStreamId::QuantTable(idx).ID(modular_frame_decoder->frame_dim),
+        &options, /*undo_transforms=*/true, &modular_frame_decoder->tree,
+        &modular_frame_decoder->code, &modular_frame_decoder->context_map));
+  } else {  // without one, stream id 0 is used and no tree/code/context_map is passed
+    JXL_RETURN_IF_ERROR(ModularGenericDecompress(br, image, /*header=*/nullptr,
+                                                 0, &options,
+                                                 /*undo_transforms=*/true));
+  }
+  if (!encoding->qraw.qtable) {
+    encoding->qraw.qtable = new std::vector<int>();  // NOTE(review): raw new; the matching delete/owner is not visible in this function
+  }
+  encoding->qraw.qtable->resize(required_size_x * required_size_y * 3);
+  for (size_t c = 0; c < 3; c++) {  // layout: channel-major, then row-major within each channel
+    for (size_t y = 0; y < required_size_y; y++) {
+      int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
+      for (size_t x = 0; x < required_size_x; x++) {
+        (*encoding->qraw.qtable)[c * required_size_x * required_size_y +
+                                 y * required_size_x + x] = row[x];
+        if (row[x] <= 0) {  // checked after the store, so a failing table is left partially written
+          return JXL_FAILURE("Invalid raw quantization table");
+        }
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.h b/third_party/jpeg-xl/lib/jxl/dec_modular.h
new file mode 100644
index 0000000000..aae643cf1f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_modular.h
@@ -0,0 +1,140 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_MODULAR_H_
+#define LIB_JXL_DEC_MODULAR_H_
+
+#include <stddef.h>
+
+#include <string>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+// Identifies one modular sub-stream of a frame and maps it to a stream index.
+struct ModularStreamId {
+  enum Kind {
+    kGlobalData,
+    kVarDCTDC,
+    kModularDC,
+    kACMetadata,
+    kQuantTable,
+    kModularAC
+  };
+  Kind kind;
+  size_t quant_table_id;  // Only for kQuantTable.
+  size_t group_id;        // DC or AC group id.
+  size_t pass_id;         // Only for kModularAC.
+  // Index of this stream: the global stream (0), then one block of
+  // num_dc_groups ids each for VarDCT DC, modular DC and AC metadata, then
+  // DequantMatrices::kNum quant tables, then num_groups AC streams per pass.
+  size_t ID(const FrameDimensions& frame_dim) const {
+    const size_t num_dc = frame_dim.num_dc_groups;
+    switch (kind) {
+      case kGlobalData:
+        return 0;
+      case kVarDCTDC:
+        return 1 + group_id;
+      case kModularDC:
+        return 1 + num_dc + group_id;
+      case kACMetadata:
+        return 1 + 2 * num_dc + group_id;
+      case kQuantTable:
+        return 1 + 3 * num_dc + quant_table_id;
+      case kModularAC:
+        return 1 + 3 * num_dc + DequantMatrices::kNum +
+               frame_dim.num_groups * pass_id + group_id;
+    }
+    return 0;  // Only reached for a `kind` outside the enumerators above.
+  }
+  static ModularStreamId Global() {
+    return ModularStreamId{kGlobalData, 0, 0, 0};
+  }
+  static ModularStreamId VarDCTDC(size_t group_id) {
+    return ModularStreamId{kVarDCTDC, 0, group_id, 0};
+  }
+  static ModularStreamId ModularDC(size_t group_id) {
+    return ModularStreamId{kModularDC, 0, group_id, 0};
+  }
+  static ModularStreamId ACMetadata(size_t group_id) {
+    return ModularStreamId{kACMetadata, 0, group_id, 0};
+  }
+  static ModularStreamId QuantTable(size_t quant_table_id) {
+    JXL_ASSERT(quant_table_id < DequantMatrices::kNum);
+    return ModularStreamId{kQuantTable, quant_table_id, 0, 0};
+  }
+  static ModularStreamId ModularAC(size_t group_id, size_t pass_id) {
+    return ModularStreamId{kModularAC, 0, group_id, pass_id};
+  }
+  // Number of stream ids of a frame with `passes` passes: the id that the
+  // first AC group of a (non-existent) pass number `passes` would get.
+  static size_t Num(const FrameDimensions& frame_dim, size_t passes) {
+    return ModularAC(0, passes).ID(frame_dim);
+  }
+  std::string DebugString() const;
+};
+
+class ModularFrameDecoder {
+ public:
+  void Init(const FrameDimensions& frame_dim) { this->frame_dim = frame_dim; }
+  Status DecodeGlobalInfo(BitReader* reader, const FrameHeader& frame_header,
+                          bool allow_truncated_group);
+  Status DecodeGroup(const Rect& rect, BitReader* reader, int minShift,
+                     int maxShift, const ModularStreamId& stream, bool zerofill,
+                     PassesDecoderState* dec_state,
+                     RenderPipelineInput* render_pipeline_input,
+                     bool allow_truncated, bool* should_run_pipeline = nullptr);
+  // Decodes a VarDCT DC group (`group_id`) from the given `reader`.
+  Status DecodeVarDCTDC(size_t group_id, BitReader* reader,
+                        PassesDecoderState* dec_state);
+  // Decodes a VarDCT AC Metadata group (`group_id`) from the given `reader`.
+  Status DecodeAcMetadata(size_t group_id, BitReader* reader,
+                          PassesDecoderState* dec_state);
+  // Decodes a RAW quant table from `br` into the given `encoding`, of size
+  // `required_size_x x required_size_y`. If `modular_frame_decoder` is passed,
+  // its global tree is used, otherwise no global tree is used.
+  static Status DecodeQuantTable(size_t required_size_x, size_t required_size_y,
+                                 BitReader* br, QuantEncoding* encoding,
+                                 size_t idx,
+                                 ModularFrameDecoder* modular_frame_decoder);
+  // If `inplace` is true, this can only be called once.
+  // If it is false, it can be called multiple times (e.g. for progressive
+  // steps).
+  Status FinalizeDecoding(PassesDecoderState* dec_state, jxl::ThreadPool* pool,
+                          bool inplace);
+  bool have_dc() const { return have_something; }
+  void MaybeDropFullImage();
+  bool UsesFullImage() const { return use_full_image; }
+
+ private:
+  Status ModularImageToDecodedRect(Image& gi, PassesDecoderState* dec_state,
+                                   jxl::ThreadPool* pool,
+                                   RenderPipelineInput& render_pipeline_input,
+                                   Rect modular_rect);
+
+  Image full_image;
+  std::vector<Transform> global_transform;
+  FrameDimensions frame_dim;  // Set by Init().
+  // The flags below get defined defaults so that accessors such as have_dc()
+  // never read an indeterminate value on a freshly constructed decoder.
+  bool do_color = false;
+  bool have_something = false;  // Reported by have_dc().
+  bool use_full_image = true;
+  bool all_same_shift = false;
+  Tree tree;  // tree, code and context_map are passed on by DecodeQuantTable().
+  ANSCode code;
+  std::vector<uint8_t> context_map;
+  GroupHeader global_header;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_MODULAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_noise.cc b/third_party/jpeg-xl/lib/jxl/dec_noise.cc
new file mode 100644
index 0000000000..275a6d0b21
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_noise.cc
@@ -0,0 +1,131 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_noise.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_noise.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/xorshift128plus-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind<int, D>;
+using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
+
+// Converts one vector's worth of random bits to floats in [1, 2).
+// NOTE: as the convolution kernel sums to 0, it doesn't matter if inputs are in
+// [0, 1) or in [1, 2).
+void BitsToFloat(const uint32_t* JXL_RESTRICT random_bits,
+                 float* JXL_RESTRICT floats) {
+  const HWY_FULL(uint32_t) du;
+  const HWY_FULL(float) df;
+
+  // Top 23 bits become the mantissa; 0x3F800000 is the bit pattern of 1.0f.
+  const auto mantissa = ShiftRight<9>(Load(du, random_bits));
+  const auto one_to_two = BitCast(df, Or(mantissa, Set(du, 0x3F800000)));
+  Store(one_to_two, df, floats);
+}
+
+void RandomImage(Xorshift128Plus* rng, const Rect& rect,
+                 ImageF* JXL_RESTRICT noise) {
+  const HWY_FULL(float) df;
+  const size_t lanes = Lanes(df);
+  const size_t width = rect.xsize();
+  const size_t height = rect.ysize();
+
+  // One generator batch may hold more floats than a single vector, hence each
+  // batch is consumed by an inner loop of vector-sized BitsToFloat calls.
+  constexpr size_t kFloatsPerBatch =
+      Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float);
+  HWY_ALIGN uint64_t batch[Xorshift128Plus::N] = {};
+  const uint32_t* const bits = reinterpret_cast<const uint32_t*>(batch);
+
+  for (size_t y = 0; y < height; ++y) {
+    float* JXL_RESTRICT row = rect.Row(noise, y);
+    size_t x = 0;
+    // Whole batches first. The strict `<` keeps every store of this loop
+    // inside the row and always leaves at least one pixel to the tail loop.
+    while (x + kFloatsPerBatch < width) {
+      rng->Fill(batch);
+      for (size_t i = 0; i < kFloatsPerBatch; i += lanes) {
+        BitsToFloat(bits + i, row + x + i);
+      }
+      x += kFloatsPerBatch;
+    }
+
+    // Tail: one more batch per row, stored a vector at a time. The last store
+    // may extend past `width`; this relies on the row being padded.
+    rng->Fill(batch);
+    for (size_t used = 0; x < width; x += lanes, used += lanes) {
+      BitsToFloat(bits + used, row + x);
+    }
+  }
+}
+void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index,
+                   size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0,
+                   const std::pair<ImageF*, Rect>& plane1,
+                   const std::pair<ImageF*, Rect>& plane2) {
+  HWY_ALIGN Xorshift128Plus rng(visible_frame_index, nonvisible_frame_index, x0,
+                                y0);
+  for (const auto* plane : {&plane0, &plane1, &plane2}) {  // one shared rng
+    RandomImage(&rng, plane->second, plane->first);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Random3Planes);
+void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index,
+                   size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0,
+                   const std::pair<ImageF*, Rect>& plane1,
+                   const std::pair<ImageF*, Rect>& plane2) {
+  // Pass every argument through, unchanged, to the dispatched implementation.
+  HWY_DYNAMIC_DISPATCH(Random3Planes)
+  (visible_frame_index, nonvisible_frame_index, x0, y0, plane0, plane1, plane2);
+}
+
+void DecodeFloatParam(float precision, float* val, BitReader* br) {
+  // A 10-bit integer read from the stream, divided by `precision`.
+  *val = static_cast<int>(br->ReadFixedBits<10>()) / precision;
+}
+
+Status DecodeNoise(BitReader* br, NoiseParams* noise_params) {
+  // Each entry of the noise LUT is read as one quantized parameter, in order;
+  // this function itself never reports failure.
+  for (float& v : noise_params->lut) DecodeFloatParam(kNoisePrecision, &v, br);
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_noise.h b/third_party/jpeg-xl/lib/jxl/dec_noise.h
new file mode 100644
index 0000000000..ac05866470
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_noise.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_NOISE_H_
+#define LIB_JXL_DEC_NOISE_H_
+
+// Noise synthesis. Currently disabled.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index,
+                   size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0,
+                   const std::pair<ImageF*, Rect>& plane1,
+                   const std::pair<ImageF*, Rect>& plane2);
+
+// Must only call if FrameHeader.flags.kNoise.
+Status DecodeNoise(BitReader* br, NoiseParams* noise_params);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc
new file mode 100644
index 0000000000..85e5de3c8d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc
@@ -0,0 +1,347 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_patch_dictionary.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/blending.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/patch_dictionary_internal.h"
+
+namespace jxl {
+
+// Parses the patch dictionary of a frame of `xsize` x `ysize` pixels from
+// `br`, replacing previously decoded patches. `*uses_extra_channels` is set to
+// true (never to false) if a patch uses alpha or blends an extra channel.
+Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
+                               bool* uses_extra_channels) {
+  // blendings_ is indexed by patch position, so stale entries must go too.
+  positions_.clear();
+  ref_positions_.clear();
+  blendings_.clear();
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kNumPatchDictionaryContexts, &code, &context_map));
+  ANSSymbolReader decoder(&code, br);
+
+  auto read_num = [&](size_t context) -> size_t {
+    return decoder.ReadHybridUint(context, br, context_map);
+  };
+
+  size_t num_ref_patch = read_num(kNumRefPatchContext);
+  // Limit max memory usage of patches to about 66 bytes per pixel (assuming 8
+  // bytes per size_t)
+  const size_t num_pixels = xsize * ysize;
+  const size_t max_ref_patches = 1024 + num_pixels / 4;
+  const size_t max_patches = max_ref_patches * 4;
+  const size_t max_blending_infos = max_patches * 4;
+  if (num_ref_patch > max_ref_patches) {
+    return JXL_FAILURE("Too many patches in dictionary");
+  }
+  size_t num_ec = shared_->metadata->m.num_extra_channels;
+
+  size_t total_patches = 0;
+  size_t next_size = 1;
+
+  for (size_t id = 0; id < num_ref_patch; id++) {
+    PatchReferencePosition ref_pos;
+    ref_pos.ref = read_num(kReferenceFrameContext);
+    if (ref_pos.ref >= kMaxNumReferenceFrames ||
+        shared_->reference_frames[ref_pos.ref].frame.xsize() == 0) {
+      return JXL_FAILURE("Invalid reference frame ID");
+    }
+    if (!shared_->reference_frames[ref_pos.ref].ib_is_in_xyb) {
+      return JXL_FAILURE(
+          "Patches cannot use frames saved post color transforms");
+    }
+    const ImageBundle& ib = shared_->reference_frames[ref_pos.ref].frame;
+    ref_pos.x0 = read_num(kPatchReferencePositionContext);
+    ref_pos.y0 = read_num(kPatchReferencePositionContext);
+    ref_pos.xsize = read_num(kPatchSizeContext) + 1;
+    ref_pos.ysize = read_num(kPatchSizeContext) + 1;
+    // Bounds are compared by subtraction so that the sums cannot wrap around.
+    if (ref_pos.x0 > ib.xsize() || ref_pos.xsize > ib.xsize() - ref_pos.x0 ||
+        ref_pos.y0 > ib.ysize() || ref_pos.ysize > ib.ysize() - ref_pos.y0) {
+      return JXL_FAILURE("Invalid position specified in reference frame");
+    }
+    size_t id_count = read_num(kPatchCountContext) + 1;
+    total_patches += id_count;
+    if (total_patches > max_patches) {
+      return JXL_FAILURE("Too many patches in dictionary");
+    }
+    if (next_size < total_patches) {
+      next_size = std::min<size_t>(next_size * 2, max_patches);
+    }
+    if (next_size * (num_ec + 1) > max_blending_infos) {
+      return JXL_FAILURE("Too many patches in dictionary");
+    }
+    positions_.reserve(next_size);
+    blendings_.reserve(next_size * (num_ec + 1));
+    for (size_t i = 0; i < id_count; i++) {
+      PatchPosition pos;
+      pos.ref_pos_idx = ref_positions_.size();
+      if (i == 0) {
+        pos.x = read_num(kPatchPositionContext);
+        pos.y = read_num(kPatchPositionContext);
+      } else {
+        pos.x =
+            positions_.back().x + UnpackSigned(read_num(kPatchOffsetContext));
+        pos.y =
+            positions_.back().y + UnpackSigned(read_num(kPatchOffsetContext));
+      }
+      // A negative offset wraps pos.x/pos.y to a huge value; `> size` rejects it.
+      if (pos.x > xsize || ref_pos.xsize > xsize - pos.x) {
+        return JXL_FAILURE("Invalid patch x: at %" PRIuS " + %" PRIuS
+                           " > %" PRIuS,
+                           pos.x, ref_pos.xsize, xsize);
+      }
+      if (pos.y > ysize || ref_pos.ysize > ysize - pos.y) {
+        return JXL_FAILURE("Invalid patch y: at %" PRIuS " + %" PRIuS
+                           " > %" PRIuS,
+                           pos.y, ref_pos.ysize, ysize);
+      }
+      // num_ec + 1 blendings per patch; entries with j > 0 count as extra.
+      for (size_t j = 0; j < num_ec + 1; j++) {
+        uint32_t blend_mode = read_num(kPatchBlendModeContext);
+        if (blend_mode >= uint32_t(PatchBlendMode::kNumBlendModes)) {
+          return JXL_FAILURE("Invalid patch blend mode: %u", blend_mode);
+        }
+        PatchBlending info;
+        info.mode = static_cast<PatchBlendMode>(blend_mode);
+        if (UsesAlpha(info.mode) ||
+            (info.mode != PatchBlendMode::kNone && j > 0)) {
+          *uses_extra_channels = true;
+        }
+        if (UsesAlpha(info.mode) &&
+            shared_->metadata->m.extra_channel_info.size() > 1) {
+          info.alpha_channel = read_num(kPatchAlphaChannelContext);
+          if (info.alpha_channel >=
+              shared_->metadata->m.extra_channel_info.size()) {
+            return JXL_FAILURE(
+                "Invalid alpha channel for blending: %u out of %u\n",
+                info.alpha_channel,
+                (uint32_t)shared_->metadata->m.extra_channel_info.size());
+          }
+        } else {
+          info.alpha_channel = 0;
+        }
+        // The clamp flag is only present in the stream for clamping modes.
+        info.clamp = UsesClamp(info.mode) && read_num(kPatchClampContext) != 0;
+        blendings_.push_back(info);
+      }
+      positions_.push_back(std::move(pos));
+    }
+    ref_positions_.emplace_back(std::move(ref_pos));
+  }
+  positions_.shrink_to_fit();
+
+  if (!decoder.CheckANSFinalState()) {
+    return JXL_FAILURE("ANS checksum failure.");
+  }
+
+  ComputePatchTree();
+  return true;
+}
+
+int PatchDictionary::GetReferences() const {
+  int mask = 0;  // bit k set <=> some reference position uses frame k
+  for (const PatchReferencePosition& ref_pos : ref_positions_) {
+    mask |= (1 << static_cast<int>(ref_pos.ref));
+  }
+  return mask;
+}
+
+namespace {
+struct PatchInterval {
+  size_t idx;     // index of the patch in positions_
+  size_t y0, y1;  // rows covered by the patch: [y0, y1)
+};
+}  // namespace
+
+void PatchDictionary::ComputePatchTree() {
+  patch_tree_.clear();  // all four lookup structures are rebuilt from scratch
+  num_patches_.clear();
+  sorted_patches_y0_.clear();
+  sorted_patches_y1_.clear();
+  if (positions_.empty()) {
+    return;
+  }
+  // Create a half-open y-interval [y0, y1) for each patch.
+  std::vector<PatchInterval> intervals(positions_.size());
+  for (size_t i = 0; i < positions_.size(); ++i) {
+    const auto& pos = positions_[i];
+    intervals[i].idx = i;
+    intervals[i].y0 = pos.y;
+    intervals[i].y1 = pos.y + ref_positions_[pos.ref_pos_idx].ysize;
+  }
+  auto sort_by_y0 = [&intervals](size_t start, size_t end) {
+    std::sort(intervals.data() + start, intervals.data() + end,
+              [](const PatchInterval& i0, const PatchInterval& i1) {
+                return i0.y0 < i1.y0;
+              });
+  };
+  auto sort_by_y1 = [&intervals](size_t start, size_t end) {
+    std::sort(intervals.data() + start, intervals.data() + end,
+              [](const PatchInterval& i0, const PatchInterval& i1) {
+                return i0.y1 < i1.y1;
+              });
+  };
+  // Count the number of patches for each row.
+  sort_by_y1(0, intervals.size());
+  num_patches_.resize(intervals.back().y1);  // back() has the largest y1
+  for (auto iv : intervals) {
+    for (size_t y = iv.y0; y < iv.y1; ++y) num_patches_[y]++;
+  }
+  PatchTreeNode root;
+  root.start = 0;
+  root.num = intervals.size();
+  patch_tree_.push_back(root);
+  size_t next = 0;  // nodes are processed in the order they were appended
+  while (next < patch_tree_.size()) {
+    auto& node = patch_tree_[next];  // only used before the push_backs below
+    size_t start = node.start;
+    size_t end = node.start + node.num;
+    // Choose the y_center for this node to be the median of interval starts.
+    sort_by_y0(start, end);
+    size_t middle_idx = start + node.num / 2;
+    node.y_center = intervals[middle_idx].y0;
+    // Divide the intervals in [start, end) into three groups:
+    //   * those completely to the right of y_center: [right_start, end)
+    //   * those overlapping y_center: [left_end, right_start)
+    //   * those completely to the left of y_center: [start, left_end)
+    size_t right_start = middle_idx;
+    while (right_start < end && intervals[right_start].y0 == node.y_center) {
+      ++right_start;
+    }
+    sort_by_y1(start, right_start);
+    size_t left_end = right_start;
+    while (left_end > start && intervals[left_end - 1].y1 > node.y_center) {
+      --left_end;
+    }
+    // Fill in sorted_patches_y0_ and sorted_patches_y1_ for the current node.
+    node.num = right_start - left_end;
+    node.start = sorted_patches_y0_.size();
+    for (ssize_t i = static_cast<ssize_t>(right_start) - 1;
+         i >= static_cast<ssize_t>(left_end); --i) {  // descending y1
+      sorted_patches_y1_.push_back({intervals[i].y1, intervals[i].idx});
+    }
+    sort_by_y0(left_end, right_start);
+    for (size_t i = left_end; i < right_start; ++i) {  // ascending y0
+      sorted_patches_y0_.push_back({intervals[i].y0, intervals[i].idx});
+    }
+    // Create the left and right nodes (if not empty).
+    node.left_child = node.right_child = -1;  // -1 marks a missing child
+    if (left_end > start) {
+      PatchTreeNode left;
+      left.start = start;
+      left.num = left_end - left.start;
+      patch_tree_[next].left_child = patch_tree_.size();
+      patch_tree_.push_back(left);
+    }
+    if (right_start < end) {
+      PatchTreeNode right;
+      right.start = right_start;
+      right.num = end - right.start;
+      patch_tree_[next].right_child = patch_tree_.size();
+      patch_tree_.push_back(right);
+    }
+    ++next;
+  }
+}
+
+std::vector<size_t> PatchDictionary::GetPatchesForRow(size_t y) const {
+  std::vector<size_t> result;  // indices into positions_, sorted ascending
+  if (y < num_patches_.size() && num_patches_[y] > 0) {
+    result.reserve(num_patches_[y]);
+    for (ssize_t tree_idx = 0; tree_idx != -1;) {  // -1: no child to visit
+      JXL_DASSERT(tree_idx < (ssize_t)patch_tree_.size());
+      const auto& node = patch_tree_[tree_idx];
+      if (y <= node.y_center) {
+        for (size_t i = 0; i < node.num; ++i) {
+          const auto& p = sorted_patches_y0_[node.start + i];
+          if (y < p.first) break;  // entries are in ascending y0 order
+          result.push_back(p.second);
+        }
+        tree_idx = y < node.y_center ? node.left_child : -1;
+      } else {
+        for (size_t i = 0; i < node.num; ++i) {
+          const auto& p = sorted_patches_y1_[node.start + i];
+          if (y >= p.first) break;  // entries are in descending y1 order
+          result.push_back(p.second);
+        }
+        tree_idx = node.right_child;
+      }
+    }
+    // Ensure that the relative order of patches that affect the same pixels
+    // is preserved. This is important for patches that have a blend mode
+    // different from kAdd.
+    std::sort(result.begin(), result.end());
+  }
+  return result;
+}
+
+// Adds patches to a segment of `xsize` pixels, starting at `inout`, assumed
+// to be located at position (x0, y) in the frame.
+void PatchDictionary::AddOneRow(float* const* inout, size_t y, size_t x0,
+                                size_t xsize) const {
+  const size_t num_ec = shared_->metadata->m.num_extra_channels;
+  const size_t x1 = x0 + xsize;  // one past the last column of the segment
+  // Row pointers into the reference frame: 3 color planes + extra channels.
+  std::vector<const float*> fg_ptrs(3 + num_ec);
+  for (size_t pos_idx : GetPatchesForRow(y)) {
+    const size_t blending_idx = pos_idx * (num_ec + 1);
+    const PatchPosition& pos = positions_[pos_idx];
+    const PatchReferencePosition& ref_pos = ref_positions_[pos.ref_pos_idx];
+    const size_t bx = pos.x;
+    const size_t bx_end = bx + ref_pos.xsize;
+    JXL_DASSERT(y >= pos.y);
+    JXL_DASSERT(y < pos.y + ref_pos.ysize);
+    // Skip patches that share no column with [x0, x1); a patch ending exactly
+    // at x0 (bx_end == x0) does not overlap either.
+    if (bx >= x1 || bx_end <= x0) continue;
+    const size_t ref_y = ref_pos.y0 + (y - pos.y);  // reference-frame row
+    const size_t patch_x0 = std::max(bx, x0);
+    const size_t patch_x1 = std::min(bx_end, x1);
+    const auto& ref_frame = shared_->reference_frames[ref_pos.ref].frame;
+    // The pointers are offset so that index 0 corresponds to frame column x0.
+    for (size_t c = 0; c < 3; c++) {
+      fg_ptrs[c] =
+          ref_frame.color().ConstPlaneRow(c, ref_y) + ref_pos.x0 + x0 - bx;
+    }
+    for (size_t i = 0; i < num_ec; i++) {
+      fg_ptrs[3 + i] =
+          ref_frame.extra_channels()[i].ConstRow(ref_y) + ref_pos.x0 + x0 - bx;
+    }
+    PerformBlending(inout, fg_ptrs.data(), inout, patch_x0 - x0,
+                    patch_x1 - patch_x0, blendings_[blending_idx],
+                    blendings_.data() + blending_idx + 1,
+                    shared_->metadata->m.extra_channel_info);
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h
new file mode 100644
index 0000000000..a950e83e85
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h
@@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_PATCH_DICTIONARY_H_
+#define LIB_JXL_DEC_PATCH_DICTIONARY_H_
+
+// Chooses reference patches, and avoids encoding them once per occurrence.
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <tuple>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+enum class PatchBlendMode : uint8_t {
+  // The new values are the old ones. Useful to skip some channels.
+  kNone = 0,
+  // The new values (in the crop) replace the old ones: sample = new
+  kReplace = 1,
+  // The new values (in the crop) get added to the old ones: sample = old + new
+  kAdd = 2,
+  // The new values (in the crop) get multiplied by the old ones:
+  // sample = old * new
+  // This blend mode is only supported if BlendColorSpace is kEncoded. The
+  // range of the new value matters for multiplication purposes, and its
+  // nominal range of 0..1 is computed the same way as this is done for the
+  // alpha values in kBlend and kAlphaWeightedAdd.
+  kMul = 3,
+  // The new values (in the crop) replace the old ones if alpha>0:
+  // For first alpha channel:
+  // alpha = old + new * (1 - old)
+  // For other channels if !alpha_associated:
+  // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha
+  // For other channels if alpha_associated:
+  // sample = (1 - new_alpha) * old + new
+  // The alpha formula applies to the alpha used for the division in the other
+  // channels formula, and applies to the alpha channel itself if its
+  // blend_channel value matches itself.
+  // If using kBlendAbove, new is the patch and old is the original image; if
+  // using kBlendBelow, the meaning is inverted.
+  kBlendAbove = 4,
+  kBlendBelow = 5,
+  // The new values (in the crop) are added to the old ones if alpha>0:
+  // For first alpha channel: sample = old + new * (1 - old)
+  // For other channels: sample = old + alpha * new
+  kAlphaWeightedAddAbove = 6,
+  kAlphaWeightedAddBelow = 7,
+  kNumBlendModes,
+};
+
+inline bool UsesAlpha(PatchBlendMode mode) {
+  using M = PatchBlendMode;  // true only for the four modes named below
+  const bool blend = mode == M::kBlendAbove || mode == M::kBlendBelow;
+  return blend || mode == M::kAlphaWeightedAddAbove ||
+         mode == M::kAlphaWeightedAddBelow;
+}
+inline bool UsesClamp(PatchBlendMode mode) {
+  return mode == PatchBlendMode::kMul || UsesAlpha(mode);  // kMul + alpha modes
+}
+
+struct PatchBlending {
+  PatchBlendMode mode;
+  uint32_t alpha_channel;  // index into extra_channel_info; 0 if not coded
+  bool clamp;              // false for modes where UsesClamp() is false
+};
+
+// Position and size of the patch in the reference frame.
+struct PatchReferencePosition {
+  size_t ref, x0, y0, xsize, ysize;  // `ref` indexes the reference frames
+};
+
+struct PatchPosition {
+  // Position of top-left corner of the patch in the image.
+  size_t x, y;
+  size_t ref_pos_idx;  // index into PatchDictionary::ref_positions_
+};
+
+struct PassesSharedState;
+
+// Encoder-side helper class to encode the PatchesDictionary.
+class PatchDictionaryEncoder;
+
+class PatchDictionary {
+ public:
+  PatchDictionary() = default;
+
+  void SetPassesSharedState(const PassesSharedState* shared) {
+    shared_ = shared;
+  }
+
+  bool HasAny() const { return !positions_.empty(); }
+
+  Status Decode(BitReader* br, size_t xsize, size_t ysize,
+                bool* uses_extra_channels);
+
+  void Clear() {
+    positions_.clear();  // ref_positions_ and blendings_ are left untouched
+    ComputePatchTree();  // with no positions this empties the lookup tables
+  }
+
+  // Adds patches to a segment of `xsize` pixels, starting at `inout`, assumed
+  // to be located at position (x0, y) in the frame.
+  void AddOneRow(float* const* inout, size_t y, size_t x0, size_t xsize) const;
+
+  // Returns dependencies of this patch dictionary on reference frame ids as a
+  // bit mask: bits 0-3 indicate reference frame 0-3.
+  int GetReferences() const;
+
+  std::vector<size_t> GetPatchesForRow(size_t y) const;
+
+ private:
+  friend class PatchDictionaryEncoder;
+
+  const PassesSharedState* shared_ = nullptr;  // set by SetPassesSharedState()
+  std::vector<PatchPosition> positions_;
+  std::vector<PatchReferencePosition> ref_positions_;
+  std::vector<PatchBlending> blendings_;  // (num_extra_channels + 1) per patch
+
+  // Interval tree on the y coordinates of the patches.
+  struct PatchTreeNode {
+    ssize_t left_child;   // index into patch_tree_, or -1 if absent
+    ssize_t right_child;  // index into patch_tree_, or -1 if absent
+    size_t y_center;
+    // Range of patches in sorted_patches_y0_ and sorted_patches_y1_ that
+    // contain the row y_center.
+    size_t start;
+    size_t num;
+  };
+  std::vector<PatchTreeNode> patch_tree_;
+  // Number of patches for each row.
+  std::vector<size_t> num_patches_;
+  std::vector<std::pair<size_t, size_t>> sorted_patches_y0_;  // (y0, index)
+  std::vector<std::pair<size_t, size_t>> sorted_patches_y1_;  // (y1, index)
+
+  void ComputePatchTree();
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_PATCH_DICTIONARY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h b/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h
new file mode 100644
index 0000000000..26bf643152
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h
@@ -0,0 +1,234 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_DEC_TONE_MAPPING_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#undef LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#else
+#define LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+template <typename D>
+class Rec2408ToneMapper {
+ private:
+  using V = hwy::HWY_NAMESPACE::Vec<D>;  // Vector type for descriptor D.
+
+ public:
+  explicit Rec2408ToneMapper(std::pair<float, float> source_range,
+                             std::pair<float, float> target_range,
+                             const float primaries_luminances[3])
+      : source_range_(source_range),  // .first/.second are read as min/max.
+        target_range_(target_range),
+        red_Y_(primaries_luminances[0]),  // Luminance weights for R, G, B.
+        green_Y_(primaries_luminances[1]),
+        blue_Y_(primaries_luminances[2]) {}
+
+  void ToneMap(V* red, V* green, V* blue) const {  // In-place on all three.
+    const V luminance = Mul(Set(df_, source_range_.second),  // Y * source max.
+                            (MulAdd(Set(df_, red_Y_), *red,
+                                    MulAdd(Set(df_, green_Y_), *green,
+                                           Mul(Set(df_, blue_Y_), *blue)))));
+    const V pq_mastering_min = Set(df_, pq_mastering_min_);
+    const V inv_pq_mastering_range = Set(df_, inv_pq_mastering_range_);
+    const V normalized_pq = Min(  // min(1, (PQ(Y) - pq_min) / pq_range).
+        Set(df_, 1.f),
+        Mul(Sub(InvEOTF(luminance), pq_mastering_min), inv_pq_mastering_range));
+    const V ks = Set(df_, ks_);
+    const V e2 =  // Below the threshold ks: unchanged; otherwise spline P().
+        IfThenElse(Lt(normalized_pq, ks), normalized_pq, P(normalized_pq));
+    const V one_minus_e2 = Sub(Set(df_, 1), e2);
+    const V one_minus_e2_2 = Mul(one_minus_e2, one_minus_e2);
+    const V one_minus_e2_4 = Mul(one_minus_e2_2, one_minus_e2_2);
+    const V b = Set(df_, min_lum_);
+    const V e3 = MulAdd(b, one_minus_e2_4, e2);  // e2 + min_lum_ * (1 - e2)^4.
+    const V pq_mastering_range = Set(df_, pq_mastering_range_);
+    const V e4 = MulAdd(e3, pq_mastering_range, pq_mastering_min);
+    const V new_luminance =  // Limited to [0, target_range_.second].
+        Min(Set(df_, target_range_.second),
+            ZeroIfNegative(
+                Mul(Set(df_, 10000), TF_PQ().DisplayFromEncoded(df_, e4))));
+
+    const V ratio = Div(new_luminance, luminance);
+    const V inv_target_peak = Set(df_, inv_target_peak_);
+    const V normalizer = Set(df_, normalizer_);
+    const V multiplier = Mul(ratio, normalizer);
+    for (V* const val : {red, green, blue}) {  // luminance <= 1e-6: gray.
+      *val = IfThenElse(Le(luminance, Set(df_, 1e-6f)),
+                        Mul(new_luminance, inv_target_peak),
+                        Mul(*val, multiplier));
+    }
+  }
+
+ private:
+  V InvEOTF(const V luminance) const {  // PQ-encodes luminance / 10000.
+    return TF_PQ().EncodedFromDisplay(df_,
+                                      Mul(luminance, Set(df_, 1. / 10000)));
+  }
+  float InvEOTF(const float luminance) const {  // Scalar counterpart.
+    return TF_PQ().EncodedFromDisplay(luminance / 10000.0f);
+  }
+  V T(const V a) const {  // (a - ks) / max(1e-6, 1 - ks).
+    const V ks = Set(df_, ks_);
+    const V inv_one_minus_ks = Set(df_, inv_one_minus_ks_);
+    return Mul(Sub(a, ks), inv_one_minus_ks);
+  }
+  V P(const V b) const {
+    const V t_b = T(b);  // Result, with t = T(b):
+    const V t_b_2 = Mul(t_b, t_b);  //   (2t^3 - 3t^2 + 1) * ks
+    const V t_b_3 = Mul(t_b_2, t_b);  //   + (t^3 - 2t^2 + t) * (1 - ks)
+    const V ks = Set(df_, ks_);  //   + (3t^2 - 2t^3) * max_lum
+    const V max_lum = Set(df_, max_lum_);
+    return MulAdd(
+        MulAdd(Set(df_, 2), t_b_3, MulAdd(Set(df_, -3), t_b_2, Set(df_, 1))),
+        ks,
+        MulAdd(Add(t_b_3, MulAdd(Set(df_, -2), t_b_2, t_b)),
+               Sub(Set(df_, 1), ks),
+               Mul(MulAdd(Set(df_, -2), t_b_3, Mul(Set(df_, 3), t_b_2)),
+                   max_lum)));
+  }
+
+  D df_;  // Members below rely on declaration order for initialization.
+  const std::pair<float, float> source_range_;
+  const std::pair<float, float> target_range_;
+  const float red_Y_;
+  const float green_Y_;
+  const float blue_Y_;
+
+  const float pq_mastering_min_ = InvEOTF(source_range_.first);
+  const float pq_mastering_max_ = InvEOTF(source_range_.second);
+  const float pq_mastering_range_ = pq_mastering_max_ - pq_mastering_min_;
+  const float inv_pq_mastering_range_ = 1.0f / pq_mastering_range_;
+  // TODO(eustas): divide instead of inverse-multiply?
+  const float min_lum_ = (InvEOTF(target_range_.first) - pq_mastering_min_) *
+                         inv_pq_mastering_range_;
+  // TODO(eustas): divide instead of inverse-multiply?
+  const float max_lum_ = (InvEOTF(target_range_.second) - pq_mastering_min_) *
+                         inv_pq_mastering_range_;
+  const float ks_ = 1.5f * max_lum_ - 0.5f;  // Threshold used by ToneMap/T/P.
+  const float b_ = min_lum_;  // Not read by any method of this class.
+
+  const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_);
+
+  const float normalizer_ = source_range_.second / target_range_.second;
+  const float inv_target_peak_ = 1.f / target_range_.second;
+};
+
+class HlgOOTF {  // Scales RGB by min(luminance^(gamma - 1), 1e9).
+ public:
+  explicit HlgOOTF(float source_luminance, float target_luminance,
+                   const float primaries_luminances[3])
+      : HlgOOTF(/*gamma=*/std::pow(
+                    1.111f, std::log2(target_luminance / source_luminance)),
+                primaries_luminances) {}
+
+  static HlgOOTF FromSceneLight(float display_luminance,
+                                const float primaries_luminances[3]) {
+    const float gamma =
+        1.2f * std::pow(1.111f, std::log2(display_luminance / 1000.f));
+    return HlgOOTF(gamma, primaries_luminances);
+  }
+
+  static HlgOOTF ToSceneLight(float display_luminance,
+                              const float primaries_luminances[3]) {
+    // Algebraically the reciprocal of the gamma used by FromSceneLight().
+    const float inverse_gamma =
+        (1 / 1.2f) * std::pow(1.111f, -std::log2(display_luminance / 1000.f));
+    return HlgOOTF(inverse_gamma, primaries_luminances);
+  }
+
+  template <typename V>
+  void Apply(V* red, V* green, V* blue) const {
+    if (!apply_ootf_) return;  // Exponent within +-0.01 of zero: no-op.
+    hwy::HWY_NAMESPACE::DFromV<V> df;
+    const V blue_term = Mul(Set(df, blue_Y_), *blue);
+    const V luminance = MulAdd(Set(df, red_Y_), *red,
+                               MulAdd(Set(df, green_Y_), *green, blue_term));
+    const V gain =
+        Min(FastPowf(df, luminance, Set(df, exponent_)), Set(df, 1e9));
+    for (V* const channel : {red, green, blue}) {
+      *channel = Mul(*channel, gain);
+    }
+  }
+
+  bool WarrantsGamutMapping() const { return apply_ootf_ && exponent_ < 0; }
+
+ private:
+  explicit HlgOOTF(float gamma, const float luminances[3])
+      : exponent_(gamma - 1),
+        red_Y_(luminances[0]),
+        green_Y_(luminances[1]),
+        blue_Y_(luminances[2]) {}
+  const float exponent_;  // gamma - 1.
+  const bool apply_ootf_ = exponent_ < -0.01f || 0.01f < exponent_;
+  const float red_Y_;  // Luminance weights of the three primaries.
+  const float green_Y_;
+  const float blue_Y_;
+};
+
+template <typename V>
+void GamutMap(V* red, V* green, V* blue, const float primaries_luminances[3],
+              float preserve_saturation = 0.1f) {
+  hwy::HWY_NAMESPACE::DFromV<V> df;
+  const V blue_term = Mul(Set(df, primaries_luminances[2]), *blue);
+  const V gray =
+      MulAdd(Set(df, primaries_luminances[0]), *red,
+             MulAdd(Set(df, primaries_luminances[1]), *green, blue_term));
+
+  // Out-of-gamut pixels are desaturated by blending every channel towards
+  // `gray` (the pixel's luminance) until no component is negative. Two blend
+  // amounts are tracked per lane:
+  // - mix_sat (saturation preserving): only removes negative components; a
+  //   component still above 1 is handled by the final normalization, which
+  //   lowers the luminance.
+  // - mix_lum (luminance preserving): blends in yet more gray so that all
+  //   components also end up below 1, at the cost of extra desaturation.
+  V mix_sat = Zero(df);
+  V mix_lum = Zero(df);
+  for (const V* channel : {red, green, blue}) {
+    const V& c = *channel;
+    const V inv_c_minus_gray = Div(Set(df, 1), (Sub(c, gray)));
+    mix_sat = IfThenElse(Ge(c, gray), mix_sat,
+                         Max(mix_sat, Mul(c, inv_c_minus_gray)));
+    mix_lum = Max(mix_lum,
+                  IfThenElse(Le(c, gray), mix_sat,
+                             Mul(Sub(c, Set(df, 1)), inv_c_minus_gray)));
+  }
+  // Interpolate between the two amounts, then restrict the result to [0, 1].
+  const V blended_mix =
+      MulAdd(Set(df, preserve_saturation), Sub(mix_sat, mix_lum), mix_lum);
+  const V gray_mix = Clamp(blended_mix, Zero(df), Set(df, 1));
+  // val += gray_mix * (gray - val).
+  for (V* const val : {red, green, blue}) {
+    *val = MulAdd(gray_mix, Sub(gray, *val), *val);
+  }
+  // Divide by the largest component whenever it exceeds 1.
+  const V peak = Max(Set(df, 1), Max(*red, Max(*green, *blue)));
+  const V normalizer = Div(Set(df, 1), peak);
+  for (V* const val : {red, green, blue}) {
+    *val = Mul(*val, normalizer);
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DEC_TONE_MAPPING_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h b/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h
new file mode 100644
index 0000000000..075619b3b9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h
@@ -0,0 +1,853 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_DEC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DEC_TRANSFORMS_INL_H_
+#undef LIB_JXL_DEC_TRANSFORMS_INL_H_
+#else
+#define LIB_JXL_DEC_TRANSFORMS_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/dct_scales.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::MulAdd;
+
+// Computes the lowest-frequency LF_ROWSxLF_COLS-sized square in output, which
+// is a DCT_ROWS*DCT_COLS-sized DCT block, by doing a ROWS*COLS DCT on the
+// input block.
+template <size_t DCT_ROWS, size_t DCT_COLS, size_t LF_ROWS, size_t LF_COLS,
+          size_t ROWS, size_t COLS>
+JXL_INLINE void ReinterpretingDCT(const float* input, const size_t input_stride,
+                                  float* output, const size_t output_stride) {
+  static_assert(LF_ROWS == ROWS,
+                "ReinterpretingDCT should only be called with LF == N");
+  static_assert(LF_COLS == COLS,
+                "ReinterpretingDCT should only be called with LF == N");
+  // ROWS, COLS <= 8, so both work buffers can live on the stack.
+  HWY_ALIGN float coef[ROWS * COLS];
+  HWY_ALIGN float scratch[ROWS * COLS];
+
+  ComputeScaledDCT<ROWS, COLS>()(DCTFrom(input, input_stride), coef, scratch);
+  if (ROWS < COLS) {
+    for (size_t y = 0; y < LF_ROWS; y++) {
+      const auto sy = DCTTotalResampleScale<ROWS, DCT_ROWS>(y);
+      for (size_t x = 0; x < LF_COLS; x++) {
+        output[y * output_stride + x] =
+            coef[y * COLS + x] * sy * DCTTotalResampleScale<COLS, DCT_COLS>(x);
+      }
+    }
+  } else {
+    // Same rescaling with the row and column roles exchanged.
+    for (size_t y = 0; y < LF_COLS; y++) {
+      const auto sy = DCTTotalResampleScale<COLS, DCT_COLS>(y);
+      for (size_t x = 0; x < LF_ROWS; x++) {
+        output[y * output_stride + x] =
+            coef[y * ROWS + x] * sy * DCTTotalResampleScale<ROWS, DCT_ROWS>(x);
+      }
+    }
+  }
+}
+
+template <size_t S>
+void IDCT2TopBlock(const float* block, size_t stride_out, float* out) {
+  static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim");
+  static_assert(S % 2 == 0, "S should be even");
+  float staged[kDCTBlockSize];  // Results are staged: `out` may alias `block`.
+  constexpr size_t kHalf = S / 2;
+  for (size_t by = 0; by < kHalf; by++) {
+    const float* top = block + by * kBlockDim;
+    const float* bottom = block + (by + kHalf) * kBlockDim;
+    float* even_row = staged + 2 * by * kBlockDim;
+    float* odd_row = even_row + kBlockDim;
+    for (size_t bx = 0; bx < kHalf; bx++) {
+      const float c00 = top[bx];
+      const float c01 = top[kHalf + bx];
+      const float c10 = bottom[bx];
+      const float c11 = bottom[kHalf + bx];
+      even_row[2 * bx] = c00 + c01 + c10 + c11;
+      even_row[2 * bx + 1] = c00 + c01 - c10 - c11;
+      odd_row[2 * bx] = c00 - c01 + c10 - c11;
+      odd_row[2 * bx + 1] = c00 - c01 - c10 + c11;
+    }
+  }
+  for (size_t y = 0; y < S; y++) {
+    for (size_t x = 0; x < S; x++) {
+      out[y * stride_out + x] = staged[y * kBlockDim + x];
+    }
+  }
+}
+
+void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) {
+  HWY_ALIGN static constexpr float k4x4AFVBasis[16][16] = {  // [coeff][pixel]
+      {  // Row 0: every entry is 0.25.
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+      },
+      {
+          0.876902929799142f,
+          0.2206518106944235f,
+          -0.10140050393753763f,
+          -0.1014005039375375f,
+          0.2206518106944236f,
+          -0.10140050393753777f,
+          -0.10140050393753772f,
+          -0.10140050393753763f,
+          -0.10140050393753758f,
+          -0.10140050393753769f,
+          -0.1014005039375375f,
+          -0.10140050393753768f,
+          -0.10140050393753768f,
+          -0.10140050393753759f,
+          -0.10140050393753763f,
+          -0.10140050393753741f,
+      },
+      {
+          0.0,
+          0.0,
+          0.40670075830260755f,
+          0.44444816619734445f,
+          0.0,
+          0.0,
+          0.19574399372042936f,
+          0.2929100136981264f,
+          -0.40670075830260716f,
+          -0.19574399372042872f,
+          0.0,
+          0.11379074460448091f,
+          -0.44444816619734384f,
+          -0.29291001369812636f,
+          -0.1137907446044814f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          -0.21255748058288748f,
+          0.3085497062849767f,
+          0.0,
+          0.4706702258572536f,
+          -0.1621205195722993f,
+          0.0,
+          -0.21255748058287047f,
+          -0.16212051957228327f,
+          -0.47067022585725277f,
+          -0.1464291867126764f,
+          0.3085497062849487f,
+          0.0,
+          -0.14642918671266536f,
+          0.4251149611657548f,
+      },
+      {
+          0.0,
+          -0.7071067811865474f,
+          0.0,
+          0.0,
+          0.7071067811865476f,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+      },
+      {
+          -0.4105377591765233f,
+          0.6235485373547691f,
+          -0.06435071657946274f,
+          -0.06435071657946266f,
+          0.6235485373547694f,
+          -0.06435071657946284f,
+          -0.0643507165794628f,
+          -0.06435071657946274f,
+          -0.06435071657946272f,
+          -0.06435071657946279f,
+          -0.06435071657946266f,
+          -0.06435071657946277f,
+          -0.06435071657946277f,
+          -0.06435071657946273f,
+          -0.06435071657946274f,
+          -0.0643507165794626f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.4517556589999482f,
+          0.15854503551840063f,
+          0.0,
+          -0.04038515160822202f,
+          0.0074182263792423875f,
+          0.39351034269210167f,
+          -0.45175565899994635f,
+          0.007418226379244351f,
+          0.1107416575309343f,
+          0.08298163094882051f,
+          0.15854503551839705f,
+          0.3935103426921022f,
+          0.0829816309488214f,
+          -0.45175565899994796f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.304684750724869f,
+          0.5112616136591823f,
+          0.0,
+          0.0,
+          -0.290480129728998f,
+          -0.06578701549142804f,
+          0.304684750724884f,
+          0.2904801297290076f,
+          0.0,
+          -0.23889773523344604f,
+          -0.5112616136592012f,
+          0.06578701549142545f,
+          0.23889773523345467f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          0.3017929516615495f,
+          0.25792362796341184f,
+          0.0,
+          0.16272340142866204f,
+          0.09520022653475037f,
+          0.0,
+          0.3017929516615503f,
+          0.09520022653475055f,
+          -0.16272340142866173f,
+          -0.35312385449816297f,
+          0.25792362796341295f,
+          0.0,
+          -0.3531238544981624f,
+          -0.6035859033230976f,
+      },
+      {
+          0.0,
+          0.0,
+          0.40824829046386274f,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          -0.4082482904638628f,
+          -0.4082482904638635f,
+          0.0,
+          0.0,
+          -0.40824829046386296f,
+          0.0,
+          0.4082482904638634f,
+          0.408248290463863f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          0.1747866975480809f,
+          0.0812611176717539f,
+          0.0,
+          0.0,
+          -0.3675398009862027f,
+          -0.307882213957909f,
+          -0.17478669754808135f,
+          0.3675398009862011f,
+          0.0,
+          0.4826689115059883f,
+          -0.08126111767175039f,
+          0.30788221395790305f,
+          -0.48266891150598584f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          -0.21105601049335784f,
+          0.18567180916109802f,
+          0.0,
+          0.0,
+          0.49215859013738733f,
+          -0.38525013709251915f,
+          0.21105601049335806f,
+          -0.49215859013738905f,
+          0.0,
+          0.17419412659916217f,
+          -0.18567180916109904f,
+          0.3852501370925211f,
+          -0.1741941265991621f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          -0.14266084808807264f,
+          -0.3416446842253372f,
+          0.0,
+          0.7367497537172237f,
+          0.24627107722075148f,
+          -0.08574019035519306f,
+          -0.14266084808807344f,
+          0.24627107722075137f,
+          0.14883399227113567f,
+          -0.04768680350229251f,
+          -0.3416446842253373f,
+          -0.08574019035519267f,
+          -0.047686803502292804f,
+          -0.14266084808807242f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.13813540350758585f,
+          0.3302282550303788f,
+          0.0,
+          0.08755115000587084f,
+          -0.07946706605909573f,
+          -0.4613374887461511f,
+          -0.13813540350758294f,
+          -0.07946706605910261f,
+          0.49724647109535086f,
+          0.12538059448563663f,
+          0.3302282550303805f,
+          -0.4613374887461554f,
+          0.12538059448564315f,
+          -0.13813540350758452f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.17437602599651067f,
+          0.0702790691196284f,
+          0.0,
+          -0.2921026642334881f,
+          0.3623817333531167f,
+          0.0,
+          -0.1743760259965108f,
+          0.36238173335311646f,
+          0.29210266423348785f,
+          -0.4326608024727445f,
+          0.07027906911962818f,
+          0.0,
+          -0.4326608024727457f,
+          0.34875205199302267f,
+      },
+      {
+          0.0,
+          0.0,
+          0.11354987314994337f,
+          -0.07417504595810355f,
+          0.0,
+          0.19402893032594343f,
+          -0.435190496523228f,
+          0.21918684838857466f,
+          0.11354987314994257f,
+          -0.4351904965232251f,
+          0.5550443808910661f,
+          -0.25468277124066463f,
+          -0.07417504595810233f,
+          0.2191868483885728f,
+          -0.25468277124066413f,
+          0.1135498731499429f,
+      },
+  };
+
+  const HWY_CAPPED(float, 16) d;  // Float vector capped at 16 lanes.
+  for (size_t i = 0; i < 16; i += Lanes(d)) {  // Lanes(d) outputs per pass.
+    auto pixel = Zero(d);  // Accumulates sum_j coeffs[j] * basis[j][i...].
+    for (size_t j = 0; j < 16; j++) {
+      auto cf = Set(d, coeffs[j]);  // Coefficient j in every lane.
+      auto basis = Load(d, k4x4AFVBasis[j] + i);
+      pixel = MulAdd(cf, basis, pixel);
+    }
+    Store(pixel, d, pixels + i);  // Writes pixels[i .. i + Lanes(d) - 1].
+  }
+}
+
+template <size_t afv_kind>
+void AFVTransformToPixels(const float* JXL_RESTRICT coefficients,
+                          float* JXL_RESTRICT pixels, size_t pixels_stride) {
+  // (corner_x, corner_y) selects which 4x4 quadrant receives the AFV part.
+  const size_t corner_x = afv_kind & 1;
+  const size_t corner_y = afv_kind / 2;
+  // Un-mix the three DC terms stored at positions 0, 1 and 8.
+  const float c00 = coefficients[0];
+  const float c01 = coefficients[1];
+  const float c10 = coefficients[8];
+  const float dc_afv = (c00 + c10 + c01) * 4.0f;
+  const float dc_4x4 = (c00 + c10 - c01);
+  const float dc_4x8 = c00 - c10;
+  HWY_ALIGN float scratch_space[4 * 8];
+  HWY_ALIGN float coeff[4 * 4];
+  HWY_ALIGN float block[4 * 8];
+  // IAFV: coefficients taken from even rows and even columns.
+  for (size_t iy = 0; iy < 4; iy++) {
+    const float* src_row = coefficients + iy * 2 * 8;
+    for (size_t ix = 0; ix < 4; ix++) {
+      coeff[iy * 4 + ix] = src_row[ix * 2];
+    }
+  }
+  coeff[0] = dc_afv;  // Position 0 carries the un-mixed DC instead.
+  AFVIDCT4x4(coeff, block);
+  for (size_t iy = 0; iy < 4; iy++) {
+    // The 4x4 result is mirrored along each axis whose corner index is 1.
+    const size_t src_y = (corner_y == 1) ? 3 - iy : iy;
+    float* dst = pixels + (iy + corner_y * 4) * pixels_stride + corner_x * 4;
+    for (size_t ix = 0; ix < 4; ix++) {
+      dst[ix] = block[src_y * 4 + ((corner_x == 1) ? 3 - ix : ix)];
+    }
+  }
+
+  // IDCT4x4: coefficients taken from even rows and odd columns.
+  for (size_t iy = 0; iy < 4; iy++) {
+    const float* src_row = coefficients + iy * 2 * 8 + 1;
+    for (size_t ix = 0; ix < 4; ix++) {
+      block[iy * 4 + ix] = src_row[ix * 2];
+    }
+  }
+  block[0] = dc_4x4;
+  float* const dst_4x4 =
+      pixels + corner_y * 4 * pixels_stride + (corner_x == 1 ? 0 : 4);
+  ComputeScaledIDCT<4, 4>()(block, DCTTo(dst_4x4, pixels_stride),
+                            scratch_space);
+
+  // IDCT4x8: coefficients taken from the odd rows.
+  for (size_t i = 0; i < 4 * 8; i++) {
+    block[i] = coefficients[(1 + (i / 8) * 2) * 8 + (i % 8)];
+  }
+  block[0] = dc_4x8;
+  float* const dst_4x8 = pixels + (corner_y == 1 ? 0 : 4) * pixels_stride;
+  ComputeScaledIDCT<4, 8>()(block, DCTTo(dst_4x8, pixels_stride),
+                            scratch_space);
+}
+
+HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
+                                        float* JXL_RESTRICT coefficients,
+                                        float* JXL_RESTRICT pixels,
+                                        size_t pixels_stride,
+                                        float* scratch_space) {
+  using Type = AcStrategy::Type;
+  switch (strategy) {  // Each case writes the inverse transform to `pixels`.
+    case Type::IDENTITY: {
+      PROFILER_ZONE("IDCT Identity");
+      float dcs[4] = {};  // One value per 4x4 quadrant, from coeffs 0, 1, 8, 9.
+      float block00 = coefficients[0];
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      dcs[0] = block00 + block01 + block10 + block11;
+      dcs[1] = block00 + block01 - block10 - block11;
+      dcs[2] = block00 - block01 + block10 - block11;
+      dcs[3] = block00 - block01 - block10 + block11;
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          float block_dc = dcs[y * 2 + x];
+          float residual_sum = 0;  // Sum of the quadrant's 15 other coeffs.
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 0 && iy == 0) continue;
+              residual_sum += coefficients[(y + iy * 2) * 8 + x + ix * 2];
+            }
+          }
+          pixels[(4 * y + 1) * pixels_stride + 4 * x + 1] =  // Pixel (1, 1).
+              block_dc - residual_sum * (1.0f / 16);
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 1 && iy == 1) continue;  // Already written above.
+              pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] =
+                  coefficients[(y + iy * 2) * 8 + x + ix * 2] +
+                  pixels[(4 * y + 1) * pixels_stride + 4 * x + 1];
+            }
+          }
+          pixels[y * 4 * pixels_stride + x * 4] =  // Redo (0, 0): (1, 1) slot.
+              coefficients[(y + 2) * 8 + x + 2] +
+              pixels[(4 * y + 1) * pixels_stride + 4 * x + 1];
+        }
+      }
+      break;
+    }
+    case Type::DCT8X4: {
+      PROFILER_ZONE("IDCT 8x4");
+      float dcs[2] = {};  // Sum and difference of coefficients 0 and 8.
+      float block0 = coefficients[0];
+      float block1 = coefficients[8];
+      dcs[0] = block0 + block1;
+      dcs[1] = block0 - block1;
+      for (size_t x = 0; x < 2; x++) {  // Output starts at column x * 4.
+        HWY_ALIGN float block[4 * 8];
+        block[0] = dcs[x];
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            if (ix == 0 && iy == 0) continue;
+            block[iy * 8 + ix] = coefficients[(x + iy * 2) * 8 + ix];
+          }
+        }
+        ComputeScaledIDCT<8, 4>()(block, DCTTo(pixels + x * 4, pixels_stride),
+                                  scratch_space);
+      }
+      break;
+    }
+    case Type::DCT4X8: {
+      PROFILER_ZONE("IDCT 4x8");
+      float dcs[2] = {};  // Sum and difference of coefficients 0 and 8.
+      float block0 = coefficients[0];
+      float block1 = coefficients[8];
+      dcs[0] = block0 + block1;
+      dcs[1] = block0 - block1;
+      for (size_t y = 0; y < 2; y++) {  // Output starts at row y * 4.
+        HWY_ALIGN float block[4 * 8];
+        block[0] = dcs[y];
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            if (ix == 0 && iy == 0) continue;
+            block[iy * 8 + ix] = coefficients[(y + iy * 2) * 8 + ix];
+          }
+        }
+        ComputeScaledIDCT<4, 8>()(
+            block, DCTTo(pixels + y * 4 * pixels_stride, pixels_stride),
+            scratch_space);
+      }
+      break;
+    }
+    case Type::DCT4X4: {
+      PROFILER_ZONE("IDCT 4");
+      float dcs[4] = {};  // One DC per 4x4 quadrant, from coeffs 0, 1, 8, 9.
+      float block00 = coefficients[0];
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      dcs[0] = block00 + block01 + block10 + block11;
+      dcs[1] = block00 + block01 - block10 - block11;
+      dcs[2] = block00 - block01 + block10 - block11;
+      dcs[3] = block00 - block01 - block10 + block11;
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          HWY_ALIGN float block[4 * 4];
+          block[0] = dcs[y * 2 + x];
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 0 && iy == 0) continue;
+              block[iy * 4 + ix] = coefficients[(y + iy * 2) * 8 + x + ix * 2];
+            }
+          }
+          ComputeScaledIDCT<4, 4>()(
+              block,
+              DCTTo(pixels + y * 4 * pixels_stride + x * 4, pixels_stride),
+              scratch_space);
+        }
+      }
+      break;
+    }
+    case Type::DCT2X2: {
+      PROFILER_ZONE("IDCT 2");
+      HWY_ALIGN float coeffs[kDCTBlockSize];  // Local copy; input is not written.
+      memcpy(coeffs, coefficients, sizeof(float) * kDCTBlockSize);
+      IDCT2TopBlock<2>(coeffs, kBlockDim, coeffs);  // In place, S = 2, 4, 8.
+      IDCT2TopBlock<4>(coeffs, kBlockDim, coeffs);
+      IDCT2TopBlock<8>(coeffs, kBlockDim, coeffs);
+      for (size_t y = 0; y < kBlockDim; y++) {
+        for (size_t x = 0; x < kBlockDim; x++) {
+          pixels[y * pixels_stride + x] = coeffs[y * kBlockDim + x];
+        }
+      }
+      break;
+    }
+    case Type::DCT16X16: {  // From here: plain ComputeScaledIDCT<R, C> calls.
+      PROFILER_ZONE("IDCT 16");
+      ComputeScaledIDCT<16, 16>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT16X8: {
+      PROFILER_ZONE("IDCT 16x8");
+      ComputeScaledIDCT<16, 8>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT8X16: {
+      PROFILER_ZONE("IDCT 8x16");
+      ComputeScaledIDCT<8, 16>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X8: {
+      PROFILER_ZONE("IDCT 32x8");
+      ComputeScaledIDCT<32, 8>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT8X32: {
+      PROFILER_ZONE("IDCT 8x32");
+      ComputeScaledIDCT<8, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X16: {
+      PROFILER_ZONE("IDCT 32x16");
+      ComputeScaledIDCT<32, 16>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT16X32: {
+      PROFILER_ZONE("IDCT 16x32");
+      ComputeScaledIDCT<16, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT32X32: {
+      PROFILER_ZONE("IDCT 32");
+      ComputeScaledIDCT<32, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT: {
+      PROFILER_ZONE("IDCT 8");
+      ComputeScaledIDCT<8, 8>()(coefficients, DCTTo(pixels, pixels_stride),
+                                scratch_space);
+      break;
+    }
+    case Type::AFV0: {  // AFV cases do not pass `scratch_space` along.
+      PROFILER_ZONE("IAFV0");
+      AFVTransformToPixels<0>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::AFV1: {
+      PROFILER_ZONE("IAFV1");
+      AFVTransformToPixels<1>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::AFV2: {
+      PROFILER_ZONE("IAFV2");
+      AFVTransformToPixels<2>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::AFV3: {
+      PROFILER_ZONE("IAFV3");
+      AFVTransformToPixels<3>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::DCT64X32: {
+      PROFILER_ZONE("IDCT 64x32");
+      ComputeScaledIDCT<64, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT32X64: {
+      PROFILER_ZONE("IDCT 32x64");
+      ComputeScaledIDCT<32, 64>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT64X64: {
+      PROFILER_ZONE("IDCT 64");
+      ComputeScaledIDCT<64, 64>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT128X64: {
+      PROFILER_ZONE("IDCT 128x64");
+      ComputeScaledIDCT<128, 64>()(coefficients, DCTTo(pixels, pixels_stride),
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT64X128: {
+      PROFILER_ZONE("IDCT 64x128");
+      ComputeScaledIDCT<64, 128>()(coefficients, DCTTo(pixels, pixels_stride),
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT128X128: {
+      PROFILER_ZONE("IDCT 128");
+      ComputeScaledIDCT<128, 128>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::DCT256X128: {
+      PROFILER_ZONE("IDCT 256x128");
+      ComputeScaledIDCT<256, 128>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::DCT128X256: {
+      PROFILER_ZONE("IDCT 128x256");
+      ComputeScaledIDCT<128, 256>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::DCT256X256: {
+      PROFILER_ZONE("IDCT 256");
+      ComputeScaledIDCT<256, 256>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::kNumValidStrategies:  // Count sentinel, not a real strategy.
+      JXL_ABORT("Invalid strategy");
+  }
+}
+
+HWY_MAYBE_UNUSED void LowestFrequenciesFromDC(const AcStrategy::Type strategy,
+                                              const float* dc, size_t dc_stride,
+                                              float* llf) {
+  using Type = AcStrategy::Type;
+  switch (strategy) {
+    case Type::DCT16X8: {
+      ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                        /*LF_ROWS=*/2, /*LF_COLS=*/1, /*ROWS=*/2, /*COLS=*/1>(
+          dc, dc_stride, llf, 2 * kBlockDim);
+      break;
+    }
+    case Type::DCT8X16: {
+      ReinterpretingDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                        /*LF_ROWS=*/1, /*LF_COLS=*/2, /*ROWS=*/1, /*COLS=*/2>(
+          dc, dc_stride, llf, 2 * kBlockDim);
+      break;
+    }
+    case Type::DCT16X16: {
+      ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                        /*LF_ROWS=*/2, /*LF_COLS=*/2, /*ROWS=*/2, /*COLS=*/2>(
+          dc, dc_stride, llf, 2 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X8: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/1, /*ROWS=*/4, /*COLS=*/1>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT8X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/1, /*LF_COLS=*/4, /*ROWS=*/1, /*COLS=*/4>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X16: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/2, /*ROWS=*/4, /*COLS=*/2>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT16X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/2, /*LF_COLS=*/4, /*ROWS=*/2, /*COLS=*/4>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/4, /*ROWS=*/4, /*COLS=*/4>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT64X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/8, /*LF_COLS=*/4, /*ROWS=*/8, /*COLS=*/4>(
+          dc, dc_stride, llf, 8 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X64: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/8, /*ROWS=*/4, /*COLS=*/8>(
+          dc, dc_stride, llf, 8 * kBlockDim);
+      break;
+    }
+    case Type::DCT64X64: {
+      ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                        /*LF_ROWS=*/8, /*LF_COLS=*/8, /*ROWS=*/8, /*COLS=*/8>(
+          dc, dc_stride, llf, 8 * kBlockDim);
+      break;
+    }
+    case Type::DCT128X64: {
+      ReinterpretingDCT</*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                        /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>(
+          dc, dc_stride, llf, 16 * kBlockDim);
+      break;
+    }
+    case Type::DCT64X128: {
+      ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+                        /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>(
+          dc, dc_stride, llf, 16 * kBlockDim);
+      break;
+    }
+    case Type::DCT128X128: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>(
+          dc, dc_stride, llf, 16 * kBlockDim);
+      break;
+    }
+    case Type::DCT256X128: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>(
+          dc, dc_stride, llf, 32 * kBlockDim);
+      break;
+    }
+    case Type::DCT128X256: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>(
+          dc, dc_stride, llf, 32 * kBlockDim);
+      break;
+    }
+    case Type::DCT256X256: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>(
+          dc, dc_stride, llf, 32 * kBlockDim);
+      break;
+    }
+    case Type::DCT:
+    case Type::DCT2X2:
+    case Type::DCT4X4:
+    case Type::DCT4X8:
+    case Type::DCT8X4:
+    case Type::AFV0:
+    case Type::AFV1:
+    case Type::AFV2:
+    case Type::AFV3:
+    case Type::IDENTITY:
+      llf[0] = dc[0];  // No transform: the DC sample is passed through.
+      break;
+    case Type::kNumValidStrategies:
+      JXL_ABORT("Invalid strategy");  // Count sentinel, not a strategy.
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DEC_TRANSFORMS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc
new file mode 100644
index 0000000000..9ee80c59dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_transforms_testonly.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_transforms_testonly.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_transforms-inl.h"
+
+namespace jxl {
+
+#if HWY_ONCE
+HWY_EXPORT(TransformToPixels);  // Per-target table for the dispatch below.
+void TransformToPixels(AcStrategy::Type strategy,
+                       float* JXL_RESTRICT coefficients,
+                       float* JXL_RESTRICT pixels, size_t pixels_stride,
+                       float* JXL_RESTRICT scratch_space) {
+  return HWY_DYNAMIC_DISPATCH(TransformToPixels)(strategy, coefficients, pixels,
+                                                 pixels_stride, scratch_space);
+}
+
+HWY_EXPORT(LowestFrequenciesFromDC);
+void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy,
+                             const float* dc, size_t dc_stride, float* llf) {
+  // Forward to the implementation compiled for the chosen target.
+  HWY_DYNAMIC_DISPATCH(LowestFrequenciesFromDC)(strategy, dc, dc_stride, llf);
+}
+
+HWY_EXPORT(AFVIDCT4x4);  // Table of per-target AFVIDCT4x4 implementations.
+void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) {
+  HWY_DYNAMIC_DISPATCH(AFVIDCT4x4)(coeffs, pixels);
+}
+#endif  // HWY_ONCE
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h
new file mode 100644
index 0000000000..97c4ca543d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_
+#define LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_
+
+// Facade for (non-inlined) inverse integral transforms.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+void TransformToPixels(AcStrategy::Type strategy,
+                       float* JXL_RESTRICT coefficients,
+                       float* JXL_RESTRICT pixels, size_t pixels_stride,
+                       float* JXL_RESTRICT scratch_space);
+
+// Equivalent of the above for DC image.
+void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy,
+                             const float* dc, size_t dc_stride, float* llf);
+
+void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h b/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h
new file mode 100644
index 0000000000..a4f24cd123
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h
@@ -0,0 +1,346 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// XYB -> linear sRGB helper function.
+
+#if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DEC_XYB_INL_H_
+#undef LIB_JXL_DEC_XYB_INL_H_
+#else
+#define LIB_JXL_DEC_XYB_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Broadcast;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including
+// the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB)
+// gamut values may be in-gamut after transforming to a wider space.
+// "opsin_params.inverse_opsin_matrix" holds the 3x3 entries of the (row-major)
+// opsin absorbance matrix inverse, each broadcast to 4 floats (LoadDup128).
+// Pre-multiplying its entries by c is equivalent to multiplying linear_* by c.
+template <class D, class V>
+HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y,
+                                          const V opsin_b,
+                                          const OpsinParams& opsin_params,
+                                          V* const HWY_RESTRICT linear_r,
+                                          V* const HWY_RESTRICT linear_g,
+                                          V* const HWY_RESTRICT linear_b) {
+#if HWY_TARGET == HWY_SCALAR
+  const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]);
+  const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]);
+  const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]);
+#else
+  const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases);
+  const auto neg_bias_r = Broadcast<0>(neg_bias_rgb);
+  const auto neg_bias_g = Broadcast<1>(neg_bias_rgb);
+  const auto neg_bias_b = Broadcast<2>(neg_bias_rgb);
+#endif
+
+  // Color space: XYB -> RGB (gamma_r = Y + X, gamma_g = Y - X, gamma_b = B).
+  auto gamma_r = Add(opsin_y, opsin_x);
+  auto gamma_g = Sub(opsin_y, opsin_x);
+  auto gamma_b = opsin_b;
+
+  gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0]));
+  gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1]));
+  gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2]));
+
+  // Undo gamma compression: mixed_* = gamma_*^3 + neg_bias_* (one MulAdd each).
+  const auto gamma_r2 = Mul(gamma_r, gamma_r);
+  const auto gamma_g2 = Mul(gamma_g, gamma_g);
+  const auto gamma_b2 = Mul(gamma_b, gamma_b);
+  const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r);
+  const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g);
+  const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b);
+
+  const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix;
+
+  // Unmix (multiply by 3x3 inverse_matrix; entry k starts at index 4 * k).
+  // TODO(eustas): ref would be more readable than pointer
+  *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r);
+  *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r);
+  *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r);
+  *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r);
+  *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g);
+  *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b);
+  *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r);
+  *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g);
+  *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b);
+}
+
+// Reports whether this target provides the fixed-point FastXYBTosRGB8() path.
+// Only the NEON target does; every other target returns false.
+static inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() {
+  // Both macros are compile-time constants for the target being compiled, so
+  // the comparison folds to a literal.
+  return HWY_TARGET == HWY_NEON;
+}
+
+static inline HWY_MAYBE_UNUSED void FastXYBTosRGB8(const float* input[4],
+                                                   uint8_t* output,
+                                                   bool is_rgba, size_t xsize) {
+  // This function is very NEON-specific. As such, it uses intrinsics directly.
+#if HWY_TARGET == HWY_NEON
+  // WARNING: doing fixed point arithmetic correctly is very complicated.
+  // Changes to this function should be thoroughly tested.
+
+  // Note that the input is assumed to have 13 bits of mantissa, and the output
+  // will have 14 bits.
+  auto srgb_tf = [&](int16x8_t v16) {
+    int16x8_t clz = vclzq_s16(v16);
+    // Convert to [0.25, 0.5) range.
+    int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2)));
+
+    // third degree polynomial approximation between 0.25 and 0.5
+    // of 1.055/2^(7/2.4) * x^(1/2.4) / 32.
+    // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29
+    // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72
+    // overflow our fixed point representation.
+
+    int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16);
+
+    // 0.47 * x
+    int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706);
+    // - 0.87
+    int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546));
+    // * x
+    int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16);
+    // + 0.86
+    int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302));
+    // * 2x
+    int16x8_t step5 = vqrdmulhq_s16(step4, twov);
+    // + 0.29
+    int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485));
+
+    int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz);
+    // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are
+    // capped to 1.
+    // Generated with the following Python script:
+    // a = []
+    // b = []
+    //
+    // for i in range(0, 16):
+    //   v = 2**(5/12.*i)
+    //   v /= 16
+    //   v *= 256 * 128
+    //   v = int(v)
+    //   a.append(v // 256)
+    //   b.append(v % 256)
+    //
+    // print(", ".join("0x%02x" % x for x in a))
+    //
+    // print(", ".join("0x%02x" % x for x in b))
+
+    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = {
+        0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c,
+        0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f,
+    };
+    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = {
+        0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68,
+        0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    };
+    // Using the highway implementation here since vqtbl1q is aarch64-only.
+    using hwy::HWY_NAMESPACE::Vec128;
+    uint8x16_t pow_low =
+        TableLookupBytes(
+            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)),
+            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
+            .raw;
+    uint8x16_t pow_high =
+        TableLookupBytes(
+            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)),
+            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
+            .raw;
+    int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16(
+        vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8));
+
+    // approximation of v * 12.92, divided by 2
+    // Note that our input is using 13 mantissa bits instead of 15.
+    int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5);
+    // 1.055*pow(v, 1/2.4) - 0.055, divided by 2
+    auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901));
+    // > 0.0031308f (note that v16 has 13 mantissa bits)
+    return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear);
+  };
+
+  const float* JXL_RESTRICT row_in_x = input[0];
+  const float* JXL_RESTRICT row_in_y = input[1];
+  const float* JXL_RESTRICT row_in_b = input[2];
+  const float* JXL_RESTRICT row_in_a = input[3];  // Null means alpha = 1.0.
+  for (size_t x = 0; x < xsize; x += 8) {  // 8 pixels per iteration.
+    // Normal ranges for xyb for in-gamut sRGB colors:
+    // x: -0.015386 0.028100
+    // y: 0.000000 0.845308
+    // b: 0.000000 0.845308
+
+    // We actually want x * 8 to have some extra precision.
+    // TODO(veluca): consider different approaches here, like vld1q_f32_x2.
+    float32x4_t opsin_x_left = vld1q_f32(row_in_x + x);
+    int16x4_t opsin_x16_times8_left =
+        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18));
+    float32x4_t opsin_x_right =
+        vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0));
+    int16x4_t opsin_x16_times8_right =
+        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18));
+    int16x8_t opsin_x16_times8 =
+        vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right);
+
+    float32x4_t opsin_y_left = vld1q_f32(row_in_y + x);
+    int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15));
+    float32x4_t opsin_y_right =
+        vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0));
+    int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15));
+    int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right);
+
+    float32x4_t opsin_b_left = vld1q_f32(row_in_b + x);
+    int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15));
+    float32x4_t opsin_b_right =
+        vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0));
+    int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15));
+    int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right);
+
+    int16x8_t neg_bias16 = vdupq_n_s16(-124);        // -0.0037930732552754493
+    int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110);  // -0.155954201
+    int16x8_t neg_bias_half16 = vdupq_n_s16(-62);
+
+    // Color space: XYB -> RGB
+    // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2,
+    // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias.
+    // Note that ignoring x2 in the formulas below (as x << y) results in
+    // errors of at least 3 in the final sRGB values.
+    int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16);
+    int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16);
+    int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3);
+    int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16);
+    int16x8_t threexsq16 =
+        vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6);
+
+    // We can ignore x^3 here. Note that this is multiplied by 8.
+    int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16);
+
+    int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16);
+    int16x8_t mixed_rpg16 = vhaddq_s16(
+        vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16);
+
+    int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16);
+    int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16);
+    int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16);
+    int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16);
+    // mixed_rpg and mixed_b are in 0-1 range.
+    // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note
+    // that at this point it is already multiplied by 8.
+
+    // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit
+    // fixed point) to ensure intermediate quantities are in range. Note that
+    // r-g is not shifted, and was x8 before here; this corresponds to a x32
+    // overall multiplicative factor and ensures that all the matrix constants
+    // are in 0-1 range.
+    // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two
+    // vhadd + using neg_bias_half.
+    mixed_b16 = vshrq_n_s16(mixed_b16, 2);
+
+    // Unmix (multiply by 3x3 inverse_matrix)
+    // For increased precision, we use a matrix for converting from
+    // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This
+    // avoids cancellation effects when computing (y+x)^3-(y-x)^3.
+    // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c *
+    // mixed_b pattern is repeated frequently in the code below. This allows
+    // us to save a multiply per channel, and removes the presence of
+    // some constants above 1. Moreover, mixed_rpg - mixed_b is in (-1, 1)
+    // range, so the subtraction is safe.
+    // All the magic-looking constants here are derived by computing the
+    // inverse opsin matrix for the transformation modified as described
+    // above.
+
+    // Precomputation common to multiple color values.
+    int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16);
+    int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394);
+    int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16);
+
+    // R
+    int16x8_t linear_r16 =
+        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400));
+
+    // G
+    int16x8_t linear_g16 =
+        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857));
+
+    // B
+    int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996);
+    linear_b16 = vqaddq_s16(linear_b16, mixed_b16);
+    linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525));
+
+    // Apply SRGB transfer function.
+    int16x8_t r = srgb_tf(linear_r16);
+    int16x8_t g = srgb_tf(linear_g16);
+    int16x8_t b = srgb_tf(linear_b16);
+
+    uint8x8_t r8 =
+        vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6));
+    uint8x8_t g8 =
+        vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6));
+    uint8x8_t b8 =
+        vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6));
+
+    size_t n = xsize - x;  // Pixels remaining, including this group of 8.
+    if (is_rgba) {
+      float32x4_t a_f32_left =
+          row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f);
+      float32x4_t a_f32_right =
+          row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0))
+                   : vdupq_n_f32(1.0f);
+      int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8));
+      int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8));
+      uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right));
+      uint8_t* buf = output + 4 * x;
+      uint8x8x4_t data = {r8, g8, b8, a8};
+      if (n >= 8) {
+        vst4_u8(buf, data);
+      } else {
+        uint8_t tmp[8 * 4];  // Partial group: stage, then copy n pixels.
+        vst4_u8(tmp, data);
+        memcpy(buf, tmp, n * 4);
+      }
+    } else {
+      uint8_t* buf = output + 3 * x;
+      uint8x8x3_t data = {r8, g8, b8};
+      if (n >= 8) {
+        vst3_u8(buf, data);
+      } else {
+        uint8_t tmp[8 * 3];  // Partial group: stage, then copy n pixels.
+        vst3_u8(tmp, data);
+        memcpy(buf, tmp, n * 3);
+      }
+    }
+  }
+#else
+  (void)input;
+  (void)output;
+  (void)is_rgba;
+  (void)xsize;
+  JXL_ABORT("Unreachable");
+#endif
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DEC_XYB_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.cc b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc
new file mode 100644
index 0000000000..46fc63c49e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc
@@ -0,0 +1,329 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_xyb.h"
+
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/matrix_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/sanitizers.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::MulAdd;
+
+void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
+                          const OpsinParams& opsin_params) {
+  PROFILER_FUNC;
+  JXL_CHECK_IMAGE_INITIALIZED(*inout, Rect(*inout));
+
+  // Converts one row of XYB samples to linear RGB, overwriting the row. The
+  // loop bound is the unpadded width, but whole vectors are loaded and stored.
+  const size_t width = inout->xsize();
+  const auto convert_row = [&](const uint32_t task, size_t /* thread */) {
+    const HWY_FULL(float) d;
+    const size_t row = task;
+
+    // Fetch the plane row pointers once per row instead of per vector.
+    float* JXL_RESTRICT plane0 = inout->PlaneRow(0, row);
+    float* JXL_RESTRICT plane1 = inout->PlaneRow(1, row);
+    float* JXL_RESTRICT plane2 = inout->PlaneRow(2, row);
+
+    for (size_t x = 0; x < width; x += Lanes(d)) {
+      const auto xyb_x = Load(d, plane0 + x);
+      const auto xyb_y = Load(d, plane1 + x);
+      const auto xyb_b = Load(d, plane2 + x);
+      auto out_r = Undefined(d);
+      auto out_g = Undefined(d);
+      auto out_b = Undefined(d);
+      XybToRgb(d, xyb_x, xyb_y, xyb_b, opsin_params, &out_r, &out_g, &out_b);
+
+      Store(out_r, d, plane0 + x);
+      Store(out_g, d, plane1 + x);
+      Store(out_b, d, plane2 + x);
+    }
+  };
+  // One task per image row.
+  JXL_CHECK(RunOnPool(pool, 0, inout->ysize(), ThreadPool::NoInit, convert_row,
+                      "OpsinToLinear"));
+}
+
+// Out-of-place variant: converts `rect` of `opsin` into `linear`.
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
+                   Image3F* JXL_RESTRICT linear,
+                   const OpsinParams& opsin_params) {
+  PROFILER_FUNC;
+
+  JXL_ASSERT(SameSize(rect, *linear));
+  JXL_CHECK_IMAGE_INITIALIZED(opsin, rect);
+
+  // Converts one row. Inputs are addressed through `rect`; outputs start at
+  // the origin of `linear`, which has the same size as `rect`.
+  const auto convert_row = [&](const uint32_t task, size_t /*thread*/) {
+    const HWY_FULL(float) d;
+    const size_t row = static_cast<size_t>(task);
+
+    const float* JXL_RESTRICT src0 = rect.ConstPlaneRow(opsin, 0, row);
+    const float* JXL_RESTRICT src1 = rect.ConstPlaneRow(opsin, 1, row);
+    const float* JXL_RESTRICT src2 = rect.ConstPlaneRow(opsin, 2, row);
+    float* JXL_RESTRICT dst0 = linear->PlaneRow(0, row);
+    float* JXL_RESTRICT dst1 = linear->PlaneRow(1, row);
+    float* JXL_RESTRICT dst2 = linear->PlaneRow(2, row);
+
+    for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
+      const auto xyb_x = Load(d, src0 + x);
+      const auto xyb_y = Load(d, src1 + x);
+      const auto xyb_b = Load(d, src2 + x);
+      auto out_r = Undefined(d);
+      auto out_g = Undefined(d);
+      auto out_b = Undefined(d);
+      XybToRgb(d, xyb_x, xyb_y, xyb_b, opsin_params, &out_r, &out_g, &out_b);
+
+      Store(out_r, d, dst0 + x);
+      Store(out_g, d, dst1 + x);
+      Store(out_b, d, dst2 + x);
+    }
+  };
+
+  // One task per row of `rect`.
+  JXL_CHECK(RunOnPool(pool, 0, static_cast<int>(rect.ysize()),
+                      ThreadPool::NoInit, convert_row, "OpsinToLinear(Rect)"));
+  JXL_CHECK_IMAGE_INITIALIZED(*linear, rect);
+}
+
+// Transform YCbCr to RGB.
+// Could be performed in-place (i.e. Cb, Y and Cr could alias R, G and B).
+void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) {
+  JXL_CHECK_IMAGE_INITIALIZED(ycbcr, rect);
+  const HWY_CAPPED(float, kBlockDim) df;
+  const size_t step = Lanes(df);
+  const size_t width = rect.xsize();
+  const size_t height = rect.ysize();
+  if (width == 0 || height == 0) return;
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto luma_offset = Set(df, 128.0f / 255);
+  const auto cr_to_r = Set(df, 1.402f);
+  const auto cb_to_g = Set(df, -0.114f * 1.772f / 0.587f);
+  const auto cr_to_g = Set(df, -0.299f * 1.402f / 0.587f);
+  const auto cb_to_b = Set(df, 1.772f);
+
+  for (size_t row = 0; row < height; ++row) {
+    // Plane order in `ycbcr` is (Cb, Y, Cr); in `rgb` it is (R, G, B).
+    const float* in_cb = rect.ConstPlaneRow(ycbcr, 0, row);
+    const float* in_y = rect.ConstPlaneRow(ycbcr, 1, row);
+    const float* in_cr = rect.ConstPlaneRow(ycbcr, 2, row);
+    float* out_r = rect.PlaneRow(rgb, 0, row);
+    float* out_g = rect.PlaneRow(rgb, 1, row);
+    float* out_b = rect.PlaneRow(rgb, 2, row);
+    for (size_t x = 0; x < width; x += step) {
+      const auto cb = Load(df, in_cb + x);
+      const auto cr = Load(df, in_cr + x);
+      const auto luma = Add(Load(df, in_y + x), luma_offset);
+      const auto red = MulAdd(cr_to_r, cr, luma);
+      const auto green = MulAdd(cr_to_g, cr, MulAdd(cb_to_g, cb, luma));
+      const auto blue = MulAdd(cb_to_b, cb, luma);
+      Store(red, df, out_r + x);
+      Store(green, df, out_g + x);
+      Store(blue, df, out_b + x);
+    }
+  }
+  JXL_CHECK_IMAGE_INITIALIZED(*rgb, rect);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(OpsinToLinearInplace);  // Per-target implementations to pick from.
+void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
+                          const OpsinParams& opsin_params) {
+  HWY_DYNAMIC_DISPATCH(OpsinToLinearInplace)(inout, pool, opsin_params);
+}
+
+HWY_EXPORT(OpsinToLinear);
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
+                   Image3F* JXL_RESTRICT linear,
+                   const OpsinParams& opsin_params) {
+  // Forward to the implementation compiled for the chosen target.
+  HWY_DYNAMIC_DISPATCH(OpsinToLinear)(opsin, rect, pool, linear, opsin_params);
+}
+
+HWY_EXPORT(YcbcrToRgb);  // Per-target implementations to pick from.
+void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) {
+  HWY_DYNAMIC_DISPATCH(YcbcrToRgb)(ycbcr, rgb, rect);
+}
+
+HWY_EXPORT(HasFastXYBTosRGB8);  // Answer depends on the dispatched target.
+bool HasFastXYBTosRGB8() { return HWY_DYNAMIC_DISPATCH(HasFastXYBTosRGB8)(); }
+
+HWY_EXPORT(FastXYBTosRGB8);  // Per-target implementations to pick from.
+void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba,
+                    size_t xsize) {
+  HWY_DYNAMIC_DISPATCH(FastXYBTosRGB8)(input, output, is_rgba, xsize);
+}
+
+void OpsinParams::Init(float intensity_target) {
+  // Default biases first, then the cube roots derived from them.
+  memcpy(opsin_biases, kNegOpsinAbsorbanceBiasRGB,
+         sizeof(kNegOpsinAbsorbanceBiasRGB));
+  for (size_t i = 0; i != 4; ++i) opsin_biases_cbrt[i] = cbrtf(opsin_biases[i]);
+  memcpy(quant_biases, kDefaultQuantBias, sizeof(kDefaultQuantBias));
+  // Fills inverse_opsin_matrix from the default inverse absorbance matrix.
+  InitSIMDInverseMatrix(GetOpsinAbsorbanceInverseMatrix(), inverse_opsin_matrix,
+                        intensity_target);
+}
+
+// Returns true only if `c_desired` passes every check below.
+bool CanOutputToColorEncoding(const ColorEncoding& c_desired) {
+  if (!c_desired.HaveFields()) return false;
+
+  // TODO(veluca): keep in sync with dec_reconstruct.cc
+  const auto& tf = c_desired.tf;
+  const bool known_tf = tf.IsPQ() || tf.IsSRGB() || tf.IsGamma() ||
+                        tf.IsLinear() || tf.IsHLG() || tf.IsDCI() ||
+                        tf.Is709();
+  if (!known_tf) return false;
+
+  // Gray output is only accepted with a D65 white point.
+  // TODO(veluca): figure out what should happen here.
+  const bool gray = c_desired.IsGray();
+  return !gray || c_desired.white_point == WhitePoint::kD65;
+}
+
+Status OutputEncodingInfo::SetFromMetadata(const CodecMetadata& metadata) {
+  xyb_encoded = metadata.m.xyb_encoded;
+  orig_color_encoding = metadata.m.color_encoding;
+  orig_intensity_target = metadata.m.IntensityTarget();
+  desired_intensity_target = orig_intensity_target;
+  const auto& oim = metadata.transform_data.opsin_inverse_matrix;
+  default_transform = oim.all_default;
+  memcpy(orig_inverse_matrix, oim.inverse_matrix, sizeof(orig_inverse_matrix));
+  std::copy(std::begin(oim.quant_biases), std::end(oim.quant_biases),
+            opsin_params.quant_biases);
+  std::copy(std::begin(oim.opsin_biases), std::end(oim.opsin_biases),
+            opsin_params.opsin_biases);
+  // Entries 0..2 get their cube root cached; entry 3 is fixed to 1 in both.
+  for (int c = 0; c < 3; ++c) {
+    opsin_params.opsin_biases_cbrt[c] = cbrtf(opsin_params.opsin_biases[c]);
+  }
+  opsin_params.opsin_biases_cbrt[3] = opsin_params.opsin_biases[3] = 1;
+  const bool orig_ok = CanOutputToColorEncoding(orig_color_encoding);
+  const bool gray = orig_color_encoding.IsGray();
+  return SetColorEncoding((!xyb_encoded || orig_ok) ? orig_color_encoding
+                          : ColorEncoding::LinearSRGB(gray));
+}
+
+Status OutputEncodingInfo::MaybeSetColorEncoding(
+    const ColorEncoding& c_desired) {
+  // XYB is refused while the current encoding is PQ or non-sRGB RGB.
+  const bool wants_xyb = c_desired.GetColorSpace() == ColorSpace::kXYB;
+  const bool current_is_non_srgb_rgb =
+      color_encoding.GetColorSpace() == ColorSpace::kRGB &&
+      color_encoding.primaries != Primaries::kSRGB;
+  if (wants_xyb && (current_is_non_srgb_rgb || color_encoding.tf.IsPQ())) {
+    return false;
+  }
+  if (!xyb_encoded && !CanOutputToColorEncoding(c_desired)) return false;
+  return SetColorEncoding(c_desired);
+}
+
+Status OutputEncodingInfo::SetColorEncoding(const ColorEncoding& c_desired) {
+  color_encoding = c_desired;
+  color_encoding_is_original = orig_color_encoding.SameColorEncoding(c_desired);
+  // Recomputes everything that depends on the output encoding: luminances,
+  // the opsin inverse matrix (based on primaries and white point) and
+  // inverse_gamma. Fails if one of the matrix helpers below reports an error.
+  float inverse_matrix[9];
+  bool inverse_matrix_is_default = default_transform;
+  memcpy(inverse_matrix, orig_inverse_matrix, sizeof(inverse_matrix));
+  constexpr float kSRGBLuminances[3] = {0.2126, 0.7152, 0.0722};
+  memcpy(luminances, kSRGBLuminances, sizeof(luminances));  // default value
+  if ((c_desired.primaries != Primaries::kSRGB ||
+       c_desired.white_point != WhitePoint::kD65) &&
+      !c_desired.IsGray()) {
+    float srgb_to_xyzd50[9];
+    const auto& srgb = ColorEncoding::SRGB(/*is_gray=*/false);
+    JXL_CHECK(PrimariesToXYZD50(
+        srgb.GetPrimaries().r.x, srgb.GetPrimaries().r.y,
+        srgb.GetPrimaries().g.x, srgb.GetPrimaries().g.y,
+        srgb.GetPrimaries().b.x, srgb.GetPrimaries().b.y,
+        srgb.GetWhitePoint().x, srgb.GetWhitePoint().y, srgb_to_xyzd50));
+    float original_to_xyz[3][3];
+    JXL_RETURN_IF_ERROR(PrimariesToXYZ(
+        c_desired.GetPrimaries().r.x, c_desired.GetPrimaries().r.y,
+        c_desired.GetPrimaries().g.x, c_desired.GetPrimaries().g.y,
+        c_desired.GetPrimaries().b.x, c_desired.GetPrimaries().b.y,
+        c_desired.GetWhitePoint().x, c_desired.GetWhitePoint().y,
+        &original_to_xyz[0][0]));
+    memcpy(luminances, original_to_xyz[1], sizeof luminances);  // matrix row 1
+    if (xyb_encoded) {
+      float adapt_to_d50[9];
+      JXL_RETURN_IF_ERROR(AdaptToXYZD50(c_desired.GetWhitePoint().x,
+                                        c_desired.GetWhitePoint().y,
+                                        adapt_to_d50));
+      float xyzd50_to_original[9];
+      Mul3x3Matrix(adapt_to_d50, &original_to_xyz[0][0], xyzd50_to_original);
+      JXL_RETURN_IF_ERROR(Inv3x3Matrix(xyzd50_to_original));  // presumably in place
+      float srgb_to_original[9];
+      Mul3x3Matrix(xyzd50_to_original, srgb_to_xyzd50, srgb_to_original);
+      Mul3x3Matrix(srgb_to_original, orig_inverse_matrix, inverse_matrix);
+      inverse_matrix_is_default = false;
+    }
+  }
+
+  if (c_desired.IsGray()) {
+    float tmp_inv_matrix[9];
+    memcpy(tmp_inv_matrix, inverse_matrix, sizeof(inverse_matrix));
+    float srgb_to_luma[9];  // three identical rows, each a copy of luminances
+    memcpy(&srgb_to_luma[0], luminances, sizeof(luminances));
+    memcpy(&srgb_to_luma[3], luminances, sizeof(luminances));
+    memcpy(&srgb_to_luma[6], luminances, sizeof(luminances));
+    Mul3x3Matrix(srgb_to_luma, tmp_inv_matrix, inverse_matrix);
+  }
+
+  // The internal XYB color space uses absolute luminance, so we scale back the
+  // opsin inverse matrix to relative luminance where 1.0 corresponds to the
+  // original intensity target, or to absolute luminance for PQ, where 1.0
+  // corresponds to 10000 nits.
+  if (xyb_encoded) {
+    float intensity_target =
+        (c_desired.tf.IsPQ() ? 10000 : orig_intensity_target);
+    InitSIMDInverseMatrix(inverse_matrix, opsin_params.inverse_opsin_matrix,
+                          intensity_target);
+    all_default_opsin = (std::abs(intensity_target - 255.0) <= 0.1f &&
+                         inverse_matrix_is_default);
+  }
+
+  // Set the inverse gamma based on color space transfer function.
+  inverse_gamma = (c_desired.tf.IsGamma() ? c_desired.tf.GetGamma()
+                   : c_desired.tf.IsDCI() ? 1.0f / 2.6f
+                                          : 1.0);
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.h b/third_party/jpeg-xl/lib/jxl/dec_xyb.h
new file mode 100644
index 0000000000..ebaae9a176
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.h
@@ -0,0 +1,89 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_XYB_H_
+#define LIB_JXL_DEC_XYB_H_
+
+// XYB -> linear sRGB.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+// Parameters for XYB->sRGB conversion.
+struct OpsinParams {
+  float inverse_opsin_matrix[9 * 4];  // written by InitSIMDInverseMatrix()
+  float opsin_biases[4];
+  float opsin_biases_cbrt[4];  // cbrtf() of the matching opsin_biases entry
+  float quant_biases[4];
+  void Init(float intensity_target);  // fills every field from library constants
+};
+
+struct OutputEncodingInfo {
+  //
+  // Fields depending only on image metadata
+  //
+  ColorEncoding orig_color_encoding;
+  // Used for the HLG OOTF and PQ tone mapping.
+  float orig_intensity_target;
+  // Opsin inverse matrix taken from the metadata.
+  float orig_inverse_matrix[9];
+  bool default_transform;  // all_default flag of the metadata's opsin matrix
+  bool xyb_encoded;        // copied from metadata.m.xyb_encoded
+  //
+  // Fields depending on output color encoding
+  //
+  ColorEncoding color_encoding;
+  bool color_encoding_is_original;
+  // Contains an opsin matrix that converts to the primaries of the output
+  // encoding.
+  OpsinParams opsin_params;
+  bool all_default_opsin;  // only written by SetColorEncoding if xyb_encoded
+  // Used for Gamma and DCI transfer functions.
+  float inverse_gamma;
+  // Luminances of color_encoding's primaries, used for the HLG inverse OOTF and
+  // for PQ tone mapping.
+  // Default to sRGB's.
+  float luminances[3];
+  // Used for the HLG inverse OOTF and PQ tone mapping.
+  float desired_intensity_target;
+
+  Status SetFromMetadata(const CodecMetadata& metadata);
+  Status MaybeSetColorEncoding(const ColorEncoding& c_desired);
+
+ private:
+  Status SetColorEncoding(const ColorEncoding& c_desired);
+};
+
+// Converts `inout` (not padded) from opsin to linear sRGB in-place. Called from
+// per-pass postprocessing, hence parallelized.
+void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
+                          const OpsinParams& opsin_params);
+
+// Converts `opsin:rect` (opsin may be padded, rect.x0 must be vector-aligned)
+// to linear sRGB. Called from whole-frame encoder, hence parallelized.
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
+                   Image3F* JXL_RESTRICT linear,
+                   const OpsinParams& opsin_params);
+
+// Bt.601 to match JPEG/JFIF. Inputs are _signed_ YCbCr values suitable for DCT,
+// see F.1.1.3 of T.81 (because our data type is float, there is no need to add
+// a bias to make the values unsigned).
+void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect);
+
+bool HasFastXYBTosRGB8();
+void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba,
+                    size_t xsize);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/decode.cc b/third_party/jpeg-xl/lib/jxl/decode.cc
new file mode 100644
index 0000000000..5476f686f6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode.cc
@@ -0,0 +1,2809 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#if JPEGXL_ENABLE_BOXES || JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/box_content_decoder.h"
+#endif
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/dec_modular.h"
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/decode_to_jpeg.h"
+#endif
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/memory_manager_internal.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/toc.h"
+
+namespace {
+
+// Returns true when a + b is greater than size, including the case where
+// a + b does not fit in size_t.
+bool OutOfBounds(size_t a, size_t b, size_t size) {
+  const size_t end = a + b;
+  const bool wrapped = end < a;  // unsigned wrap-around
+  return wrapped || end > size;
+}
+
+bool SumOverflows(size_t a, size_t b, size_t c) {
+  // An unsigned sum that is smaller than one of its operands has wrapped
+  // around, i.e. a + b + c is not representable in size_t.
+  const size_t ab = a + b;
+  const size_t abc = ab + c;
+  return ab < b || abc < c;
+}
+
+JXL_INLINE size_t InitialBasicInfoSizeHint() {
+  // Container overhead ahead of the codestream, assuming the codestream box
+  // directly follows the signature and filetype boxes:
+  //   12 bytes signature box + 20 bytes filetype box + 16 bytes codestream
+  //   box length + name + optional XLBox length.
+  constexpr size_t kContainerHeaderBytes = 12 + 20 + 16;
+
+  // Worst case for the codestream basic info, i.e. everything up to and
+  // including extra_channel_bits: about 2 bytes signature + 8 bytes
+  // SizeHeader + 31 bytes ColorEncoding + 4 bytes rest of ImageMetadata +
+  // 5 bytes part of ImageMetadata2.
+  // TODO(lode): recompute and update this value when alpha_bits is moved to
+  // extra channels info.
+  constexpr size_t kMaxBasicInfoBytes = 2 + 8 + 31 + 4 + 5;
+
+  return kContainerHeaderBytes + kMaxBasicInfoBytes;
+}
+
+// Debug-printing failure macro similar to JXL_FAILURE, but for the status code
+// JXL_DEC_ERROR
+#ifdef JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(format, ...)                                           \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_DEC_ERROR)
+#else  // JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(format, ...)                                             \
+  (((JXL_DEBUG_ON_ERROR) &&                                                    \
+    ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \
+   JXL_DEC_ERROR)
+#endif  // JXL_CRASH_ON_ERROR
+
+JxlDecoderStatus ConvertStatus(JxlDecoderStatus status) { return status; }
+// jxl::Status overload: truthy -> JXL_DEC_SUCCESS, otherwise JXL_DEC_ERROR.
+JxlDecoderStatus ConvertStatus(jxl::Status status) {
+  return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
+}
+
+JxlSignature ReadSignature(const uint8_t* buf, size_t len, size_t* pos) {
+  // Looks for a JPEG XL signature starting at buf[*pos]. *pos is advanced past
+  // the signature only when one is recognized; it is left untouched otherwise.
+  //
+  // The 12-byte signature box that opens a JPEG XL container:
+  static constexpr uint8_t kContainerSignature[12] = {
+      0, 0, 0, 0xC, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0xA};
+  constexpr size_t kContainerSize = sizeof(kContainerSignature);
+  constexpr size_t kCodestreamSize = 2;
+
+  if (*pos >= len) return JXL_SIG_NOT_ENOUGH_BYTES;
+  const uint8_t* const data = buf + *pos;
+  const size_t remaining = len - *pos;  // >= 1 after the check above
+
+  switch (data[0]) {
+    case 0xff:
+      // JPEG XL codestream: 0xff 0x0a
+      if (remaining < kCodestreamSize) return JXL_SIG_NOT_ENOUGH_BYTES;
+      if (data[1] != jxl::kCodestreamMarker) return JXL_SIG_INVALID;
+      *pos += kCodestreamSize;
+      return JXL_SIG_CODESTREAM;
+
+    case 0:
+      // JPEG XL container
+      if (remaining < kContainerSize) return JXL_SIG_NOT_ENOUGH_BYTES;
+      for (size_t i = 1; i < kContainerSize; ++i) {
+        if (data[i] != kContainerSignature[i]) return JXL_SIG_INVALID;
+      }
+      *pos += kContainerSize;
+      return JXL_SIG_CONTAINER;
+
+    default:
+      return JXL_SIG_INVALID;
+  }
+}
+
+}  // namespace
+
+uint32_t JxlDecoderVersion(void) {
+  const uint32_t major_minor = JPEGXL_MAJOR_VERSION * 1000 + JPEGXL_MINOR_VERSION;
+  return major_minor * 1000 + JPEGXL_PATCH_VERSION;  // major*1e6 + minor*1e3 + patch
+}
+
+JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len) {
+  size_t consumed = 0;  // needed by ReadSignature; not reported to the caller
+  return ReadSignature(buf, len, &consumed);
+}
+
+namespace {
+
+size_t BitsPerChannel(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    default:
+      break;
+  }
+  return 0;  // signals unhandled JxlDataType
+}
+
+template <typename T>
+uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                     JxlPixelFormat format) {
+  switch (bit_depth.type) {
+    case JXL_BIT_DEPTH_FROM_PIXEL_FORMAT:
+      return BitsPerChannel(format.data_type);
+    case JXL_BIT_DEPTH_FROM_CODESTREAM:
+      return metadata.bit_depth.bits_per_sample;
+    case JXL_BIT_DEPTH_CUSTOM:
+      return bit_depth.bits_per_sample;
+  }
+  return 0;  // bit_depth.type is none of the three known sources
+}
+
+enum class DecoderStage : uint32_t {  // type of JxlDecoderStruct::stage
+  kInited,              // Decoder created, no JxlDecoderProcessInput called yet
+  kStarted,             // Running JxlDecoderProcessInput calls
+  kCodestreamFinished,  // Codestream done, but other boxes could still occur.
+                        // This stage can also occur before having seen the
+                        // entire codestream if the user didn't subscribe to any
+                        // codestream events at all, e.g. only to box events,
+                        // or, the user only subscribed to basic info, and only
+                        // the header of the codestream was parsed.
+  kError,               // Error occurred, decoder object no longer usable
+};
+
+enum class FrameStage : uint32_t {  // type of JxlDecoderStruct::frame_stage
+  kHeader,  // Must parse frame header.
+  kTOC,     // Must parse TOC.
+  kFull,    // Must parse full pixels.
+};
+
+enum class BoxStage : uint32_t {  // type of JxlDecoderStruct::box_stage
+  kHeader,      // Parsing box header of the next box, or start of non-container
+                // stream
+  kFtyp,        // The ftyp box
+  kSkip,        // Box whose contents are skipped
+  kCodestream,  // Handling codestream box contents, or non-container stream
+  kPartialCodestream,  // Handling the extra header of partial codestream box
+  kJpegRecon,          // Handling jpeg reconstruction box
+};
+
+enum class JpegReconStage : uint32_t {  // type of recon_output_jpeg
+  kNone,             // Not outputting
+  kSettingMetadata,  // Ready to output, must set metadata to the jpeg_data
+  kOutputting,       // Currently outputting the JPEG bytes
+  kFinished,         // JPEG reconstruction fully handled
+};
+
+/*
+Given list of frame references to storage slots, and storage slots in which this
+frame is saved, computes which frames are required to decode the frame at the
+given index and any frames after it. The frames on which this depends are
+returned as a vector of their indices, in no particular order. The given index
+must be smaller than saved_as.size(), and references.size() must equal
+saved_as.size(). Any frames beyond saved_as and references are considered
+unknown future frames and must be treated as if something depends on them.
+*/
+std::vector<size_t> GetFrameDependencies(size_t index,
+                                         const std::vector<int>& saved_as,
+                                         const std::vector<int>& references) {
+  JXL_ASSERT(references.size() == saved_as.size());
+  JXL_ASSERT(index < references.size());
+
+  std::vector<size_t> result;
+
+  constexpr size_t kNumStorage = 8;
+
+  // value which indicates nothing is stored in this storage slot
+  const size_t invalid = references.size();
+  // for each of the 8 storage slots, a vector that translates frame index to
+  // frame stored in this storage slot at this point, that is, the last
+  // frame that was stored in this slot before or at this index.
+  std::array<std::vector<size_t>, kNumStorage> storage;
+  for (size_t s = 0; s < kNumStorage; ++s) {
+    storage[s].resize(saved_as.size());
+    int mask = 1 << s;  // bit s of saved_as / references selects slot s
+    size_t id = invalid;
+    for (size_t i = 0; i < saved_as.size(); ++i) {
+      if (saved_as[i] & mask) {
+        id = i;
+      }
+      storage[s][i] = id;
+    }
+  }
+
+  std::vector<char> seen(index + 1, 0);
+  std::vector<size_t> stack;
+  stack.push_back(index);  // index itself is visited but never added to result
+  seen[index] = 1;
+
+  // For frames after index, assume they can depend on any of the 8 storage
+  // slots, so push the frame for each stored reference to the stack and result.
+  // All frames after index are treated as having unknown references and with
+  // the possibility that there are more frames after the last known.
+  // TODO(lode): take values of saved_as and references after index, and a
+  // input flag indicating if they are all frames of the image, to further
+  // optimize this.
+  for (size_t s = 0; s < kNumStorage; ++s) {
+    size_t frame_ref = storage[s][index];
+    if (frame_ref == invalid) continue;
+    if (seen[frame_ref]) continue;
+    stack.push_back(frame_ref);
+    seen[frame_ref] = 1;
+    result.push_back(frame_ref);
+  }
+
+  while (!stack.empty()) {
+    size_t frame_index = stack.back();
+    stack.pop_back();
+    if (frame_index == 0) continue;  // first frame cannot have references
+    for (size_t s = 0; s < kNumStorage; ++s) {
+      int mask = 1 << s;
+      if (!(references[frame_index] & mask)) continue;
+      size_t frame_ref = storage[s][frame_index - 1];  // slot s before this frame
+      if (frame_ref == invalid) continue;
+      if (seen[frame_ref]) continue;
+      stack.push_back(frame_ref);
+      seen[frame_ref] = 1;
+      result.push_back(frame_ref);
+    }
+  }
+
+  return result;
+}
+
+// Parameters for user-requested extra channel output.
+struct ExtraChannelOutput {
+  JxlPixelFormat format;
+  void* buffer;
+  size_t buffer_size;  // presumably the size of `buffer` in bytes; TODO confirm
+};
+
+}  // namespace
+
+namespace jxl {
+
+typedef struct JxlDecoderFrameIndexBoxEntryStruct {
+  // OFFi: offset of start byte of this frame compared to start
+  // byte of previous frame from this index in the JPEG XL codestream. For the
+  // first frame, this is the offset from the first byte of the JPEG XL
+  // codestream.
+  uint64_t OFFi;
+  // Ti: duration in ticks between the start of this frame and
+  // the start of the next frame in the index. If this is the last frame in the
+  // index, this is the duration in ticks between the start of this frame and
+  // the end of the stream. A tick lasts TNUM / TDEN seconds.
+  uint32_t Ti;
+  // Fi: number of frames the next frame in the index occurs
+  // after this frame. If this is the last frame in the index, this is the
+  // number of frames after this frame in the remainder of the stream. Only
+  // frames that are presented by the decoder are counted for this purpose, this
+  // excludes frames that are not intended for display but for compositing with
+  // other frames, such as frames that aren't the last frame with a duration of
+  // 0 ticks.
+  uint32_t Fi;
+} JxlDecoderFrameIndexBoxEntry;
+
+typedef struct JxlDecoderFrameIndexBoxStruct {
+  int64_t NF() const { return entries.size(); }  // number of index entries
+  int32_t TNUM = 1;     // a tick lasts TNUM / TDEN seconds
+  int32_t TDEN = 1000;
+
+  std::vector<JxlDecoderFrameIndexBoxEntry> entries;
+
+  // Appends one (OFFi, Ti, Fi) entry. Upstream note: that way we can ensure
+  // that every index box will have the first frame; if the API user decides to
+  // mark it as an indexed frame, AddFrame is called again, "with requested".
+  void AddFrame(uint64_t OFFi, uint32_t Ti, uint32_t Fi) {
+    JxlDecoderFrameIndexBoxEntry e;
+    e.OFFi = OFFi;
+    e.Ti = Ti;
+    e.Fi = Fi;
+    entries.push_back(e);
+  }
+} JxlDecoderFrameIndexBox;
+
+}  // namespace jxl
+
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct JxlDecoderStruct {
+  JxlDecoderStruct() = default;
+
+  JxlMemoryManager memory_manager;
+  std::unique_ptr<jxl::ThreadPool> thread_pool;
+
+  DecoderStage stage;
+
+  // Status of progression, internal.
+  bool got_signature;
+  // Indicates we know that we've seen the last codestream box: either this
+  // was a jxlc box, or a jxlp box that has its index indicated as last by
+  // having its most significant bit set, or no boxes are used at all. This
+  // does not indicate the full codestream has already been seen, only the
+  // last box of it has been initiated.
+  bool last_codestream_seen;
+  bool got_codestream_signature;
+  bool got_basic_info;
+  bool got_transform_data;  // To skip everything before ICC.
+  bool got_all_headers;     // Codestream metadata headers.
+  bool post_headers;        // Already decoding pixels.
+  jxl::ICCReader icc_reader;
+  jxl::JxlDecoderFrameIndexBox frame_index_box;
+  // This means either we actually got the preview image, or determined we
+  // cannot get it or there is none.
+  bool got_preview_image;
+  bool preview_frame;
+
+  // Position of next_in in the original file including box format if present
+  // (as opposed to position in the codestream)
+  size_t file_pos;
+
+  size_t box_contents_begin;
+  size_t box_contents_end;
+  size_t box_contents_size;
+  size_t box_size;
+  size_t header_size;
+  // Either a final box that runs until EOF, or the case of no container format
+  // at all.
+  bool box_contents_unbounded;
+
+  JxlBoxType box_type;
+  JxlBoxType box_decoded_type;  // Underlying type for brob boxes
+  // Set to true right after a JXL_DEC_BOX event only.
+  bool box_event;
+  bool decompress_boxes;
+
+  bool box_out_buffer_set;
+  // Whether the out buffer is set for the current box, if the user did not yet
+  // release the buffer while the next box is encountered, this will be set to
+  // false. If this is false, no JXL_DEC_NEED_MORE_INPUT is emitted
+  // (irrespective of the value of box_out_buffer_set), because not setting
+  // output indicates the user does not wish the data of this box.
+  bool box_out_buffer_set_current_box;
+  uint8_t* box_out_buffer;
+  size_t box_out_buffer_size;
+  // which byte of the full box content the start of the out buffer points to
+  size_t box_out_buffer_begin;
+  // which byte of box_out_buffer to write to next
+  size_t box_out_buffer_pos;
+
+  // Settings
+  bool keep_orientation;
+  bool unpremul_alpha;
+  bool render_spotcolors;
+  bool coalescing;
+  float desired_intensity_target;
+
+  // Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the
+  // decoder returns a status. By default, do not return for any of the events,
+  // only return when the decoder cannot continue because it needs more input or
+  // output data.
+  int events_wanted;
+  int orig_events_wanted;
+
+  // Fields for reading the basic info from the header.
+  size_t basic_info_size_hint;
+  bool have_container;
+  size_t box_count;
+
+  // The level of progressive detail in frame decoding.
+  JxlProgressiveDetail prog_detail = kDC;
+  // The progressive detail of the current frame.
+  JxlProgressiveDetail frame_prog_detail;
+  // The intended downsampling ratio for the current progression step.
+  size_t downsampling_target;
+
+  // Set to true if either an image out buffer or an image out callback was set.
+  bool image_out_buffer_set;
+
+  // Owned by the caller, buffer for preview or full resolution image.
+  void* image_out_buffer;
+  JxlImageOutInitCallback image_out_init_callback;
+  JxlImageOutRunCallback image_out_run_callback;
+  JxlImageOutDestroyCallback image_out_destroy_callback;
+  void* image_out_init_opaque;
+  struct SimpleImageOutCallback {
+    JxlImageOutCallback callback;
+    void* opaque;
+  };
+  SimpleImageOutCallback simple_image_out_callback;
+
+  size_t image_out_size;
+
+  JxlPixelFormat image_out_format;
+  JxlBitDepth image_out_bit_depth;
+
+  // For extra channels. Empty if no extra channels are requested, and they are
+  // reset each frame
+  std::vector<ExtraChannelOutput> extra_channel_output;
+
+  jxl::CodecMetadata metadata;
+  // Same as metadata.m, except for the color_encoding, which is set to the
+  // output encoding.
+  jxl::ImageMetadata image_metadata;
+  std::unique_ptr<jxl::ImageBundle> ib;
+
+  std::unique_ptr<jxl::PassesDecoderState> passes_state;
+  std::unique_ptr<jxl::FrameDecoder> frame_dec;
+  size_t next_section;
+  std::vector<char> section_processed;
+
+  // Headers and TOC for the current frame. NOTE(review): the original comment
+  // refers to a `got_toc` flag that is not a member of this struct; when it is
+  // true this is the header of the last (displayed) frame of the still series.
+  std::unique_ptr<jxl::FrameHeader> frame_header;
+
+  size_t remaining_frame_size;
+  FrameStage frame_stage;
+  bool dc_frame_progression_done;
+  // The currently processed frame is the last of the current composite still,
+  // and so must be returned as pixels
+  bool is_last_of_still;
+  // The currently processed frame is the last of the codestream
+  bool is_last_total;
+  // How many frames to skip.
+  size_t skip_frames;
+  // Skipping the current frame. May be false if skip_frames was just set to
+  // a positive value while already processing a current frame, then
+  // skipping_frame will be enabled only for the next frame.
+  bool skipping_frame;
+
+  // Amount of internal frames and external frames started. External frames are
+  // user-visible frames, internal frames includes all external frames and
+  // also invisible frames such as patches, blending-only and dc_level frames.
+  size_t internal_frames;
+  size_t external_frames;
+
+  // For each internal frame, which storage locations it references, and which
+  // storage locations it is stored in, using the bit mask as defined in
+  // FrameDecoder::References and FrameDecoder::SaveAs.
+  std::vector<int> frame_references;
+  std::vector<int> frame_saved_as;
+
+  // Translates external frame index to internal frame index. The external
+  // index is the index of user-visible frames. The internal index can be larger
+  // since non-visible frames (such as frames with patches, ...) are included.
+  std::vector<size_t> frame_external_to_internal;
+
+  // Whether the frame with internal index is required to decode the frame
+  // being skipped to or any frames after that. If no skipping is active,
+  // this vector is ignored. If the current internal frame index is beyond this
+  // vector, it must be treated as a required frame.
+  std::vector<char> frame_required;
+
+  // Codestream input data is copied here temporarily when the decoder needs
+  // more input bytes to process the next part of the stream. We copy the input
+  // data in order to be able to release all of it through the API when
+  // returning JXL_DEC_NEED_MORE_INPUT.
+  std::vector<uint8_t> codestream_copy;
+  // Number of bytes at the end of codestream_copy that were not yet consumed
+  // by calling AdvanceInput().
+  size_t codestream_unconsumed;
+  // Position in the codestream_copy vector that the decoder already finished
+  // processing. It can be greater than the current size of codestream_copy in
+  // case where the decoder skips some parts of the frame that were not yet
+  // provided.
+  size_t codestream_pos;
+  // Number of bits after codestream_pos that were already processed.
+  size_t codestream_bits_ahead;
+
+  BoxStage box_stage;
+
+#if JPEGXL_ENABLE_BOXES
+  jxl::JxlBoxContentDecoder box_content_decoder;
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  jxl::JxlToJpegDecoder jpeg_decoder;
+  // Decodes Exif or XMP metadata for JPEG reconstruction
+  jxl::JxlBoxContentDecoder metadata_decoder;
+  std::vector<uint8_t> exif_metadata;
+  std::vector<uint8_t> xmp_metadata;
+  // must store JPEG reconstruction metadata from the current box
+  // 0 = not stored, 1 = currently storing, 2 = finished
+  int store_exif;
+  int store_xmp;
+  size_t recon_out_buffer_pos;
+  size_t recon_exif_size;  // Expected exif size as read from the jbrd box
+  size_t recon_xmp_size;   // Expected xmp size as read from the jbrd box
+  JpegReconStage recon_output_jpeg;
+
+  bool JbrdNeedMoreBoxes() const {
+    // jbrd box wants exif but exif box not yet seen
+    if (store_exif < 2 && recon_exif_size > 0) return true;
+    // jbrd box wants xmp but xmp box not yet seen
+    if (store_xmp < 2 && recon_xmp_size > 0) return true;
+    return false;
+  }
+#endif
+
+  const uint8_t* next_in;
+  size_t avail_in;
+  bool input_closed;
+  // Consumes `size` bytes of the caller's input and advances file_pos with it.
+  void AdvanceInput(size_t size) {
+    JXL_DASSERT(avail_in >= size);
+    next_in += size;
+    avail_in -= size;
+    file_pos += size;
+  }
+  // avail_in, capped at box_contents_end unless the box is unbounded.
+  size_t AvailableCodestream() const {
+    size_t avail_codestream = avail_in;
+    if (!box_contents_unbounded) {
+      avail_codestream =
+          std::min<size_t>(avail_codestream, box_contents_end - file_pos);
+    }
+    return avail_codestream;
+  }
+  // Marks `size` codestream bytes as processed; codestream_pos keeps any excess.
+  void AdvanceCodestream(size_t size) {
+    size_t avail_codestream = AvailableCodestream();
+    if (codestream_copy.empty()) {
+      if (size <= avail_codestream) {
+        AdvanceInput(size);
+      } else {
+        codestream_pos = size - avail_codestream;
+        AdvanceInput(avail_codestream);
+      }
+    } else {
+      codestream_pos += size;
+      if (codestream_pos + codestream_unconsumed >= codestream_copy.size()) {
+        size_t advance = std::min(
+            codestream_unconsumed,
+            codestream_unconsumed + codestream_pos - codestream_copy.size());
+        AdvanceInput(advance);
+        codestream_pos -= std::min(codestream_pos, codestream_copy.size());
+        codestream_unconsumed = 0;
+        codestream_copy.clear();
+      }
+    }
+  }
+  // Takes over all pending codestream input; returns JXL_DEC_NEED_MORE_INPUT.
+  JxlDecoderStatus RequestMoreInput() {
+    if (codestream_copy.empty()) {
+      size_t avail_codestream = AvailableCodestream();
+      codestream_copy.insert(codestream_copy.end(), next_in,
+                             next_in + avail_codestream);
+      AdvanceInput(avail_codestream);
+    } else {
+      AdvanceInput(codestream_unconsumed);
+      codestream_unconsumed = 0;
+    }
+    return JXL_DEC_NEED_MORE_INPUT;
+  }
+  // Sets *span to the unprocessed codestream bytes, or requests more input.
+  JxlDecoderStatus GetCodestreamInput(jxl::Span<const uint8_t>* span) {
+    if (codestream_copy.empty() && codestream_pos > 0) {
+      size_t avail_codestream = AvailableCodestream();
+      size_t skip = std::min<size_t>(codestream_pos, avail_codestream);
+      AdvanceInput(skip);
+      codestream_pos -= skip;
+      if (codestream_pos > 0) {
+        return RequestMoreInput();
+      }
+    }
+    JXL_ASSERT(codestream_pos <= codestream_copy.size());
+    JXL_ASSERT(codestream_unconsumed <= codestream_copy.size());
+    size_t avail_codestream = AvailableCodestream();
+    if (codestream_copy.empty()) {
+      if (avail_codestream == 0) {
+        return RequestMoreInput();
+      }
+      *span = jxl::Span<const uint8_t>(next_in, avail_codestream);
+      return JXL_DEC_SUCCESS;
+    } else {
+      codestream_copy.insert(codestream_copy.end(),
+                             next_in + codestream_unconsumed,
+                             next_in + avail_codestream);
+      codestream_unconsumed = avail_codestream;
+      *span = jxl::Span<const uint8_t>(codestream_copy.data() + codestream_pos,
+                                       codestream_copy.size() - codestream_pos);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+
+  // Whether the decoder can use more codestream input for a purpose it needs.
+  // This returns false if the user didn't subscribe to any events that
+  // require the codestream (e.g. only subscribed to metadata boxes), or all
+  // parts of the codestream that are subscribed to (e.g. only basic info) have
+  // already occurred.
+  bool CanUseMoreCodestreamInput() const {
+    // The decoder can set this to finished early if all relevant events were
+    // processed, so this check works.
+    return stage != DecoderStage::kCodestreamFinished;
+  }
+
+  // If set then some operations will fail, if those would require
+  // allocating large objects. Actual memory usage might be two orders of
+  // magnitude bigger.
+  // TODO(eustas): remove once there is working API for memory / CPU limit.
+  size_t memory_limit_base = 0;
+  size_t cpu_limit_base = 0;
+  size_t used_cpu_base = 0;
+};
+
+namespace {
+
+// Returns true if an xsize x ysize image is acceptable under
+// dec->memory_limit_base; a limit of 0 or an empty image always passes.
+bool CheckSizeLimit(JxlDecoder* dec, size_t xsize, size_t ysize) {
+  const size_t limit = dec->memory_limit_base;
+  if (limit == 0 || xsize == 0 || ysize == 0) return true;
+  // Each dimension on its own has to stay strictly below the limit.
+  if (xsize >= limit || ysize >= limit) return false;
+  // Rough estimate of real row length.
+  const size_t row_length = jxl::DivCeil(xsize, 32) * 32;
+  const size_t num_pixels = row_length * ysize;
+  if (num_pixels / row_length != ysize) return false;  // overflow
+  return num_pixels <= limit;
+}
+
+}  // namespace
+
+// TODO(zond): Make this depend on the data loaded into the decoder.
+JxlDecoderStatus JxlDecoderDefaultPixelFormat(const JxlDecoder* dec,
+                                              JxlPixelFormat* format) {
+  const bool ready = dec->got_basic_info;  // nothing to report before that
+  if (ready) *format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+  return ready ? JXL_DEC_SUCCESS : JXL_DEC_NEED_MORE_INPUT;
+}
+
+// Resets the state that must be reset for both Rewind and Reset.
+void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
+  dec->stage = DecoderStage::kInited;
+  dec->got_signature = false;
+  dec->last_codestream_seen = false;
+  dec->got_codestream_signature = false;
+  dec->got_basic_info = false;
+  dec->got_transform_data = false;
+  dec->got_all_headers = false;
+  dec->post_headers = false;
+  dec->icc_reader.Reset();
+  dec->got_preview_image = false;
+  dec->preview_frame = false;
+  dec->file_pos = 0;
+  dec->box_contents_begin = 0;
+  dec->box_contents_end = 0;
+  dec->box_contents_size = 0;
+  dec->box_size = 0;
+  dec->header_size = 0;
+  dec->box_contents_unbounded = false;
+  memset(dec->box_type, 0, sizeof(dec->box_type));
+  memset(dec->box_decoded_type, 0, sizeof(dec->box_decoded_type));
+  dec->box_event = false;
+  dec->box_stage = BoxStage::kHeader;
+  dec->box_out_buffer_set = false;
+  dec->box_out_buffer_set_current_box = false;
+  dec->box_out_buffer = nullptr;
+  dec->box_out_buffer_size = 0;
+  dec->box_out_buffer_begin = 0;
+  dec->box_out_buffer_pos = 0;
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  dec->exif_metadata.clear();
+  dec->xmp_metadata.clear();
+  dec->store_exif = 0;
+  dec->store_xmp = 0;
+  dec->recon_out_buffer_pos = 0;
+  dec->recon_exif_size = 0;
+  dec->recon_xmp_size = 0;
+  dec->recon_output_jpeg = JpegReconStage::kNone;
+#endif
+
+  // The subscription survives a rewind: it is restored from the saved copy.
+  dec->events_wanted = dec->orig_events_wanted;
+  dec->basic_info_size_hint = InitialBasicInfoSizeHint();
+  dec->have_container = 0;
+  dec->box_count = 0;
+  dec->downsampling_target = 8;
+  dec->image_out_buffer_set = false;
+  dec->image_out_buffer = nullptr;
+  dec->image_out_init_callback = nullptr;
+  dec->image_out_run_callback = nullptr;
+  dec->image_out_destroy_callback = nullptr;
+  dec->image_out_init_opaque = nullptr;
+  dec->image_out_size = 0;
+  dec->image_out_bit_depth.type = JXL_BIT_DEPTH_FROM_PIXEL_FORMAT;
+  dec->extra_channel_output.clear();
+  dec->next_in = nullptr;  // pointer: use nullptr rather than the literal 0
+  dec->avail_in = 0;
+  dec->input_closed = false;
+
+  dec->passes_state.reset(nullptr);
+  dec->frame_dec.reset(nullptr);
+  dec->next_section = 0;
+  dec->section_processed.clear();
+
+  // Metadata starts over from default-constructed objects.
+  dec->ib.reset();
+  dec->metadata = jxl::CodecMetadata();
+  dec->image_metadata = dec->metadata.m;
+  dec->frame_header.reset(new jxl::FrameHeader(&dec->metadata));
+
+  dec->codestream_copy.clear();
+  dec->codestream_unconsumed = 0;
+  dec->codestream_pos = 0;
+  dec->codestream_bits_ahead = 0;
+
+  dec->frame_stage = FrameStage::kHeader;
+  dec->remaining_frame_size = 0;
+  dec->is_last_of_still = false;
+  dec->is_last_total = false;
+  dec->skip_frames = 0;
+  dec->skipping_frame = false;
+  dec->internal_frames = 0;
+  dec->external_frames = 0;
+}
+
+// Rewinds the decoding state and sets the fields below to fixed start values.
+void JxlDecoderReset(JxlDecoder* dec) {
+  JxlDecoderRewindDecodingState(dec);
+  // Fields that JxlDecoderRewindDecodingState leaves as they are.
+  dec->thread_pool.reset();
+  dec->decompress_boxes = false;
+  dec->keep_orientation = false;
+  dec->unpremul_alpha = false;
+  dec->render_spotcolors = true;
+  dec->coalescing = true;
+  dec->desired_intensity_target = 0;
+  dec->orig_events_wanted = dec->events_wanted = 0;
+  dec->frame_references.clear();
+  dec->frame_saved_as.clear();
+  dec->frame_external_to_internal.clear();
+  dec->frame_required.clear();
+}
+
+JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager) {
+  // Initialize a local manager from the caller's one; bail out on failure.
+  JxlMemoryManager manager;
+  if (!jxl::MemoryManagerInit(&manager, memory_manager)) return nullptr;
+
+  void* const storage = jxl::MemoryManagerAlloc(&manager, sizeof(JxlDecoder));
+  if (storage == nullptr) return nullptr;
+  // The decoder is constructed in place inside the storage obtained above.
+  JxlDecoder* const dec = new (storage) JxlDecoder();
+  dec->memory_manager = manager;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  // Limits are only imposed when no custom memory manager was passed in.
+  if (memory_manager == nullptr) {
+    dec->memory_limit_base = 53 << 16;
+    // Allow 5 x max_image_size processing units; every frame is accounted
+    // as W x H CPU processing units, so there could be numerous small frames
+    // or few larger ones.
+    dec->cpu_limit_base = 5 * dec->memory_limit_base;
+  }
+#endif
+
+  JxlDecoderReset(dec);
+
+  return dec;
+}
+
+void JxlDecoderDestroy(JxlDecoder* dec) {
+  if (dec == nullptr) return;  // Destroying a null decoder is a no-op.
+  // Copy the manager out first: it is a member of the object being destroyed.
+  JxlMemoryManager manager = dec->memory_manager;
+  // Call destructor directly since custom free function is used.
+  dec->~JxlDecoder();
+  jxl::MemoryManagerFree(&manager, dec);
+}
+// Rewinds the decoding state only; JxlDecoderReset also clears the options.
+void JxlDecoderRewind(JxlDecoder* dec) { JxlDecoderRewindDecodingState(dec); }
+
+// Adds |amount| to the number of frames to skip and recomputes which
+// already seen frames are required to decode the new target frame.
+void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount) {
+  // Increment amount, rather than set it: making the amount smaller is
+  // impossible because the decoder may already have skipped frames required to
+  // decode earlier frames, and making the amount larger compared to an existing
+  // amount is impossible because if JxlDecoderSkipFrames is called in the
+  // middle of already skipping frames, the user cannot know how many frames
+  // have already been skipped internally so far so an absolute value cannot
+  // be defined.
+  dec->skip_frames += amount;
+
+  dec->frame_required.clear();
+  const size_t target = dec->external_frames + dec->skip_frames;
+
+  // Only a frame that has been seen before a rewind has known dependencies.
+  if (target >= dec->frame_external_to_internal.size()) return;
+  const size_t internal_index = dec->frame_external_to_internal[target];
+  if (internal_index >= dec->frame_saved_as.size()) return;
+
+  const std::vector<size_t> deps = GetFrameDependencies(
+      internal_index, dec->frame_saved_as, dec->frame_references);
+  dec->frame_required.resize(internal_index + 1, 0);
+  for (const size_t dep : deps) {
+    JXL_ASSERT(dep < dec->frame_required.size());
+    dec->frame_required[dep] = 1;
+  }
+}
+
+// Abandons the frame being decoded; fails unless in FrameStage::kFull.
+JxlDecoderStatus JxlDecoderSkipCurrentFrame(JxlDecoder* dec) {
+  if (dec->frame_stage != FrameStage::kFull) return JXL_DEC_ERROR;
+  JXL_DASSERT(dec->frame_dec);
+  // Return to header parsing and advance past the remaining frame bytes.
+  dec->frame_stage = FrameStage::kHeader;
+  dec->AdvanceCodestream(dec->remaining_frame_size);
+  // A frame that ends a still also gives up the image output buffer.
+  if (dec->is_last_of_still) dec->image_out_buffer_set = false;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner,
+                            void* parallel_runner_opaque) {
+  if (dec->stage == DecoderStage::kInited) {  // runner is fixed once started
+    dec->thread_pool.reset(
+        new jxl::ThreadPool(parallel_runner, parallel_runner_opaque));
+    return JXL_DEC_SUCCESS;
+  }
+  return JXL_API_ERROR("parallel_runner must be set before starting");
+}
+
+size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec) {
+  // Once the basic info has been decoded the hint is always 0.
+  return dec->got_basic_info ? 0 : dec->basic_info_size_hint;
+}
+
+JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, int events_wanted) {
+  // Cannot subscribe to events after having started.
+  if (dec->stage != DecoderStage::kInited) return JXL_DEC_ERROR;
+  // Can only subscribe to informative events: the six lowest bits must be 0.
+  constexpr int kNonInformativeMask = 63;
+  if ((events_wanted & kNonInformativeMask) != 0) return JXL_DEC_ERROR;
+
+  dec->orig_events_wanted = events_wanted;
+  dec->events_wanted = events_wanted;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetKeepOrientation(JxlDecoder* dec,
+                                              JXL_BOOL skip_reorientation) {
+  if (dec->stage == DecoderStage::kInited) {  // options lock once started
+    dec->keep_orientation = (skip_reorientation != 0);
+    return JXL_DEC_SUCCESS;
+  }
+  return JXL_API_ERROR("Must set keep_orientation option before starting");
+}
+
+JxlDecoderStatus JxlDecoderSetUnpremultiplyAlpha(JxlDecoder* dec,
+                                                 JXL_BOOL unpremul_alpha) {
+  if (dec->stage == DecoderStage::kInited) {  // options lock once started
+    dec->unpremul_alpha = (unpremul_alpha != 0);
+    return JXL_DEC_SUCCESS;
+  }
+  return JXL_API_ERROR("Must set unpremul_alpha option before starting");
+}
+
+JxlDecoderStatus JxlDecoderSetRenderSpotcolors(JxlDecoder* dec,
+                                               JXL_BOOL render_spotcolors) {
+  if (dec->stage == DecoderStage::kInited) {  // options lock once started
+    dec->render_spotcolors = (render_spotcolors != 0);
+    return JXL_DEC_SUCCESS;
+  }
+  return JXL_API_ERROR("Must set render_spotcolors option before starting");
+}
+
+JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec, JXL_BOOL coalescing) {
+  if (dec->stage == DecoderStage::kInited) {  // options lock once started
+    dec->coalescing = (coalescing != 0);
+    return JXL_DEC_SUCCESS;
+  }
+  return JXL_API_ERROR("Must set coalescing option before starting");
+}
+
+namespace {
+// Writes the dimensions of the current image buffer to |xsize| and |ysize|.
+void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize) {
+  const bool keep = dec->keep_orientation;
+  if (dec->frame_header->nonserialized_is_preview) {
+    xsize = dec->metadata.oriented_preview_xsize(keep);
+    ysize = dec->metadata.oriented_preview_ysize(keep);
+    return;
+  }
+  xsize = dec->metadata.oriented_xsize(keep);
+  ysize = dec->metadata.oriented_ysize(keep);
+  if (dec->coalescing) return;
+  // Without coalescing the upsampled frame dimensions are used instead.
+  const auto frame_dim = dec->frame_header->ToFrameDimensions();
+  xsize = frame_dim.xsize_upsampled;
+  ysize = frame_dim.ysize_upsampled;
+  if (!keep && static_cast<int>(dec->metadata.m.GetOrientation()) > 4) {
+    std::swap(xsize, ysize);  // orientation values above 4 swap the axes
+  }
+}
+}  // namespace
+
+namespace jxl {
+namespace {
+
+template <class T>
+bool CanRead(Span<const uint8_t> data, BitReader* reader, T* JXL_RESTRICT t) {
+  // Probe with a separate reader at the same bit position: CanRead advances.
+  BitReader probe(data);
+  probe.SkipBits(reader->TotalBitsConsumed());
+  const bool readable = Bundle::CanRead(&probe, t);
+  JXL_ASSERT(probe.Close());
+  return readable;
+}
+
+// Reads bundle |t| from |reader|. Returns JXL_DEC_SUCCESS when it was fully
+// read, the result of dec->RequestMoreInput() when CanRead() reports that it
+// cannot be read from |data| yet, and JXL_DEC_ERROR when the read fails.
+template <class T>
+JxlDecoderStatus ReadBundle(JxlDecoder* dec, Span<const uint8_t> data,
+                            BitReader* reader, T* JXL_RESTRICT t) {
+  // Probe first: |reader| itself is only advanced by the actual read below.
+  const bool readable = CanRead(data, reader, t);
+  if (!readable) return dec->RequestMoreInput();
+
+  if (!Bundle::Read(reader, t)) return JXL_DEC_ERROR;
+  return JXL_DEC_SUCCESS;
+}
+// Converts |expr| with ConvertStatus(); returns it unless JXL_DEC_SUCCESS.
+#define JXL_API_RETURN_IF_ERROR(expr)               \
+  {                                                 \
+    JxlDecoderStatus status_ = ConvertStatus(expr); \
+    if (status_ != JXL_DEC_SUCCESS) return status_; \
+  }
+
+std::unique_ptr<BitReader, std::function<void(BitReader*)>> GetBitReader(
+    Span<const uint8_t> span) {
+  // The deleter queries AllReadsWithinBounds() before Close() so that Close()
+  // cannot abort the program on an out-of-bounds reader; otherwise every
+  // return path, including the failing ones, would have to do so by hand.
+  // An invalid JXL codestream must not make the program quit.
+  const auto close_and_delete = [](BitReader* br) {
+    (void)br->AllReadsWithinBounds();
+    (void)br->Close();
+    delete br;
+  };
+  return std::unique_ptr<BitReader, std::function<void(BitReader*)>>(
+      new BitReader(span), close_and_delete);
+}
+
+JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec) {
+  if (!dec->got_codestream_signature) {
+    // Check and skip the 2-byte codestream signature (0xff, kCodestreamMarker).
+    Span<const uint8_t> span;
+    JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+    if (span.size() < 2) {
+      return dec->RequestMoreInput();
+    }
+    if (span.data()[0] != 0xff || span.data()[1] != jxl::kCodestreamMarker) {
+      return JXL_API_ERROR("invalid signature");
+    }
+    dec->got_codestream_signature = true;
+    dec->AdvanceCodestream(2);
+  }
+  // Read the size bundle and the image metadata bundle from the input.
+  Span<const uint8_t> span;
+  JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+  auto reader = GetBitReader(span);
+  JXL_API_RETURN_IF_ERROR(
+      ReadBundle(dec, span, reader.get(), &dec->metadata.size));
+  JXL_API_RETURN_IF_ERROR(
+      ReadBundle(dec, span, reader.get(), &dec->metadata.m));
+  size_t total_bits = reader->TotalBitsConsumed();
+  // Advance by whole bytes only; the remaining bit count is kept separately.
+  dec->AdvanceCodestream(total_bits / jxl::kBitsPerByte);
+  dec->codestream_bits_ahead = total_bits % jxl::kBitsPerByte;
+  dec->got_basic_info = true;
+  dec->basic_info_size_hint = 0;
+  dec->image_metadata = dec->metadata.m;
+  JXL_DEBUG_V(2, "Decoded BasicInfo: %s", dec->metadata.DebugString().c_str());
+  // Reject images that CheckSizeLimit() does not accept.
+  if (!CheckSizeLimit(dec, dec->metadata.size.xsize(),
+                      dec->metadata.size.ysize())) {
+    return JXL_API_ERROR("image is too large");
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+// Reads all codestream headers (but not frame headers): transform data, ICC.
+JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
+  if (!dec->got_transform_data) {
+    Span<const uint8_t> span;
+    JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+    auto reader = GetBitReader(span);
+    reader->SkipBits(dec->codestream_bits_ahead);
+    dec->metadata.transform_data.nonserialized_xyb_encoded =
+        dec->metadata.m.xyb_encoded;
+    JXL_API_RETURN_IF_ERROR(
+        ReadBundle(dec, span, reader.get(), &dec->metadata.transform_data));
+    size_t total_bits = reader->TotalBitsConsumed();
+    dec->AdvanceCodestream(total_bits / jxl::kBitsPerByte);
+    dec->codestream_bits_ahead = total_bits % jxl::kBitsPerByte;
+    dec->got_transform_data = true;
+  }
+  // Fetch the input again: the step above may have advanced the codestream.
+  Span<const uint8_t> span;
+  JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+  auto reader = GetBitReader(span);
+  reader->SkipBits(dec->codestream_bits_ahead);
+
+  if (dec->metadata.m.color_encoding.WantICC()) {
+    jxl::Status status =
+        dec->icc_reader.Init(reader.get(), dec->memory_limit_base);
+    // Always check AllReadsWithinBounds, not all the C++ decoder implementation
+    // handles reader out of bounds correctly yet (e.g. context map). Not
+    // checking AllReadsWithinBounds can cause reader->Close() to trigger an
+    // assert, but we don't want library to quit program for invalid codestream.
+    if (!reader->AllReadsWithinBounds() ||
+        status.code() == StatusCode::kNotEnoughBytes) {
+      return dec->RequestMoreInput();
+    }
+    if (!status) {
+      // Other non-successful status is an error
+      return JXL_DEC_ERROR;
+    }
+    PaddedBytes icc;
+    status = dec->icc_reader.Process(reader.get(), &icc);
+    if (status.code() == StatusCode::kNotEnoughBytes) {
+      return dec->RequestMoreInput();
+    }
+    if (!status) {
+      // Other non-successful status is an error
+      return JXL_DEC_ERROR;
+    }
+    if (!dec->metadata.m.color_encoding.SetICCRaw(std::move(icc))) {
+      return JXL_DEC_ERROR;
+    }
+  }
+
+  dec->got_all_headers = true;
+  JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  // The reader is byte aligned now, so no bits remain ahead of the input.
+  dec->AdvanceCodestream(reader->TotalBitsConsumed() / jxl::kBitsPerByte);
+  dec->codestream_bits_ahead = 0;
+
+  if (!dec->passes_state) {
+    dec->passes_state.reset(new jxl::PassesDecoderState());
+  }
+
+  JXL_API_RETURN_IF_ERROR(
+      dec->passes_state->output_encoding_info.SetFromMetadata(dec->metadata));
+  if (dec->desired_intensity_target > 0) {
+    dec->passes_state->output_encoding_info.desired_intensity_target =
+        dec->desired_intensity_target;
+  }
+  dec->image_metadata = dec->metadata.m;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
+  Span<const uint8_t> span;
+  JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+  const auto& toc = dec->frame_dec->Toc();
+  size_t pos = 0;  // offset into span of the section currently looked at
+  std::vector<jxl::FrameDecoder::SectionInfo> section_info;
+  std::vector<jxl::FrameDecoder::SectionStatus> section_status;
+  for (size_t i = dec->next_section; i < toc.size(); ++i) {
+    if (dec->section_processed[i]) {
+      pos += toc[i].size;
+      continue;
+    }
+    size_t id = toc[i].id;
+    size_t size = toc[i].size;
+    if (OutOfBounds(pos, size, span.size())) {
+      break;  // this section does not fit in the available input
+    }
+    auto br =
+        new jxl::BitReader(jxl::Span<const uint8_t>(span.data() + pos, size));
+    section_info.emplace_back(jxl::FrameDecoder::SectionInfo{br, id, i});
+    section_status.emplace_back();
+    pos += size;
+  }
+  jxl::Status status = dec->frame_dec->ProcessSections(
+      section_info.data(), section_info.size(), section_status.data());
+  bool out_of_bounds = false;
+  for (const auto& info : section_info) {
+    if (!info.br->AllReadsWithinBounds()) {
+      // Mark out of bounds section, but keep closing and deleting the next
+      // ones as well.
+      out_of_bounds = true;
+    }
+    JXL_ASSERT(info.br->Close());
+    delete info.br;  // releases the reader allocated with new above
+  }
+  if (out_of_bounds) {
+    // If any bit reader indicates out of bounds, it's an error, not just
+    // needing more input, since we ensure only bit readers containing
+    // a complete section are provided to the FrameDecoder.
+    return JXL_API_ERROR("frame out of bounds");
+  }
+  if (!status) {
+    return JXL_API_ERROR("frame processing failed");
+  }
+  for (size_t i = 0; i < section_status.size(); ++i) {
+    auto status = section_status[i];
+    if (status == jxl::FrameDecoder::kDone) {
+      dec->section_processed[section_info[i].index] = 1;
+    } else if (status != jxl::FrameDecoder::kSkipped) {
+      return JXL_API_ERROR("unexpected section status");
+    }
+  }
+  size_t completed_prefix_bytes = 0;  // bytes of the leading processed sections
+  while (dec->next_section < dec->section_processed.size() &&
+         dec->section_processed[dec->next_section] == 1) {
+    completed_prefix_bytes += toc[dec->next_section].size;
+    ++dec->next_section;
+  }
+  dec->remaining_frame_size -= completed_prefix_bytes;
+  dec->AdvanceCodestream(completed_prefix_bytes);
+  return JXL_DEC_SUCCESS;
+}
+
+// TODO(eustas): no CodecInOut -> no image size reinforcement -> possible OOM.
+JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
+  // If no parallel runner is set, use the default
+  // TODO(lode): move this initialization to an appropriate location once the
+  // runner is used to decode pixels.
+  if (!dec->thread_pool) {
+    dec->thread_pool.reset(new jxl::ThreadPool(nullptr, nullptr));
+  }
+
+  // No matter what events are wanted, the basic info is always required.
+  if (!dec->got_basic_info) {
+    JxlDecoderStatus status = JxlDecoderReadBasicInfo(dec);
+    if (status != JXL_DEC_SUCCESS) return status;
+  }
+
+  if (dec->events_wanted & JXL_DEC_BASIC_INFO) {
+    dec->events_wanted &= ~JXL_DEC_BASIC_INFO;
+    return JXL_DEC_BASIC_INFO;
+  }
+
+  if (!dec->events_wanted) {
+    dec->stage = DecoderStage::kCodestreamFinished;
+    return JXL_DEC_SUCCESS;
+  }
+
+  if (!dec->got_all_headers) {
+    JxlDecoderStatus status = JxlDecoderReadAllHeaders(dec);
+    if (status != JXL_DEC_SUCCESS) return status;
+  }
+
+  if (dec->events_wanted & JXL_DEC_COLOR_ENCODING) {
+    dec->events_wanted &= ~JXL_DEC_COLOR_ENCODING;
+    return JXL_DEC_COLOR_ENCODING;
+  }
+
+  if (!dec->events_wanted) {
+    dec->stage = DecoderStage::kCodestreamFinished;
+    return JXL_DEC_SUCCESS;
+  }
+
+  dec->post_headers = true;
+
+  if (!dec->got_preview_image && dec->metadata.m.have_preview) {
+    dec->preview_frame = true;
+  }
+
+  // Handle frames
+  for (;;) {
+    bool parse_frames =
+        (dec->events_wanted &
+         (JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+    if (!parse_frames) {
+      break;
+    }
+    if (dec->frame_stage == FrameStage::kHeader && dec->is_last_total) {
+      break;
+    }
+    if (dec->frame_stage == FrameStage::kHeader) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata ||
+          dec->recon_output_jpeg == JpegReconStage::kOutputting) {
+        // The image bundle contains the JPEG reconstruction frame, but the
+        // decoder is still waiting to decode an EXIF or XMP box. It's not
+        // implemented to decode additional frames during this, and a JPEG
+        // reconstruction image should have only one frame.
+        return JXL_API_ERROR(
+            "cannot decode a next frame after JPEG reconstruction frame");
+      }
+#endif
+      if (!dec->ib) {
+        dec->ib.reset(new jxl::ImageBundle(&dec->image_metadata));
+      }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      // If JPEG reconstruction is wanted and possible, set the jpeg_data of
+      // the ImageBundle.
+      if (!dec->jpeg_decoder.SetImageBundleJpegData(dec->ib.get()))
+        return JXL_DEC_ERROR;
+#endif
+      dec->frame_dec.reset(new FrameDecoder(
+          dec->passes_state.get(), dec->metadata, dec->thread_pool.get(),
+          /*use_slow_rendering_pipeline=*/false));
+      dec->frame_header.reset(new FrameHeader(&dec->metadata));
+      Span<const uint8_t> span;
+      JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+      auto reader = GetBitReader(span);
+      jxl::Status status = dec->frame_dec->InitFrame(
+          reader.get(), dec->ib.get(), dec->preview_frame);
+      if (!reader->AllReadsWithinBounds() ||
+          status.code() == StatusCode::kNotEnoughBytes) {
+        return dec->RequestMoreInput();
+      } else if (!status) {
+        return JXL_API_ERROR("invalid frame header");
+      }
+      dec->AdvanceCodestream(reader->TotalBitsConsumed() / kBitsPerByte);
+      *dec->frame_header = dec->frame_dec->GetFrameHeader();
+      jxl::FrameDimensions frame_dim = dec->frame_header->ToFrameDimensions();
+      if (!CheckSizeLimit(dec, frame_dim.xsize_upsampled_padded,
+                          frame_dim.ysize_upsampled_padded)) {
+        return JXL_API_ERROR("frame is too large");
+      }
+      bool output_needed =
+          (dec->preview_frame ? (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)
+                              : (dec->events_wanted & JXL_DEC_FULL_IMAGE));
+      if (output_needed) {
+        JXL_API_RETURN_IF_ERROR(dec->frame_dec->InitFrameOutput());
+      }
+      if (dec->cpu_limit_base != 0) {
+        // No overflow, checked in CheckSizeLimit.
+        size_t num_pixels = frame_dim.xsize * frame_dim.ysize;
+        if (dec->used_cpu_base + num_pixels < dec->used_cpu_base) {
+          return JXL_API_ERROR("used too much CPU");
+        }
+        dec->used_cpu_base += num_pixels;
+        if (dec->used_cpu_base > dec->cpu_limit_base) {
+          return JXL_API_ERROR("used too much CPU");
+        }
+      }
+      dec->remaining_frame_size = dec->frame_dec->SumSectionSizes();
+
+      dec->frame_stage = FrameStage::kTOC;
+      if (dec->preview_frame) {
+        if (!(dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)) {
+          dec->frame_stage = FrameStage::kHeader;
+          dec->AdvanceCodestream(dec->remaining_frame_size);
+          dec->got_preview_image = true;
+          dec->preview_frame = false;
+        }
+        continue;
+      }
+
+      int saved_as = FrameDecoder::SavedAs(*dec->frame_header);
+      // is last in entire codestream
+      dec->is_last_total = dec->frame_header->is_last;
+      // is last of current still
+      dec->is_last_of_still =
+          dec->is_last_total || dec->frame_header->animation_frame.duration > 0;
+      // is kRegularFrame and coalescing is disabled
+      dec->is_last_of_still |=
+          (!dec->coalescing &&
+           dec->frame_header->frame_type == FrameType::kRegularFrame);
+      const size_t internal_frame_index = dec->internal_frames;
+      const size_t external_frame_index = dec->external_frames;
+      if (dec->is_last_of_still) dec->external_frames++;
+      dec->internal_frames++;
+
+      if (dec->skip_frames > 0) {
+        dec->skipping_frame = true;
+        if (dec->is_last_of_still) {
+          dec->skip_frames--;
+        }
+      } else {
+        dec->skipping_frame = false;
+      }
+
+      if (external_frame_index >= dec->frame_external_to_internal.size()) {
+        dec->frame_external_to_internal.push_back(internal_frame_index);
+        JXL_ASSERT(dec->frame_external_to_internal.size() ==
+                   external_frame_index + 1);
+      }
+
+      if (internal_frame_index >= dec->frame_saved_as.size()) {
+        dec->frame_saved_as.push_back(saved_as);
+        JXL_ASSERT(dec->frame_saved_as.size() == internal_frame_index + 1);
+
+        // add the value 0xff (which means all references) to new slots: we only
+        // know the references of the frame at FinalizeFrame, and fill in the
+        // correct values there. As long as this information is not known, the
+        // worst case where the frame depends on all storage slots is assumed.
+        dec->frame_references.push_back(0xff);
+        JXL_ASSERT(dec->frame_references.size() == internal_frame_index + 1);
+      }
+
+      if (dec->skipping_frame) {
+        // Whether this frame could be referenced by any future frame: either
+        // because it's a frame saved for blending or patches, or because it's
+        // a DC frame.
+        bool referenceable =
+            dec->frame_header->CanBeReferenced() ||
+            dec->frame_header->frame_type == FrameType::kDCFrame;
+        if (internal_frame_index < dec->frame_required.size() &&
+            !dec->frame_required[internal_frame_index]) {
+          referenceable = false;
+        }
+        if (!referenceable) {
+          // Skip all decoding for this frame, since the user is skipping this
+          // frame and no future frames can reference it.
+          dec->frame_stage = FrameStage::kHeader;
+          dec->AdvanceCodestream(dec->remaining_frame_size);
+          continue;
+        }
+      }
+
+      if ((dec->events_wanted & JXL_DEC_FRAME) && dec->is_last_of_still) {
+        // Only return this for the last of a series of stills: patches frames
+        // etc... before this one do not contain the correct information such
+        // as animation timing, ...
+        if (!dec->skipping_frame) {
+          return JXL_DEC_FRAME;
+        }
+      }
+    }
+    // kTOC: configure the frame decoder and decide whether pixels are needed.
+    if (dec->frame_stage == FrameStage::kTOC) {
+      dec->frame_dec->SetRenderSpotcolors(dec->render_spotcolors);
+      dec->frame_dec->SetCoalescing(dec->coalescing);
+
+      if (!dec->preview_frame &&
+          (dec->events_wanted & JXL_DEC_FRAME_PROGRESSION)) {
+        dec->frame_prog_detail =
+            dec->frame_dec->SetPauseAtProgressive(dec->prog_detail);
+      } else {
+        dec->frame_prog_detail = JxlProgressiveDetail::kFrames;
+      }
+      dec->dc_frame_progression_done = 0;
+
+      dec->next_section = 0;
+      dec->section_processed.clear();
+      dec->section_processed.resize(dec->frame_dec->Toc().size(), 0);
+
+      // If we don't need pixels, we can skip actually decoding the frames.
+      if (dec->preview_frame || (dec->events_wanted & JXL_DEC_FULL_IMAGE)) {
+        dec->frame_stage = FrameStage::kFull;
+      } else if (!dec->is_last_total) {
+        dec->frame_stage = FrameStage::kHeader;
+        dec->AdvanceCodestream(dec->remaining_frame_size);
+        continue;
+      } else {
+        break;
+      }
+    }
+    // kFull: process sections until the frame decoder has decoded them all.
+    if (dec->frame_stage == FrameStage::kFull) {
+      if (!dec->image_out_buffer_set) {
+        if (dec->preview_frame) {
+          return JXL_DEC_NEED_PREVIEW_OUT_BUFFER;
+        }
+        if (
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+            (!dec->jpeg_decoder.IsOutputSet() ||
+             dec->ib->jpeg_data == nullptr) &&
+#endif
+            dec->is_last_of_still && !dec->skipping_frame) {
+          // TODO(lode): remove the dec->is_last_of_still condition if the
+          // frame decoder needs the image buffer as working space for decoding
+          // non-visible or blending frames too
+          return JXL_DEC_NEED_IMAGE_OUT_BUFFER;
+        }
+      }
+
+      if (dec->image_out_buffer_set) {
+        size_t xsize, ysize;
+        GetCurrentDimensions(dec, xsize, ysize);
+        size_t bits_per_sample = GetBitDepth(
+            dec->image_out_bit_depth, dec->metadata.m, dec->image_out_format);
+        dec->frame_dec->SetImageOutput(
+            PixelCallback{
+                dec->image_out_init_callback, dec->image_out_run_callback,
+                dec->image_out_destroy_callback, dec->image_out_init_opaque},
+            reinterpret_cast<uint8_t*>(dec->image_out_buffer),
+            dec->image_out_size, xsize, ysize, dec->image_out_format,
+            bits_per_sample, dec->unpremul_alpha, !dec->keep_orientation);
+        for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
+          const auto& extra = dec->extra_channel_output[i];
+          size_t ec_bits_per_sample =
+              GetBitDepth(dec->image_out_bit_depth,
+                          dec->metadata.m.extra_channel_info[i], extra.format);
+          dec->frame_dec->AddExtraChannelOutput(extra.buffer, extra.buffer_size,
+                                                xsize, extra.format,
+                                                ec_bits_per_sample);
+        }
+      }
+
+      size_t next_num_passes_to_pause = dec->frame_dec->NextNumPassesToPause();
+
+      JXL_API_RETURN_IF_ERROR(JxlDecoderProcessSections(dec));
+
+      bool all_sections_done = dec->frame_dec->HasDecodedAll();
+      bool got_dc_only = !all_sections_done && dec->frame_dec->HasDecodedDC();
+
+      if (dec->frame_prog_detail >= JxlProgressiveDetail::kDC &&
+          !dec->dc_frame_progression_done && got_dc_only) {
+        dec->dc_frame_progression_done = true;
+        dec->downsampling_target = 8;
+        return JXL_DEC_FRAME_PROGRESSION;
+      }
+
+      bool new_progression_step_done =
+          dec->frame_dec->NumCompletePasses() >= next_num_passes_to_pause;
+
+      if (!all_sections_done &&
+          dec->frame_prog_detail >= JxlProgressiveDetail::kLastPasses &&
+          new_progression_step_done) {
+        dec->downsampling_target =
+            dec->frame_header->passes.GetDownsamplingTargetForCompletedPasses(
+                dec->frame_dec->NumCompletePasses());
+        return JXL_DEC_FRAME_PROGRESSION;
+      }
+
+      if (!all_sections_done) {
+        // Not all sections have been processed yet
+        return dec->RequestMoreInput();
+      }
+
+      if (!dec->preview_frame) {
+        size_t internal_index = dec->internal_frames - 1;
+        JXL_ASSERT(dec->frame_references.size() > internal_index);
+        // Always fill this in, even if it was already written, it could be that
+        // this frame was skipped before and set to 255, while only now we know
+        // the true value.
+        dec->frame_references[internal_index] = dec->frame_dec->References();
+      }
+
+      if (!dec->frame_dec->FinalizeFrame()) {
+        return JXL_API_ERROR("decoding frame failed");
+      }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      // If jpeg output was requested, we merely return the JXL_DEC_FULL_IMAGE
+      // status without outputting pixels.
+      if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
+        dec->frame_stage = FrameStage::kHeader;
+        dec->recon_output_jpeg = JpegReconStage::kSettingMetadata;
+        return JXL_DEC_FULL_IMAGE;
+      }
+#endif
+      if (dec->preview_frame || dec->is_last_of_still) {
+        dec->image_out_buffer_set = false;
+        dec->extra_channel_output.clear();
+      }
+    }
+
+    dec->frame_stage = FrameStage::kHeader;
+
+    // The pixels have been output or are not needed, do not keep them in
+    // memory here.
+    dec->ib.reset();
+    if (dec->preview_frame) {
+      dec->got_preview_image = true;
+      dec->preview_frame = false;
+      dec->events_wanted &= ~JXL_DEC_PREVIEW_IMAGE;
+      return JXL_DEC_PREVIEW_IMAGE;
+    } else if (dec->is_last_of_still &&
+               (dec->events_wanted & JXL_DEC_FULL_IMAGE) &&
+               !dec->skipping_frame) {
+      return JXL_DEC_FULL_IMAGE;
+    }
+  }
+
+  dec->stage = DecoderStage::kCodestreamFinished;
+  // Return success, this means there is nothing more to do.
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+}  // namespace jxl
+
+JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec, const uint8_t* data,
+                                    size_t size) {
+  if (dec->next_in != nullptr) {
+    return JXL_API_ERROR("already set input, use JxlDecoderReleaseInput first");
+  }
+  if (dec->input_closed) {
+    return JXL_API_ERROR("input already closed");
+  }
+  // Both checks passed: remember the caller's buffer as the pending input.
+  dec->avail_in = size;
+  dec->next_in = data;
+  return JXL_DEC_SUCCESS;
+}
+
+size_t JxlDecoderReleaseInput(JxlDecoder* dec) {
+  const size_t still_available = dec->avail_in;  // value handed to the caller
+  dec->avail_in = 0;
+  dec->next_in = nullptr;
+  return still_available;
+}
+
+void JxlDecoderCloseInput(JxlDecoder* dec) { dec->input_closed = true; }  // JxlDecoderSetInput fails from now on
+
+JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data,
+                                         size_t size) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  // The JPEG reconstruction buffer can only be set or updated before or during
+  // the first frame: the reconstruction box refers to the first frame and, in
+  // theory, multi-frame images should not be used with a jbrd box.
+  if (dec->internal_frames > 1) {
+    return JXL_API_ERROR("JPEG reconstruction only works for the first frame");
+  }
+  if (dec->jpeg_decoder.IsOutputSet()) {
+    return JXL_API_ERROR("Already set JPEG buffer");
+  }
+  return dec->jpeg_decoder.SetOutputBuffer(data, size);
+#else
+  return JXL_API_ERROR("JPEG reconstruction is not supported.");
+#endif
+}
+
+size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  return dec->jpeg_decoder.ReleaseOutputBuffer();
+#else
+  return 0;  // No JPEG buffer can be set here, so no bytes are unused.
+#endif
+}
+
+// Parses a box header laid out as: 32-bit size, 4-character type, and, only if
+// that size equals 1, a 64-bit size. Outputs the type and the stored box size.
+// @param in current input bytes.
+// @param size available input size.
+// @param pos position in the input, must begin at the header of the box.
+// @param file_pos position of pos since the start of the JXL file, rather than
+// the current input, used for integer overflow checking.
+// @param type the output box type.
+// @param box_size output the total box size, including header, in bytes, or 0
+// if it's a final unbounded box.
+// @param header_size output size of the box header.
+// @return JXL_DEC_SUCCESS if the box header was fully parsed. In that case the
+// parsing position must be incremented by header_size bytes.
+// JXL_DEC_NEED_MORE_INPUT if not enough input bytes available, in that case
+// header_size indicates a lower bound for the known size the header has to be
+// at least. JXL_DEC_ERROR if the box header is invalid.
+static JxlDecoderStatus ParseBoxHeader(const uint8_t* in, size_t size,
+                                       size_t pos, size_t file_pos,
+                                       JxlBoxType type, uint64_t* box_size,
+                                       uint64_t* header_size) {
+  if (OutOfBounds(pos, 8, size)) {
+    *header_size = 8;
+    return JXL_DEC_NEED_MORE_INPUT;
+  }
+  size_t box_start = pos;
+  // Box size, including this header itself.
+  *box_size = LoadBE32(in + pos);
+  pos += 4;
+  memcpy(type, in + pos, 4);  // The type precedes any 64-bit extended size.
+  pos += 4;
+  if (*box_size == 1) {  // Marker: the real size is the next 64-bit field.
+    *header_size = 16;
+    if (OutOfBounds(pos, 8, size)) return JXL_DEC_NEED_MORE_INPUT;
+    *box_size = LoadBE64(in + pos);
+    pos += 8;
+  }
+  *header_size = pos - box_start;
+  if (*box_size > 0 && *box_size < *header_size) {
+    return JXL_API_ERROR("invalid box size");
+  }
+  if (SumOverflows(file_pos, pos, *box_size)) {
+    return JXL_API_ERROR("Box size overflow");
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+// Processes boxes; a file without a container is handled as a bare codestream.
+static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
+  // Box handling loop
+  for (;;) {
+    if (dec->box_stage != BoxStage::kHeader) {
+      dec->AdvanceInput(dec->header_size);  // step over a just-parsed header
+      dec->header_size = 0;
+#if JPEGXL_ENABLE_BOXES
+      if ((dec->events_wanted & JXL_DEC_BOX) &&
+          dec->box_out_buffer_set_current_box) {
+        uint8_t* next_out = dec->box_out_buffer + dec->box_out_buffer_pos;
+        size_t avail_out = dec->box_out_buffer_size - dec->box_out_buffer_pos;
+
+        JxlDecoderStatus box_result = dec->box_content_decoder.Process(
+            dec->next_in, dec->avail_in,
+            dec->file_pos - dec->box_contents_begin, &next_out, &avail_out);
+        size_t produced =
+            next_out - (dec->box_out_buffer + dec->box_out_buffer_pos);
+        dec->box_out_buffer_pos += produced;
+
+        // Don't return JXL_DEC_NEED_MORE_INPUT: the box stages below, instead,
+        // handle the input progression, and the above only outputs the part of
+        // the box seen so far.
+        if (box_result != JXL_DEC_SUCCESS &&
+            box_result != JXL_DEC_NEED_MORE_INPUT) {
+          return box_result;
+        }
+      }
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->store_exif == 1 || dec->store_xmp == 1) {
+        std::vector<uint8_t>& metadata =
+            (dec->store_exif == 1) ? dec->exif_metadata : dec->xmp_metadata;
+        for (;;) {
+          if (metadata.empty()) metadata.resize(64);
+          uint8_t* orig_next_out = metadata.data() + dec->recon_out_buffer_pos;
+          uint8_t* next_out = orig_next_out;
+          size_t avail_out = metadata.size() - dec->recon_out_buffer_pos;
+          JxlDecoderStatus box_result = dec->metadata_decoder.Process(
+              dec->next_in, dec->avail_in,
+              dec->file_pos - dec->box_contents_begin, &next_out, &avail_out);
+          size_t produced = next_out - orig_next_out;
+          dec->recon_out_buffer_pos += produced;
+          if (box_result == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+            metadata.resize(metadata.size() * 2);
+          } else if (box_result == JXL_DEC_NEED_MORE_INPUT) {
+            break;  // box stage handling below will handle this instead
+          } else if (box_result == JXL_DEC_SUCCESS) {
+            size_t needed_size = (dec->store_exif == 1) ? dec->recon_exif_size
+                                                        : dec->recon_xmp_size;
+            if (dec->box_contents_unbounded &&
+                dec->recon_out_buffer_pos < needed_size) {
+              // Unbounded box, but we know the expected size due to the jbrd
+              // box's data. Treat this as the JXL_DEC_NEED_MORE_INPUT case.
+              break;
+            } else {
+              metadata.resize(dec->recon_out_buffer_pos);
+              if (dec->store_exif == 1) dec->store_exif = 2;
+              if (dec->store_xmp == 1) dec->store_xmp = 2;
+              break;
+            }
+          } else {
+            // error
+            return box_result;
+          }
+        }
+      }
+#endif
+    }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata &&
+        !dec->JbrdNeedMoreBoxes()) {
+      jxl::jpeg::JPEGData* jpeg_data = dec->ib->jpeg_data.get();
+      if (dec->recon_exif_size) {
+        JxlDecoderStatus status = jxl::JxlToJpegDecoder::SetExif(
+            dec->exif_metadata.data(), dec->exif_metadata.size(), jpeg_data);
+        if (status != JXL_DEC_SUCCESS) return status;
+      }
+      if (dec->recon_xmp_size) {
+        JxlDecoderStatus status = jxl::JxlToJpegDecoder::SetXmp(
+            dec->xmp_metadata.data(), dec->xmp_metadata.size(), jpeg_data);
+        if (status != JXL_DEC_SUCCESS) return status;
+      }
+      dec->recon_output_jpeg = JpegReconStage::kOutputting;
+    }
+
+    if (dec->recon_output_jpeg == JpegReconStage::kOutputting &&
+        !dec->JbrdNeedMoreBoxes()) {
+      JxlDecoderStatus status =
+          dec->jpeg_decoder.WriteOutput(*dec->ib->jpeg_data);
+      if (status != JXL_DEC_SUCCESS) return status;
+      dec->recon_output_jpeg = JpegReconStage::kFinished;
+      dec->ib.reset();
+      if (dec->events_wanted & JXL_DEC_FULL_IMAGE) {
+        // Return the full image event here now, this may be delayed if this
+        // could only be done after decoding an exif or xmp box after the
+        // codestream.
+        return JXL_DEC_FULL_IMAGE;
+      }
+    }
+#endif
+
+    if (dec->box_stage == BoxStage::kHeader) {
+      if (!dec->have_container) {
+        if (dec->stage == DecoderStage::kCodestreamFinished)
+          return JXL_DEC_SUCCESS;
+        dec->box_stage = BoxStage::kCodestream;
+        dec->box_contents_unbounded = true;
+        continue;
+      }
+      if (dec->avail_in == 0) {
+        if (dec->stage != DecoderStage::kCodestreamFinished) {
+          // Not yet seen (all) codestream boxes.
+          return JXL_DEC_NEED_MORE_INPUT;
+        }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+        if (dec->JbrdNeedMoreBoxes()) {
+          return JXL_DEC_NEED_MORE_INPUT;
+        }
+#endif
+        if (dec->input_closed) {
+          return JXL_DEC_SUCCESS;
+        }
+        if (!(dec->events_wanted & JXL_DEC_BOX)) {
+          // All codestream and jbrd metadata boxes finished, and no individual
+          // boxes requested by user, so no need to request any more input.
+          // This returns success for backwards compatibility, when
+          // JxlDecoderCloseInput and JXL_DEC_BOX did not exist, as well
+          // as for efficiency.
+          return JXL_DEC_SUCCESS;
+        }
+        // Even though we are exactly at a box end, there still may be more
+        // boxes. The user may call JxlDecoderCloseInput to indicate the input
+        // is finished and get success instead.
+        return JXL_DEC_NEED_MORE_INPUT;
+      }
+
+      bool boxed_codestream_done =
+          ((dec->events_wanted & JXL_DEC_BOX) &&
+           dec->stage == DecoderStage::kCodestreamFinished &&
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+           !dec->JbrdNeedMoreBoxes() &&
+#endif
+           dec->last_codestream_seen);
+      if (boxed_codestream_done && dec->avail_in >= 2 &&
+          dec->next_in[0] == 0xff &&
+          dec->next_in[1] == jxl::kCodestreamMarker) {
+        // We detected the start of the next naked codestream, so we can return
+        // success here.
+        return JXL_DEC_SUCCESS;
+      }
+
+      uint64_t box_size, header_size;
+      JxlDecoderStatus status =
+          ParseBoxHeader(dec->next_in, dec->avail_in, 0, dec->file_pos,
+                         dec->box_type, &box_size, &header_size);
+      if (status != JXL_DEC_SUCCESS) {
+        if (status == JXL_DEC_NEED_MORE_INPUT) {
+          dec->basic_info_size_hint =
+              InitialBasicInfoSizeHint() + header_size - dec->file_pos;
+        }
+        return status;
+      }
+      if (memcmp(dec->box_type, "brob", 4) == 0) {
+        if (dec->avail_in < header_size + 4) {
+          return JXL_DEC_NEED_MORE_INPUT;
+        }
+        memcpy(dec->box_decoded_type, dec->next_in + header_size,
+               sizeof(dec->box_decoded_type));
+      } else {
+        memcpy(dec->box_decoded_type, dec->box_type,
+               sizeof(dec->box_decoded_type));
+      }
+
+      // Box order validity checks
+      // The signature box at box_count == 1 is not checked here since that's
+      // already done at the beginning.
+      dec->box_count++;
+      if (boxed_codestream_done && memcmp(dec->box_type, "JXL ", 4) == 0) {
+        // We detected the start of the next boxed stream, so we can return
+        // success here.
+        return JXL_DEC_SUCCESS;
+      }
+      if (dec->box_count == 2 && memcmp(dec->box_type, "ftyp", 4) != 0) {
+        return JXL_API_ERROR("the second box must be the ftyp box");
+      }
+      if (memcmp(dec->box_type, "ftyp", 4) == 0 && dec->box_count != 2) {
+        return JXL_API_ERROR("the ftyp box must come second");
+      }
+
+      dec->box_contents_unbounded = (box_size == 0);
+      dec->box_contents_begin = dec->file_pos + header_size;
+      dec->box_contents_end =
+          dec->box_contents_unbounded ? 0 : (dec->file_pos + box_size);
+      dec->box_contents_size =
+          dec->box_contents_unbounded ? 0 : (box_size - header_size);
+      dec->box_size = box_size;
+      dec->header_size = header_size;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) {
+        // Initiate storing of Exif or XMP data for JPEG reconstruction
+        if (dec->store_exif == 0 &&
+            memcmp(dec->box_decoded_type, "Exif", 4) == 0) {
+          dec->store_exif = 1;
+          dec->recon_out_buffer_pos = 0;
+        }
+        if (dec->store_xmp == 0 &&
+            memcmp(dec->box_decoded_type, "xml ", 4) == 0) {
+          dec->store_xmp = 1;
+          dec->recon_out_buffer_pos = 0;
+        }
+      }
+#endif
+#if JPEGXL_ENABLE_BOXES
+      if (dec->events_wanted & JXL_DEC_BOX) {
+        bool decompress =
+            dec->decompress_boxes && memcmp(dec->box_type, "brob", 4) == 0;
+        dec->box_content_decoder.StartBox(
+            decompress, dec->box_contents_unbounded, dec->box_contents_size);
+      }
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->store_exif == 1 || dec->store_xmp == 1) {
+        bool brob = memcmp(dec->box_type, "brob", 4) == 0;
+        dec->metadata_decoder.StartBox(brob, dec->box_contents_unbounded,
+                                       dec->box_contents_size);
+      }
+#endif
+      if (memcmp(dec->box_type, "ftyp", 4) == 0) {
+        dec->box_stage = BoxStage::kFtyp;
+      } else if (memcmp(dec->box_type, "jxlc", 4) == 0) {
+        if (dec->last_codestream_seen) {
+          return JXL_API_ERROR("there can only be one jxlc box");
+        }
+        dec->last_codestream_seen = true;
+        dec->box_stage = BoxStage::kCodestream;
+      } else if (memcmp(dec->box_type, "jxlp", 4) == 0) {
+        dec->box_stage = BoxStage::kPartialCodestream;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      } else if ((dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) &&
+                 memcmp(dec->box_type, "jbrd", 4) == 0) {
+        if (!(dec->events_wanted & JXL_DEC_JPEG_RECONSTRUCTION)) {
+          return JXL_API_ERROR(
+              "multiple JPEG reconstruction boxes not supported");
+        }
+        dec->box_stage = BoxStage::kJpegRecon;
+#endif
+      } else {
+        dec->box_stage = BoxStage::kSkip;
+      }
+
+      if (dec->events_wanted & JXL_DEC_BOX) {
+        dec->box_event = true;
+        dec->box_out_buffer_set_current_box = false;
+        return JXL_DEC_BOX;
+      }
+    } else if (dec->box_stage == BoxStage::kFtyp) {
+      if (dec->box_contents_size < 12) {
+        return JXL_API_ERROR("file type box too small");
+      }
+      if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
+      if (memcmp(dec->next_in, "jxl ", 4) != 0) {
+        return JXL_API_ERROR("file type box major brand must be \"jxl \"");
+      }
+      dec->AdvanceInput(4);
+      dec->box_stage = BoxStage::kSkip;
+    } else if (dec->box_stage == BoxStage::kPartialCodestream) {
+      if (dec->last_codestream_seen) {
+        return JXL_API_ERROR("cannot have jxlp box after last jxlp box");
+      }
+      // TODO(lode): error if box is unbounded but last bit not set
+      // Validate the size first (as for ftyp): no input can fix a short box.
+      if (!dec->box_contents_unbounded && dec->box_contents_size < 4) {
+        return JXL_API_ERROR("jxlp box too small to contain index");
+      }
+      if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
+      size_t jxlp_index = LoadBE32(dec->next_in);
+      // The high bit of jxlp_index marks the last jxlp box.
+      if (jxlp_index & 0x80000000) {
+        dec->last_codestream_seen = true;
+      }
+      dec->AdvanceInput(4);
+      dec->box_stage = BoxStage::kCodestream;
+    } else if (dec->box_stage == BoxStage::kCodestream) {
+      JxlDecoderStatus status = jxl::JxlDecoderProcessCodestream(dec);
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (status == JXL_DEC_FULL_IMAGE) {
+        if (dec->recon_output_jpeg != JpegReconStage::kNone) {
+          continue;
+        }
+      }
+#endif
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        if (dec->file_pos == dec->box_contents_end &&
+            !dec->box_contents_unbounded) {
+          dec->box_stage = BoxStage::kHeader;
+          continue;
+        }
+      }
+
+      if (status == JXL_DEC_SUCCESS) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+        if (dec->JbrdNeedMoreBoxes()) {
+          dec->box_stage = BoxStage::kSkip;
+          continue;
+        }
+#endif
+        if (dec->box_contents_unbounded) {
+          // Last box reached and codestream done, nothing more to do.
+          break;
+        }
+        if (dec->events_wanted & JXL_DEC_BOX) {
+          // Codestream done, but there may be more other boxes.
+          dec->box_stage = BoxStage::kSkip;
+          continue;
+        }
+      }
+      return status;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    } else if (dec->box_stage == BoxStage::kJpegRecon) {
+      if (!dec->jpeg_decoder.IsParsingBox()) {
+        // This is a new JPEG reconstruction metadata box.
+        dec->jpeg_decoder.StartBox(dec->box_contents_unbounded,
+                                   dec->box_contents_size);
+      }
+      const uint8_t* next_in = dec->next_in;
+      size_t avail_in = dec->avail_in;
+      JxlDecoderStatus recon_result =
+          dec->jpeg_decoder.Process(&next_in, &avail_in);
+      size_t consumed = next_in - dec->next_in;
+      dec->AdvanceInput(consumed);
+      if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) {
+        jxl::jpeg::JPEGData* jpeg_data = dec->jpeg_decoder.GetJpegData();
+        size_t num_exif = jxl::JxlToJpegDecoder::NumExifMarkers(*jpeg_data);
+        size_t num_xmp = jxl::JxlToJpegDecoder::NumXmpMarkers(*jpeg_data);
+        if (num_exif) {
+          if (num_exif > 1) {
+            return JXL_API_ERROR(
+                "multiple exif markers for JPEG reconstruction not supported");
+          }
+          if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::ExifBoxContentSize(
+                                     *jpeg_data, &dec->recon_exif_size)) {
+            return JXL_API_ERROR("invalid jbrd exif size");
+          }
+        }
+        if (num_xmp) {
+          if (num_xmp > 1) {
+            return JXL_API_ERROR(
+                "multiple XMP markers for JPEG reconstruction not supported");
+          }
+          if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::XmlBoxContentSize(
+                                     *jpeg_data, &dec->recon_xmp_size)) {
+            return JXL_API_ERROR("invalid jbrd XMP size");
+          }
+        }
+
+        dec->box_stage = BoxStage::kHeader;
+        // If successful JPEG reconstruction, return the success if the user
+        // cares about it, otherwise continue.
+        if (dec->events_wanted & recon_result) {
+          dec->events_wanted &= ~recon_result;
+          return recon_result;
+        }
+      } else {
+        // If anything else, return the result.
+        return recon_result;
+      }
+#endif
+    } else if (dec->box_stage == BoxStage::kSkip) {
+      if (dec->box_contents_unbounded) {
+        if (dec->input_closed) {
+          return JXL_DEC_SUCCESS;
+        }
+        if (!(dec->box_out_buffer_set)) {
+          // An unbounded box is always the last box. Not requesting box data,
+          // so return success even if JxlDecoderCloseInput was not called for
+          // backwards compatibility as well as efficiency since this box is
+          // being skipped.
+          return JXL_DEC_SUCCESS;
+        }
+        // Arbitrarily more bytes may follow, only JxlDecoderCloseInput can
+        // mark the end.
+        dec->AdvanceInput(dec->avail_in);
+        return JXL_DEC_NEED_MORE_INPUT;
+      }
+      // Amount of remaining bytes in the box that is being skipped.
+      size_t remaining = dec->box_contents_end - dec->file_pos;
+      if (dec->avail_in < remaining) {
+        // Indicate how many more bytes needed starting from next_in.
+        dec->basic_info_size_hint =
+            InitialBasicInfoSizeHint() + dec->box_contents_end - dec->file_pos;
+        // Don't have the full box yet, skip all we have so far
+        dec->AdvanceInput(dec->avail_in);
+        return JXL_DEC_NEED_MORE_INPUT;
+      } else {
+        // Full box available, skip all its remaining bytes
+        dec->AdvanceInput(remaining);
+        dec->box_stage = BoxStage::kHeader;
+      }
+    } else {
+      JXL_DASSERT(false);  // unknown box stage
+    }
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
+  if (dec->stage == DecoderStage::kInited) {
+    dec->stage = DecoderStage::kStarted;
+  }
+  if (dec->stage == DecoderStage::kError) {
+    return JXL_API_ERROR(
+        "Cannot keep using decoder after it encountered an error, use "
+        "JxlDecoderReset to reset it");
+  }
+  // The signature is examined once, before any box handling.
+  if (!dec->got_signature) {
+    const JxlSignature kind = JxlSignatureCheck(dec->next_in, dec->avail_in);
+    if (kind == JXL_SIG_INVALID) return JXL_API_ERROR("invalid signature");
+    if (kind == JXL_SIG_NOT_ENOUGH_BYTES) {
+      if (dec->input_closed) {
+        return JXL_API_ERROR("file too small for signature");
+      }
+      return JXL_DEC_NEED_MORE_INPUT;
+    }
+    // Signature accepted: it is not looked at again on later calls.
+    dec->got_signature = true;
+    if (kind != JXL_SIG_CONTAINER) {
+      // Not a container: HandleBoxes treats the input as a bare codestream.
+      dec->last_codestream_seen = true;
+    } else {
+      dec->have_container = 1;
+    }
+  }
+
+  const JxlDecoderStatus result = HandleBoxes(dec);
+
+  if (dec->input_closed && result == JXL_DEC_NEED_MORE_INPUT) {
+    return JXL_API_ERROR("missing input");
+  }
+
+  // Anything but success is passed on unchanged. A success from the box
+  // handling still has to be checked for data that never arrived.
+  if (result != JXL_DEC_SUCCESS) return result;
+  if (dec->CanUseMoreCodestreamInput()) {
+    return JXL_API_ERROR("codestream never finished");
+  }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  if (dec->JbrdNeedMoreBoxes()) {
+    return JXL_API_ERROR("missing metadata boxes for jpeg reconstruction");
+  }
+#endif
+
+  // Box handling succeeded and nothing is missing.
+  return JXL_DEC_SUCCESS;
+}
+
+// ABI guard: the size of JxlBasicInfo is fixed for forward-compatibility.
+static_assert(sizeof(JxlBasicInfo) == 204,
+              "JxlBasicInfo struct size should remain constant");
+
+JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec,
+                                        JxlBasicInfo* info) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+  // With a null info, the call only reports that the basic info is available.
+  if (info == nullptr) return JXL_DEC_SUCCESS;
+
+  const jxl::ImageMetadata& meta = dec->metadata.m;
+  memset(info, 0, sizeof(*info));
+
+  info->have_container = dec->have_container;
+  info->xsize = dec->metadata.size.xsize();
+  info->ysize = dec->metadata.size.ysize();
+  info->uses_original_profile = !meta.xyb_encoded;
+  info->bits_per_sample = meta.bit_depth.bits_per_sample;
+  info->exponent_bits_per_sample = meta.bit_depth.exponent_bits_per_sample;
+  info->have_preview = meta.have_preview;
+  info->have_animation = meta.have_animation;
+
+  // Unless the stored orientation is kept, report identity and, for values
+  // from JXL_ORIENT_TRANSPOSE upward, exchange the reported width and height.
+  info->orientation = static_cast<JxlOrientation>(meta.orientation);
+  if (!dec->keep_orientation) {
+    if (info->orientation >= JXL_ORIENT_TRANSPOSE) {
+      std::swap(info->xsize, info->ysize);
+    }
+    info->orientation = JXL_ORIENT_IDENTITY;
+  }
+
+  // Tone mapping: a positive desired_intensity_target replaces the stored one.
+  info->intensity_target = meta.IntensityTarget();
+  if (dec->desired_intensity_target > 0) {
+    info->intensity_target = dec->desired_intensity_target;
+  }
+  info->min_nits = meta.tone_mapping.min_nits;
+  info->relative_to_max_display = meta.tone_mapping.relative_to_max_display;
+  info->linear_below = meta.tone_mapping.linear_below;
+
+  const jxl::ExtraChannelInfo* alpha = meta.Find(jxl::ExtraChannel::kAlpha);
+  if (alpha == nullptr) {
+    info->alpha_bits = 0;
+    info->alpha_exponent_bits = 0;
+    info->alpha_premultiplied = 0;
+  } else {
+    info->alpha_bits = alpha->bit_depth.bits_per_sample;
+    info->alpha_exponent_bits = alpha->bit_depth.exponent_bits_per_sample;
+    info->alpha_premultiplied = alpha->alpha_associated;
+  }
+
+  const bool is_gray =
+      meta.color_encoding.GetColorSpace() == jxl::ColorSpace::kGray;
+  info->num_color_channels = is_gray ? 1 : 3;
+  info->num_extra_channels = meta.num_extra_channels;
+
+  // Preview/animation fields stay zeroed (memset above) unless present.
+  if (info->have_preview) {
+    info->preview.xsize = meta.preview_size.xsize();
+    info->preview.ysize = meta.preview_size.ysize();
+  }
+  if (info->have_animation) {
+    info->animation.tps_numerator = meta.animation.tps_numerator;
+    info->animation.tps_denominator = meta.animation.tps_denominator;
+    info->animation.num_loops = meta.animation.num_loops;
+    info->animation.have_timecodes = meta.animation.have_timecodes;
+  }
+
+  // Without an explicit intrinsic size, fall back to the reported image size.
+  if (meta.have_intrinsic_size) {
+    info->intrinsic_xsize = meta.intrinsic_size.xsize();
+    info->intrinsic_ysize = meta.intrinsic_size.ysize();
+  } else {
+    info->intrinsic_xsize = info->xsize;
+    info->intrinsic_ysize = info->ysize;
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetExtraChannelInfo(const JxlDecoder* dec,
+                                               size_t index,
+                                               JxlExtraChannelInfo* info) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+
+  const std::vector<jxl::ExtraChannelInfo>& channels =
+      dec->metadata.m.extra_channel_info;
+
+  if (index >= channels.size()) return JXL_DEC_ERROR;  // out of bounds
+  // As in JxlDecoderGetBasicInfo, a null info only queries availability and
+  // must not be written through.
+  if (info == nullptr) return JXL_DEC_SUCCESS;
+  const jxl::ExtraChannelInfo& channel = channels[index];
+
+  info->type = static_cast<JxlExtraChannelType>(channel.type);
+  info->bits_per_sample = channel.bit_depth.bits_per_sample;
+  info->exponent_bits_per_sample =
+      channel.bit_depth.floating_point_sample
+          ? channel.bit_depth.exponent_bits_per_sample
+          : 0;
+  info->dim_shift = channel.dim_shift;
+  info->name_length = channel.name.size();
+  info->alpha_premultiplied = channel.alpha_associated;
+  for (size_t c = 0; c < 4; ++c) info->spot_color[c] = channel.spot_color[c];
+  info->cfa_channel = channel.cfa_channel;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec,
+                                               size_t index, char* name,
+                                               size_t size) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+
+  const auto& all_channels = dec->metadata.m.extra_channel_info;
+  // Reject an index past the last extra channel.
+  if (!(index < all_channels.size())) return JXL_DEC_ERROR;
+
+  const auto& stored_name = all_channels[index].name;
+  // The output must hold the characters plus the terminating null.
+  const size_t bytes_needed = stored_name.size() + 1;
+  if (bytes_needed > size) return JXL_DEC_ERROR;
+
+  // c_str() is null-terminated, so this copies the terminator as well.
+  memcpy(name, stored_name.c_str(), bytes_needed);
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+
+// Stores in *encoding the jxl::ColorEncoding for the requested target: the
+// decoder's output encoding for JXL_COLOR_PROFILE_TARGET_DATA on xyb_encoded
+// images, otherwise the encoding from the image metadata.
+JxlDecoderStatus GetColorEncodingForTarget(
+    const JxlDecoder* dec, JxlColorProfileTarget target,
+    const jxl::ColorEncoding** encoding) {
+  if (!dec->got_all_headers) return JXL_DEC_NEED_MORE_INPUT;
+  const bool wants_output_encoding =
+      dec->metadata.m.xyb_encoded && target == JXL_COLOR_PROFILE_TARGET_DATA;
+  if (wants_output_encoding) {
+    *encoding = &dec->passes_state->output_encoding_info.color_encoding;
+    return JXL_DEC_SUCCESS;
+  }
+  *encoding = &dec->metadata.m.color_encoding;
+  return JXL_DEC_SUCCESS;
+}
+}  // namespace
+
+JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, JxlColorEncoding* color_encoding) {
+  const jxl::ColorEncoding* internal = nullptr;
+  const JxlDecoderStatus lookup =
+      GetColorEncodingForTarget(dec, target, &internal);
+  if (lookup != JXL_DEC_SUCCESS) return lookup;
+
+  // An encoding that wants an ICC profile has no encoded form to report.
+  if (internal->WantICC()) return JXL_DEC_ERROR;
+
+  // A null color_encoding only asks whether an encoded profile exists.
+  if (color_encoding != nullptr) {
+    ConvertInternalToExternalColorEncoding(*internal, color_encoding);
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetICCProfileSize(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, size_t* size) {
+  const jxl::ColorEncoding* selected = nullptr;
+  const JxlDecoderStatus lookup =
+      GetColorEncodingForTarget(dec, target, &selected);
+  if (lookup != JXL_DEC_SUCCESS) return lookup;
+
+  if (selected->WantICC()) {
+    // Note that the color space tested here is always the one from the image
+    // metadata, whichever encoding was selected above.
+    const jxl::ColorSpace space =
+        dec->metadata.m.color_encoding.GetColorSpace();
+    const bool no_icc_available = space == jxl::ColorSpace::kUnknown ||
+                                  space == jxl::ColorSpace::kXYB;
+    // TODO(lode): for the XYB case, do we want to craft an ICC profile that
+    // represents XYB as an RGB profile? It may be possible, but not with
+    // only 1D transfer functions.
+    if (no_icc_available) return JXL_DEC_ERROR;
+  }
+
+  // A null size only asks whether an ICC profile is available.
+  if (size != nullptr) {
+    *size = selected->ICC().size();
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetColorAsICCProfile(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, uint8_t* icc_profile, size_t size) {
+  // The size query also rejects missing headers and the unknown/XYB cases.
+  size_t required;
+  JxlDecoderStatus result =
+      JxlDecoderGetICCProfileSize(dec, nullptr, target, &required);
+  if (result != JXL_DEC_SUCCESS) return result;
+  if (size < required) return JXL_API_ERROR("ICC profile output too small");
+
+  const jxl::ColorEncoding* source = nullptr;
+  result = GetColorEncodingForTarget(dec, target, &source);
+  if (result != JXL_DEC_SUCCESS) return result;
+
+  // Copy the profile bytes; extra room in icc_profile is left untouched.
+  const auto& icc = source->ICC();
+  memcpy(icc_profile, icc.data(), icc.size());
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+
+// Validates that a buffer size can be computed for the given pixel format and
+// stores the number of bits per channel of format->data_type in *bits.
+JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec,
+                                  const JxlPixelFormat* format, size_t* bits) {
+  // Image dimensions are unknown until the basic info has been decoded.
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+  // Without coalescing, the current frame's header must already be known.
+  const bool frame_known =
+      dec->frame_header && dec->frame_stage != FrameStage::kHeader;
+  if (!dec->coalescing && !frame_known) {
+    return JXL_API_ERROR("Don't know frame dimensions yet");
+  }
+  if (4 < format->num_channels) {
+    return JXL_API_ERROR("More than 4 channels not supported");
+  }
+  // A result of 0 marks an invalid or unsupported data type.
+  const size_t bits_per_channel = BitsPerChannel(format->data_type);
+  *bits = bits_per_channel;
+  if (bits_per_channel == 0) {
+    return JXL_API_ERROR("Invalid/unsupported data type");
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+
+size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec) {
+  return dec->downsampling_target;  // Returned as stored, no computation here.
+}
+
+// Calls Flush() on the current frame's decoder. Returns JXL_DEC_ERROR when no
+// image output was set, when the frame stage is not FrameStage::kFull, when
+// DC has not been decoded yet, or when Flush() itself fails.
+JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
+  const bool ready =
+      dec->image_out_buffer_set && dec->frame_stage == FrameStage::kFull;
+  if (!ready) return JXL_DEC_ERROR;
+
+  JXL_DASSERT(dec->frame_dec);
+  auto& frame_decoder = *dec->frame_dec;
+  // FrameDecoder::Flush currently only works correctly once DC has been
+  // decoded.
+  if (!frame_decoder.HasDecodedDC()) return JXL_DEC_ERROR;
+
+  if (!frame_decoder.Flush()) return JXL_DEC_ERROR;
+
+  return JXL_DEC_SUCCESS;
+}
+
+// Reports the minimum preview buffer size in bytes for `format`: all rows but
+// the last are padded to a multiple of `format->align`, the last one is not.
+JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) {
+  size_t bits;
+  const JxlDecoderStatus check = PrepareSizeCheck(dec, format, &bits);
+  if (check != JXL_DEC_SUCCESS) return check;
+  const bool is_gray = dec->image_metadata.color_encoding.IsGray();
+  if (!is_gray && format->num_channels < 3) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+
+  const bool keep = dec->keep_orientation;
+  const size_t width = dec->metadata.oriented_preview_xsize(keep);
+  const size_t height = dec->metadata.oriented_preview_ysize(keep);
+  const size_t packed_row =
+      jxl::DivCeil(width * format->num_channels * bits, jxl::kBitsPerByte);
+  const size_t align = format->align > 1 ? format->align : 1;
+  const size_t stride = jxl::DivCeil(packed_row, align) * align;
+  *size = stride * (height - 1) + packed_row;
+  return JXL_DEC_SUCCESS;
+}
+
+// Registers `buffer` (`size` bytes) as the preview output. Requires basic
+// info, a preview in the image and the JXL_DEC_PREVIEW_IMAGE event.
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer(
+    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) {
+  const bool preview_wanted =
+      dec->got_basic_info && dec->metadata.m.have_preview &&
+      (dec->orig_events_wanted & JXL_DEC_PREVIEW_IMAGE);
+  if (!preview_wanted) {
+    return JXL_API_ERROR("No preview out buffer needed at this time");
+  }
+  const bool is_gray = dec->image_metadata.color_encoding.IsGray();
+  if (!is_gray && format->num_channels < 3) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+
+  // The size query also validates the format and that basic info is known.
+  size_t required_size;
+  const JxlDecoderStatus size_status =
+      JxlDecoderPreviewOutBufferSize(dec, format, &required_size);
+  if (size_status != JXL_DEC_SUCCESS) return size_status;
+  if (size < required_size) return JXL_DEC_ERROR;
+  dec->image_out_format = *format;
+  dec->image_out_buffer = buffer;
+  dec->image_out_size = size;
+  dec->image_out_buffer_set = true;
+  return JXL_DEC_SUCCESS;
+}
+
+// Reports the minimum image out buffer size in bytes for `format`; every row
+// (including the last) is padded to a multiple of `format->align`.
+JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) {
+  size_t bits;
+  const JxlDecoderStatus check = PrepareSizeCheck(dec, format, &bits);
+  if (check != JXL_DEC_SUCCESS) return check;
+  const bool is_gray = dec->image_metadata.color_encoding.IsGray();
+  if (!is_gray && format->num_channels < 3) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+
+  size_t width, height;
+  GetCurrentDimensions(dec, width, height);
+  const size_t packed_row =
+      jxl::DivCeil(width * format->num_channels * bits, jxl::kBitsPerByte);
+  const size_t align = format->align > 1 ? format->align : 1;
+  *size = jxl::DivCeil(packed_row, align) * align * height;
+  return JXL_DEC_SUCCESS;
+}
+
+// Registers `buffer` (`size` bytes) as the full image output. Requires basic
+// info and the JXL_DEC_FULL_IMAGE event; refused once a callback is in use.
+JxlDecoderStatus JxlDecoderSetImageOutBuffer(JxlDecoder* dec,
+                                             const JxlPixelFormat* format,
+                                             void* buffer, size_t size) {
+  const bool image_wanted =
+      dec->got_basic_info && (dec->orig_events_wanted & JXL_DEC_FULL_IMAGE);
+  if (!image_wanted) {
+    return JXL_API_ERROR("No image out buffer needed at this time");
+  }
+  if (dec->image_out_buffer_set && !!dec->image_out_run_callback) {
+    return JXL_API_ERROR(
+        "Cannot change from image out callback to image out buffer");
+  }
+  const bool is_gray = dec->image_metadata.color_encoding.IsGray();
+  if (!is_gray && format->num_channels < 3) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+  // The size query also validates the format and that basic info is known.
+  size_t required_size;
+  const JxlDecoderStatus size_status =
+      JxlDecoderImageOutBufferSize(dec, format, &required_size);
+  if (size_status != JXL_DEC_SUCCESS) return size_status;
+  if (size < required_size) return JXL_DEC_ERROR;
+  dec->image_out_format = *format;
+  dec->image_out_buffer = buffer;
+  dec->image_out_size = size;
+  dec->image_out_buffer_set = true;
+  return JXL_DEC_SUCCESS;
+}
+
+// Reports the minimum buffer size in bytes for extra channel `index`. The
+// size is computed for one channel; `format->num_channels` is not used for
+// it (PrepareSizeCheck still validates it).
+JxlDecoderStatus JxlDecoderExtraChannelBufferSize(const JxlDecoder* dec,
+                                                  const JxlPixelFormat* format,
+                                                  size_t* size,
+                                                  uint32_t index) {
+  const bool image_wanted =
+      dec->got_basic_info && (dec->orig_events_wanted & JXL_DEC_FULL_IMAGE);
+  if (!image_wanted) {
+    return JXL_API_ERROR("No extra channel buffer needed at this time");
+  }
+  if (index >= dec->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR("Invalid extra channel index");
+  }
+
+  size_t bits;
+  const JxlDecoderStatus check = PrepareSizeCheck(dec, format, &bits);
+  if (check != JXL_DEC_SUCCESS) return check;
+
+  size_t width, height;
+  GetCurrentDimensions(dec, width, height);
+  // One sample per pixel: a row holds `width * bits` bits before padding.
+  const size_t packed_row = jxl::DivCeil(width * bits, jxl::kBitsPerByte);
+  const size_t align = format->align > 1 ? format->align : 1;
+  *size = jxl::DivCeil(packed_row, align) * align * height;
+
+  return JXL_DEC_SUCCESS;
+}
+
+// Registers `buffer` (`size` bytes) as the output for extra channel `index`.
+// The stored format is a copy of `*format` with num_channels forced to 1.
+JxlDecoderStatus JxlDecoderSetExtraChannelBuffer(JxlDecoder* dec,
+                                                 const JxlPixelFormat* format,
+                                                 void* buffer, size_t size,
+                                                 uint32_t index) {
+  // The size query also validates format, index and basic info availability.
+  size_t required_size;
+  const JxlDecoderStatus size_status =
+      JxlDecoderExtraChannelBufferSize(dec, format, &required_size, index);
+  if (size_status != JXL_DEC_SUCCESS) return size_status;
+  if (size < required_size) return JXL_DEC_ERROR;
+
+  auto& outputs = dec->extra_channel_output;
+  if (outputs.size() <= index) {
+    outputs.resize(dec->metadata.m.num_extra_channels, {{}, nullptr, 0});
+  }
+  // Guaranteed correct thanks to check in JxlDecoderExtraChannelBufferSize.
+  JXL_ASSERT(index < dec->extra_channel_output.size());
+
+  auto& slot = outputs[index];
+  slot.format = *format;
+  slot.format.num_channels = 1;
+  slot.buffer = buffer;
+  slot.buffer_size = size;
+  return JXL_DEC_SUCCESS;
+}
+
+// Installs a single-threaded pixel callback by adapting it to the
+// multithreaded callback interface; the adapter state is kept in
+// dec->simple_image_out_callback.
+JxlDecoderStatus JxlDecoderSetImageOutCallback(JxlDecoder* dec,
+                                               const JxlPixelFormat* format,
+                                               JxlImageOutCallback callback,
+                                               void* opaque) {
+  dec->simple_image_out_callback.callback = callback;
+  dec->simple_image_out_callback.opaque = opaque;
+  // Init step: nothing to set up, init_opaque is reused as run_opaque.
+  const auto init = +[](void* init_opaque, size_t num_threads,
+                       size_t num_pixels_per_thread) { return init_opaque; };
+  // Run step: forward to the user callback, dropping the thread id.
+  const auto run = +[](void* run_opaque, size_t thread_id, size_t x, size_t y,
+                      size_t num_pixels, const void* pixels) {
+    const auto* const adapter =
+        static_cast<const JxlDecoder::SimpleImageOutCallback*>(run_opaque);
+    adapter->callback(adapter->opaque, x, y, num_pixels, pixels);
+  };
+  const auto destroy = +[](void* run_opaque) {};
+  return JxlDecoderSetMultithreadedImageOutCallback(
+      dec, format, init, run, /*destroy_callback=*/destroy,
+      &dec->simple_image_out_callback);
+}
+
+// Installs the init/run/destroy callbacks used to deliver image pixels.
+// Refused once an image out buffer is in use or if any callback is null.
+JxlDecoderStatus JxlDecoderSetMultithreadedImageOutCallback(
+    JxlDecoder* dec, const JxlPixelFormat* format,
+    JxlImageOutInitCallback init_callback, JxlImageOutRunCallback run_callback,
+    JxlImageOutDestroyCallback destroy_callback, void* init_opaque) {
+  if (dec->image_out_buffer_set && !!dec->image_out_buffer) {
+    return JXL_API_ERROR(
+        "Cannot change from image out buffer to image out callback");
+  }
+  const bool all_callbacks_given =
+      init_callback != nullptr && run_callback != nullptr &&
+      destroy_callback != nullptr;
+  if (!all_callbacks_given) return JXL_API_ERROR("All callbacks are required");
+
+  // Only the validation matters here; the bit count itself is not needed.
+  size_t unused_bits;
+  const JxlDecoderStatus check = PrepareSizeCheck(dec, format, &unused_bits);
+  if (check != JXL_DEC_SUCCESS) return check;
+
+  dec->image_out_format = *format;
+  dec->image_out_init_opaque = init_opaque;
+  dec->image_out_init_callback = init_callback;
+  dec->image_out_run_callback = run_callback;
+  dec->image_out_destroy_callback = destroy_callback;
+  dec->image_out_buffer_set = true;
+  return JXL_DEC_SUCCESS;
+}
+
+// Fills `*header` from the current frame header; fails while none is
+// available. With coalescing enabled no crop is reported and the blend info
+// is fixed to JXL_BLEND_REPLACE.
+JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
+                                          JxlFrameHeader* header) {
+  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
+    return JXL_API_ERROR("no frame header available");
+  }
+
+  const auto& metadata = dec->metadata.m;
+  const auto& frame = *dec->frame_header;
+  memset(header, 0, sizeof(*header));
+  auto& layer = header->layer_info;
+
+  // Duration and timecode keep their zeroed value unless the image has them.
+  if (metadata.have_animation) {
+    header->duration = frame.animation_frame.duration;
+    if (metadata.animation.have_timecodes) {
+      header->timecode = frame.animation_frame.timecode;
+    }
+  }
+  header->name_length = frame.name.size();
+  header->is_last = frame.is_last;
+
+  size_t xsize, ysize;
+  GetCurrentDimensions(dec, xsize, ysize);
+  layer.xsize = xsize;
+  layer.ysize = ysize;
+  const bool cropped = !dec->coalescing && frame.custom_size_or_origin;
+  layer.have_crop = cropped ? JXL_TRUE : JXL_FALSE;
+  layer.crop_x0 = cropped ? frame.frame_origin.x0 : 0;
+  layer.crop_y0 = cropped ? frame.frame_origin.y0 : 0;
+
+  if (!dec->keep_orientation && !dec->coalescing) {
+    // Orient the crop offset: orientations above 4 swap the two axes, then
+    // the offset is mirrored along x and/or y depending on the orientation.
+    const size_t oriented_w = dec->metadata.oriented_xsize(false);
+    const size_t oriented_h = dec->metadata.oriented_ysize(false);
+    if (metadata.orientation > 4) std::swap(layer.crop_x0, layer.crop_y0);
+    const size_t o = (metadata.orientation - 1) & 3;
+    if (o == 1 || o == 2) layer.crop_x0 = oriented_w - xsize - layer.crop_x0;
+    if (o > 1) layer.crop_y0 = oriented_h - ysize - layer.crop_y0;
+  }
+
+  // Blend info: fixed values when coalescing, else taken from the frame.
+  auto& blend = layer.blend_info;
+  if (dec->coalescing) {
+    blend.blendmode = JXL_BLEND_REPLACE;
+    blend.source = 0;
+    blend.alpha = 0;
+    blend.clamp = JXL_FALSE;
+    layer.save_as_reference = 0;
+  } else {
+    const auto& info = frame.blending_info;
+    blend.blendmode = static_cast<JxlBlendMode>(info.mode);
+    blend.source = info.source;
+    blend.alpha = info.alpha_channel;
+    blend.clamp = info.clamp;
+    layer.save_as_reference = frame.save_as_reference;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+// Copies the blend info of extra channel `index` from the current frame
+// header into `*blend_info`.
+JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(const JxlDecoder* dec,
+                                                    size_t index,
+                                                    JxlBlendInfo* blend_info) {
+  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
+    return JXL_API_ERROR("no frame header available");
+  }
+  if (index >= dec->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR("Invalid extra channel index");
+  }
+
+  const auto& info = dec->frame_header->extra_channel_blending_info[index];
+  blend_info->blendmode = static_cast<JxlBlendMode>(info.mode);
+  blend_info->source = info.source;
+  blend_info->alpha = info.alpha_channel;
+  blend_info->clamp = info.clamp;
+
+  return JXL_DEC_SUCCESS;
+}
+
+// Copies the frame name, including its terminating NUL, into `name`.
+JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, char* name,
+                                        size_t size) {
+  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
+    return JXL_API_ERROR("no frame header available");
+  }
+  const size_t bytes_needed = dec->frame_header->name.size() + 1;  // NUL too.
+  if (size < bytes_needed) {
+    return JXL_API_ERROR("too small frame name output buffer");
+  }
+  memcpy(name, dec->frame_header->name.c_str(), bytes_needed);
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetPreferredColorProfile(
+    JxlDecoder* dec, const JxlColorEncoding* color_encoding) {
+  if (!dec->got_all_headers) {  // All headers must have been read first.
+    return JXL_API_ERROR("color info not yet available");
+  }
+  if (dec->post_headers) {  // Only allowed before this flag is set.
+    return JXL_API_ERROR("too late to set the color encoding");
+  }
+  if (dec->image_metadata.color_encoding.IsGray() &&
+      color_encoding->color_space != JXL_COLOR_SPACE_GRAY &&
+      dec->image_out_buffer_set && dec->image_out_format.num_channels < 3) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+  if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN) {
+    return JXL_API_ERROR("Unknown output colorspace");
+  }
+  jxl::ColorEncoding c_out;  // Internal representation of *color_encoding.
+  JXL_API_RETURN_IF_ERROR(
+      ConvertExternalToInternalColorEncoding(*color_encoding, &c_out));
+  JXL_API_RETURN_IF_ERROR(!c_out.ICC().empty());  // Fails if the ICC is empty.
+  auto& output_encoding = dec->passes_state->output_encoding_info;
+  if (!c_out.SameColorEncoding(output_encoding.color_encoding)) {
+    JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out));
+    dec->image_metadata.color_encoding = output_encoding.color_encoding;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+// Stores the requested intensity target; negative values are rejected.
+JxlDecoderStatus JxlDecoderSetDesiredIntensityTarget(
+    JxlDecoder* dec, float desired_intensity_target) {
+  const bool is_negative = desired_intensity_target < 0;
+  if (is_negative) return JXL_API_ERROR("negative intensity target requested");
+  dec->desired_intensity_target = desired_intensity_target;
+  return JXL_DEC_SUCCESS;
+}
+
+// Installs `data` (`size` bytes) as the box output buffer at position 0.
+JxlDecoderStatus JxlDecoderSetBoxBuffer(JxlDecoder* dec, uint8_t* data,
+                                        size_t size) {
+  if (dec->box_out_buffer_set) {
+    return JXL_API_ERROR("must release box buffer before setting it again");
+  }
+  if (!dec->box_event) {
+    return JXL_API_ERROR("can only set box buffer after box event");
+  }
+  dec->box_out_buffer = data;
+  dec->box_out_buffer_size = size;
+  dec->box_out_buffer_pos = 0;
+  dec->box_out_buffer_set_current_box = true;
+  dec->box_out_buffer_set = true;
+  return JXL_DEC_SUCCESS;
+}
+
+// Detaches the box output buffer and returns the number of its bytes beyond
+// the current buffer position (0 if no buffer was set).
+size_t JxlDecoderReleaseBoxBuffer(JxlDecoder* dec) {
+  if (!dec->box_out_buffer_set) return 0;
+  const size_t unused = dec->box_out_buffer_size - dec->box_out_buffer_pos;
+  if (dec->box_out_buffer_set_current_box) {
+    dec->box_out_buffer_begin += dec->box_out_buffer_pos;
+  } else {
+    dec->box_out_buffer_begin = 0;
+  }
+  dec->box_out_buffer_set_current_box = false;
+  dec->box_out_buffer_set = false;
+  dec->box_out_buffer = nullptr;
+  dec->box_out_buffer_size = 0;
+  return unused;
+}
+
+JxlDecoderStatus JxlDecoderSetDecompressBoxes(JxlDecoder* dec,
+                                              JXL_BOOL decompress) {
+  // Stores the flag unconditionally. TODO(lode): return an error if libbrotli
+  // is not compiled into the jxl decoding library.
+  dec->decompress_boxes = decompress;
+  return JXL_DEC_SUCCESS;
+}
+
+// Copies the raw or (if `decompressed`) the decoded box type into `type`.
+JxlDecoderStatus JxlDecoderGetBoxType(JxlDecoder* dec, JxlBoxType type,
+                                      JXL_BOOL decompressed) {
+  if (!dec->box_event) {
+    return JXL_API_ERROR("can only get box info after JXL_DEC_BOX event");
+  }
+  if (!decompressed) {
+    memcpy(type, dec->box_type, sizeof(dec->box_type));
+  } else {
+    memcpy(type, dec->box_decoded_type, sizeof(dec->box_decoded_type));
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+// Writes dec->box_size to `*size`; a null `size` is accepted and ignored.
+JxlDecoderStatus JxlDecoderGetBoxSizeRaw(const JxlDecoder* dec,
+                                         uint64_t* size) {
+  if (!dec->box_event) {
+    return JXL_API_ERROR("can only get box info after JXL_DEC_BOX event");
+  }
+  if (size != nullptr) *size = dec->box_size;
+
+  return JXL_DEC_SUCCESS;
+}
+
+// Stores the progressive detail; only kDC, kLastPasses and kPasses are taken.
+JxlDecoderStatus JxlDecoderSetProgressiveDetail(JxlDecoder* dec,
+                                                JxlProgressiveDetail detail) {
+  if (!(detail == kDC || detail == kLastPasses || detail == kPasses)) {
+    return JXL_API_ERROR(
+        "Values other than kDC (%d), kLastPasses (%d) and kPasses (%d), "
+        "like %d are not implemented.", kDC, kLastPasses, kPasses, detail);
+  }
+  dec->prog_detail = detail;
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+
+// Checks that `bit_depth` can be used with `format`: float formats only
+// support JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, and the resolved bit depth must be
+// in [1, 8] for uint8 output and in [1, 16] for uint16 output.
+template <typename T>
+JxlDecoderStatus VerifyOutputBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                                      JxlPixelFormat format) {
+  const bool is_float = format.data_type == JXL_TYPE_FLOAT ||
+                        format.data_type == JXL_TYPE_FLOAT16;
+  if (is_float && bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return JXL_API_ERROR(
+        "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is implemented "
+        "for float types.");
+  }
+  const uint32_t bits = GetBitDepth(bit_depth, metadata, format);
+  if (format.data_type == JXL_TYPE_UINT8 && (bits == 0 || bits > 8)) {
+    return JXL_API_ERROR("Invalid bit depth %u for uint8 output", bits);
+  }
+  if (format.data_type == JXL_TYPE_UINT16 && (bits == 0 || bits > 16)) {
+    return JXL_API_ERROR("Invalid bit depth %u for uint16 output", bits);
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+
+JxlDecoderStatus JxlDecoderSetImageOutBitDepth(JxlDecoder* dec,
+                                               const JxlBitDepth* bit_depth) {
+  if (!dec->image_out_buffer_set) {  // Set by both buffer and callback setup.
+    return JXL_API_ERROR("No image out buffer was set.");
+  }
+  JXL_API_RETURN_IF_ERROR(
+      VerifyOutputBitDepth(*bit_depth, dec->metadata.m, dec->image_out_format));
+  dec->image_out_bit_depth = *bit_depth;  // Stored only after validation.
+  return JXL_DEC_SUCCESS;
+}
diff --git a/third_party/jpeg-xl/lib/jxl/decode_test.cc b/third_party/jpeg-xl/lib/jxl/decode_test.cc
new file mode 100644
index 0000000000..30f6b61183
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode_test.cc
@@ -0,0 +1,5507 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/color_description.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_progressive_split.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+#include "lib/jxl/toc.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+// Appends `u32` to `bytes` as four bytes, most significant byte first.
+void AppendU32BE(uint32_t u32, jxl::PaddedBytes* bytes) {
+  for (int shift = 24; shift >= 0; shift -= 8) {
+    bytes->push_back(u32 >> shift);
+  }
+}
+
+// Which container box layout to wrap the test codestream in.
+enum CodeStreamBoxFormat {
+  // Do not use box format at all, only pure codestream
+  kCSBF_None,
+  // Have a single codestream box, with its actual size given in the box
+  kCSBF_Single,
+  // Have a single codestream box, with box size 0 (final box running to end)
+  kCSBF_Single_Zero_Terminated,
+  // Single codestream box, with another unknown box behind it
+  kCSBF_Single_Other,
+  // Have multiple partial codestream boxes
+  kCSBF_Multi,
+  // Have multiple partial codestream boxes, with final box size 0 (running
+  // to end)
+  kCSBF_Multi_Zero_Terminated,
+  // Have multiple partial codestream boxes, terminated by non-codestream box
+  kCSBF_Multi_Other_Terminated,
+  // Have multiple partial codestream boxes, terminated by non-codestream box
+  // that has its size set to 0 (running to end)
+  kCSBF_Multi_Other_Zero_Terminated,
+  // Have multiple partial codestream boxes, and the first one has a content
+  // of zero length
+  kCSBF_Multi_First_Empty,
+  // Have multiple partial codestream boxes, and the last one has a content
+  // of zero length and there is an unknown empty box at the end
+  kCSBF_Multi_Last_Empty_Other,
+  // Have a compressed exif box before a regular codestream box
+  kCSBF_Brob_Exif,
+  // Not a format itself; equals the number of formats listed above.
+  kCSBF_NUM_ENTRIES,
+};
+
+// Unknown boxes for testing
+static const char* unk1_box_type = "unk1";
+static const char* unk1_box_contents = "abcdefghijklmnopqrstuvwxyz";
+static const size_t unk1_box_size = strlen(unk1_box_contents);
+static const char* unk2_box_type = "unk2";
+static const char* unk2_box_contents = "0123456789";
+static const size_t unk2_box_size = strlen(unk2_box_contents);
+static const char* unk3_box_type = "unk3";
+static const char* unk3_box_contents = "ABCDEF123456";
+static const size_t unk3_box_size = strlen(unk3_box_contents);
+// Box with brob-compressed exif, including header
+static const uint8_t* box_brob_exif = reinterpret_cast<const uint8_t*>(
+    "\0\0\0@brobExif\241\350\2\300\177\244v\2525\304\360\27=?\267{"
+    "\33\37\314\332\214QX17PT\"\256\0\0\202s\214\313t\333\310\320k\20\276\30"
+    "\204\277l$\326c#\1\b");
+size_t box_brob_exif_size = 64;  // Matches the box's size field, 0x40.
+// The uncompressed Exif data from the brob box
+static const uint8_t* exif_uncompressed = reinterpret_cast<const uint8_t*>(
+    "\0\0\0\0MM\0*"
+    "\0\0\0\b\0\5\1\22\0\3\0\0\0\1\0\5\0\0\1\32\0\5\0\0\0\1\0\0\0J\1\33\0\5\0\0"
+    "\0\1\0\0\0R\1("
+    "\0\3\0\0\0\1\0\1\0\0\2\23\0\3\0\0\0\1\0\1\0\0\0\0\0\0\0\0\0\1\0\0\0\1\0\0"
+    "\0\1\0\0\0\1");
+size_t exif_uncompressed_size = 94;
+
+// Returns an ICC profile output by the JPEG XL decoder for RGB_D65_SRG_Rel_Lin,
+// but with, on purpose, rXYZ, bXYZ and gXYZ (the RGB primaries) switched to a
+// different order to ensure the profile does not match any known profile, so
+// the encoder cannot encode it in a compact struct instead.
+jxl::PaddedBytes GetIccTestProfile() {
+  const uint8_t* profile = reinterpret_cast<const uint8_t*>(
+      "\0\0\3\200lcms\0040\0\0mntrRGB XYZ "
+      "\a\344\0\a\0\27\0\21\0$"
+      "\0\37acspAPPL\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\366"
+      "\326\0\1\0\0\0\0\323-lcms\372c\207\36\227\200{"
+      "\2\232s\255\327\340\0\n\26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+      "\0\0\0\0\0\0\0\0\rdesc\0\0\1 "
+      "\0\0\0Bcprt\0\0\1d\0\0\1\0wtpt\0\0\2d\0\0\0\24chad\0\0\2x\0\0\0,"
+      "bXYZ\0\0\2\244\0\0\0\24gXYZ\0\0\2\270\0\0\0\24rXYZ\0\0\2\314\0\0\0\24rTR"
+      "C\0\0\2\340\0\0\0 gTRC\0\0\2\340\0\0\0 bTRC\0\0\2\340\0\0\0 "
+      "chrm\0\0\3\0\0\0\0$dmnd\0\0\3$\0\0\0("
+      "dmdd\0\0\3L\0\0\0002mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0&"
+      "\0\0\0\34\0R\0G\0B\0_\0D\0006\0005\0_\0S\0R\0G\0_\0R\0e\0l\0_"
+      "\0L\0i\0n\0\0mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\344\0\0\0\34\0C\0o\0"
+      "p\0y\0r\0i\0g\0h\0t\0 \0002\0000\0001\08\0 \0G\0o\0o\0g\0l\0e\0 "
+      "\0L\0L\0C\0,\0 \0C\0C\0-\0B\0Y\0-\0S\0A\0 \0003\0.\0000\0 "
+      "\0U\0n\0p\0o\0r\0t\0e\0d\0 "
+      "\0l\0i\0c\0e\0n\0s\0e\0(\0h\0t\0t\0p\0s\0:\0/\0/"
+      "\0c\0r\0e\0a\0t\0i\0v\0e\0c\0o\0m\0m\0o\0n\0s\0.\0o\0r\0g\0/"
+      "\0l\0i\0c\0e\0n\0s\0e\0s\0/\0b\0y\0-\0s\0a\0/\0003\0.\0000\0/"
+      "\0l\0e\0g\0a\0l\0c\0o\0d\0e\0)XYZ "
+      "\0\0\0\0\0\0\366\326\0\1\0\0\0\0\323-"
+      "sf32\0\0\0\0\0\1\fB\0\0\5\336\377\377\363%"
+      "\0\0\a\223\0\0\375\220\377\377\373\241\377\377\375\242\0\0\3\334\0\0\300"
+      "nXYZ \0\0\0\0\0\0o\240\0\08\365\0\0\3\220XYZ "
+      "\0\0\0\0\0\0$\237\0\0\17\204\0\0\266\304XYZ "
+      "\0\0\0\0\0\0b\227\0\0\267\207\0\0\30\331para\0\0\0\0\0\3\0\0\0\1\0\0\0\1"
+      "\0\0\0\0\0\0\0\1\0\0\0\0\0\0chrm\0\0\0\0\0\3\0\0\0\0\243\327\0\0T|"
+      "\0\0L\315\0\0\231\232\0\0&"
+      "g\0\0\17\\mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\f\0\0\0\34\0G\0o\0o\0g"
+      "\0l\0emluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\26\0\0\0\34\0I\0m\0a\0g\0e"
+      "\0 \0c\0o\0d\0e\0c\0\0");
+  size_t profile_size = 896;  // 0x380, as in the profile's leading size field.
+  jxl::PaddedBytes icc_profile;
+  icc_profile.assign(profile, profile + profile_size);
+  return icc_profile;
+}
+
+}  // namespace
+
+namespace jxl {
+namespace {
+
+// Appends a box with a 4-byte big-endian size, the 4-character `type` and
+// `contents`. Note: `unbounded` is currently ignored by this function.
+void AppendTestBox(const char* type, const char* contents, size_t contents_size,
+                   bool unbounded, PaddedBytes* bytes) {
+  constexpr size_t kBoxHeaderSize = 8;  // 4 bytes size + 4 bytes type.
+  AppendU32BE(contents_size + kBoxHeaderSize, bytes);
+  for (size_t i = 0; i < 4; ++i) bytes->push_back(type[i]);
+  const auto* first = reinterpret_cast<const uint8_t*>(contents);
+  bytes->append(first, first + contents_size);
+}
+
+enum PreviewMode {
+  kNoPreview,        // Must stay 0: the mode is also tested as a boolean.
+  kSmallPreview,     // GeneratePreview() calls ShrinkTo() with 1/7 of the size.
+  kBigPreview,       // GeneratePreview() repeats every pixel 7x7.
+  kNumPreviewModes,  // Count of the modes above, not a mode itself.
+};
+
+// Turns `ib` into its preview: ShrinkTo() 1/7 size, or 7x7 pixel replication.
+void GeneratePreview(PreviewMode preview_mode, ImageBundle* ib) {
+  if (preview_mode == kSmallPreview) {
+    ib->ShrinkTo(ib->xsize() / 7, ib->ysize() / 7);
+  } else if (preview_mode == kBigPreview) {
+    const auto replicate7 = [](const ImageF& src, ImageF* dst) {
+      for (size_t row = 0; row < dst->ysize(); ++row) {
+        for (size_t col = 0; col < dst->xsize(); ++col) {
+          dst->Row(row)[col] = src.ConstRow(row / 7)[col / 7];
+        }
+      }
+    };
+    Image3F preview(ib->xsize() * 7, ib->ysize() * 7);
+    for (size_t c = 0; c < 3; ++c) {
+      replicate7(ib->color()->Plane(c), &preview.Plane(c));
+    }
+    std::vector<ImageF> extra_channels;
+    for (const ImageF& source : ib->extra_channels()) {
+      extra_channels.emplace_back(ib->xsize() * 7, ib->ysize() * 7);
+      replicate7(source, &extra_channels.back());
+    }
+    ib->RemoveColor();
+    ib->ClearExtraChannels();
+    ib->SetFromImage(std::move(preview), ib->c_current());
+    ib->SetExtraChannels(std::move(extra_channels));
+  }
+}
+
+struct TestCodestreamParams {
+  CompressParams cparams;
+  CodeStreamBoxFormat box_format = kCSBF_None;
+  JxlOrientation orientation = JXL_ORIENT_IDENTITY;
+  PreviewMode preview_mode = kNoPreview;
+  bool add_intrinsic_size = false;  // Intrinsic size: 1/3 of xsize and ysize.
+  bool add_icc_profile = false;     // Use GetIccTestProfile(); RGB only.
+  float intensity_target = 0.0;     // Only applied when non-zero.
+  std::string color_space;          // Parsed as color description if set.
+  PaddedBytes* jpeg_codestream = nullptr;  // If set, receives the JPEG bytes.
+  const ProgressiveMode* progressive_mode = nullptr;  // Applied only if set.
+};
+
+// Input pixels always given as 16-bit RGBA, 8 bytes per pixel.
+// include_alpha determines if the encoded image should contain the alpha
+// channel.
+// add_icc_profile: if false, encodes the image as sRGB using the JXL fields,
+// for grayscale or RGB images. If true, encodes the image using the ICC profile
+// returned by GetIccTestProfile, without the JXL fields, this requires the
+// image is RGB, not grayscale.
+// Providing jpeg_codestream will populate the jpeg_codestream with compressed
+// JPEG bytes, and make it possible to reconstruct those exact JPEG bytes using
+// the return value _if_ params.box_format indicates a box format.
+PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
+                                    size_t ysize, size_t num_channels,
+                                    const TestCodestreamParams& params) {
+  // Compress the pixels with JPEG XL.
+  bool grayscale = (num_channels <= 2);
+  bool include_alpha = !(num_channels & 1) && params.jpeg_codestream == nullptr;
+  size_t bitdepth = params.jpeg_codestream == nullptr ? 16 : 8;
+  CodecInOut io;
+  io.SetSize(xsize, ysize);
+  ColorEncoding color_encoding;
+  if (params.add_icc_profile) {
+    // the hardcoded ICC profile we attach requires RGB.
+    EXPECT_EQ(false, grayscale);
+    EXPECT_TRUE(params.color_space.empty());
+    EXPECT_TRUE(color_encoding.SetICC(GetIccTestProfile()));
+  } else if (!params.color_space.empty()) {
+    JxlColorEncoding c;
+    EXPECT_TRUE(jxl::ParseDescription(params.color_space, &c));
+    EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding));
+    EXPECT_EQ(color_encoding.IsGray(), grayscale);
+  } else {
+    color_encoding = jxl::ColorEncoding::SRGB(/*is_gray=*/grayscale);
+  }
+  ThreadPool pool(nullptr, nullptr);
+  io.metadata.m.SetUintSamples(bitdepth);
+  if (include_alpha) {
+    io.metadata.m.SetAlphaBits(bitdepth);
+  }
+  if (params.intensity_target != 0) {
+    io.metadata.m.SetIntensityTarget(params.intensity_target);
+  }
+  JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, 0};
+  // Make the grayscale-ness of the io metadata color_encoding and the packed
+  // image match.
+  io.metadata.m.color_encoding = color_encoding;
+  EXPECT_TRUE(ConvertFromExternal(pixels, xsize, ysize, color_encoding,
+                                  /*bits_per_sample=*/16, format, &pool,
+                                  &io.Main()));
+  jxl::PaddedBytes jpeg_data;
+  if (params.jpeg_codestream != nullptr) {
+#if JPEGXL_ENABLE_JPEG
+    std::vector<uint8_t> jpeg_bytes;
+    io.jpeg_quality = 70;
+    EXPECT_TRUE(Encode(io, extras::Codec::kJPG, io.metadata.m.color_encoding,
+                       /*bits_per_sample=*/8, &jpeg_bytes, &pool));
+    params.jpeg_codestream->append(jpeg_bytes.data(),
+                                   jpeg_bytes.data() + jpeg_bytes.size());
+    EXPECT_TRUE(jxl::jpeg::DecodeImageJPG(
+        jxl::Span<const uint8_t>(jpeg_bytes.data(), jpeg_bytes.size()), &io));
+    EXPECT_TRUE(
+        EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data, params.cparams));
+    io.metadata.m.xyb_encoded = false;
+#else   // JPEGXL_ENABLE_JPEG
+    JXL_ABORT(
+        "unable to create reconstructible JPEG without JPEG support enabled");
+#endif  // JPEGXL_ENABLE_JPEG
+  }
+  if (params.preview_mode) {
+    io.preview_frame = io.Main().Copy();
+    GeneratePreview(params.preview_mode, &io.preview_frame);
+    io.metadata.m.have_preview = true;
+    EXPECT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(),
+                                               io.preview_frame.ysize()));
+  }
+  if (params.add_intrinsic_size) {
+    EXPECT_TRUE(io.metadata.m.intrinsic_size.Set(xsize / 3, ysize / 3));
+  }
+  io.metadata.m.orientation = params.orientation;
+  AuxOut aux_out;
+  PaddedBytes compressed;
+  PassesEncoderState enc_state;
+  if (params.progressive_mode) {
+    enc_state.progressive_splitter.SetProgressiveMode(*params.progressive_mode);
+  }
+  EXPECT_TRUE(EncodeFile(params.cparams, &io, &enc_state, &compressed,
+                         GetJxlCms(), &aux_out, &pool));
+  CodeStreamBoxFormat add_container = params.box_format;
+  if (add_container != kCSBF_None) {
+    // Header with signature box and ftyp box.
+    const uint8_t header[] = {0,    0,    0,    0xc,  0x4a, 0x58, 0x4c, 0x20,
+                              0xd,  0xa,  0x87, 0xa,  0,    0,    0,    0x14,
+                              0x66, 0x74, 0x79, 0x70, 0x6a, 0x78, 0x6c, 0x20,
+                              0,    0,    0,    0,    0x6a, 0x78, 0x6c, 0x20};
+
+    bool is_multi = add_container == kCSBF_Multi ||
+                    add_container == kCSBF_Multi_Zero_Terminated ||
+                    add_container == kCSBF_Multi_Other_Terminated ||
+                    add_container == kCSBF_Multi_Other_Zero_Terminated ||
+                    add_container == kCSBF_Multi_First_Empty ||
+                    add_container == kCSBF_Multi_Last_Empty_Other;
+
+    if (is_multi) {
+      size_t third = compressed.size() / 3;
+      std::vector<uint8_t> compressed0(compressed.data(),
+                                       compressed.data() + third);
+      std::vector<uint8_t> compressed1(compressed.data() + third,
+                                       compressed.data() + 2 * third);
+      std::vector<uint8_t> compressed2(compressed.data() + 2 * third,
+                                       compressed.data() + compressed.size());
+
+      PaddedBytes c;
+      c.append(header, header + sizeof(header));
+      if (params.jpeg_codestream != nullptr) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
+                             &c);
+        c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+      }
+      uint32_t jxlp_index = 0;
+      if (add_container == kCSBF_Multi_First_Empty) {
+        // Dummy (empty) codestream part
+        AppendU32BE(12, &c);
+        c.push_back('j');
+        c.push_back('x');
+        c.push_back('l');
+        c.push_back('p');
+        AppendU32BE(jxlp_index++, &c);
+      }
+      // First codestream part
+      AppendU32BE(compressed0.size() + 12, &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      AppendU32BE(jxlp_index++, &c);
+      c.append(compressed0.data(), compressed0.data() + compressed0.size());
+      // A few non-codestream boxes in between
+      AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false, &c);
+      AppendTestBox(unk2_box_type, unk2_box_contents, unk2_box_size, false, &c);
+      // Dummy (empty) codestream part
+      AppendU32BE(12, &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      AppendU32BE(jxlp_index++, &c);
+      // Second codestream part
+      AppendU32BE(compressed1.size() + 12, &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      AppendU32BE(jxlp_index++, &c);
+      c.append(compressed1.data(), compressed1.data() + compressed1.size());
+      // Third (last) codestream part
+      AppendU32BE(add_container == kCSBF_Multi_Zero_Terminated
+                      ? 0
+                      : (compressed2.size() + 12),
+                  &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      if (add_container != kCSBF_Multi_Last_Empty_Other) {
+        AppendU32BE(jxlp_index++ | 0x80000000, &c);
+      } else {
+        AppendU32BE(jxlp_index++, &c);
+      }
+      c.append(compressed2.data(), compressed2.data() + compressed2.size());
+      if (add_container == kCSBF_Multi_Last_Empty_Other) {
+        // Dummy (empty) codestream part
+        AppendU32BE(12, &c);
+        c.push_back('j');
+        c.push_back('x');
+        c.push_back('l');
+        c.push_back('p');
+        AppendU32BE(jxlp_index++ | 0x80000000, &c);
+        AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, false,
+                      &c);
+      }
+      if (add_container == kCSBF_Multi_Other_Terminated) {
+        AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, false,
+                      &c);
+      }
+      if (add_container == kCSBF_Multi_Other_Zero_Terminated) {
+        AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, true,
+                      &c);
+      }
+      compressed.swap(c);
+    } else {
+      PaddedBytes c;
+      c.append(header, header + sizeof(header));
+      if (params.jpeg_codestream != nullptr) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
+                             &c);
+        c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+      }
+      if (add_container == kCSBF_Brob_Exif) {
+        c.append(box_brob_exif, box_brob_exif + box_brob_exif_size);
+      }
+      AppendU32BE(add_container == kCSBF_Single_Zero_Terminated
+                      ? 0
+                      : (compressed.size() + 8),
+                  &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('c');
+      c.append(compressed.data(), compressed.data() + compressed.size());
+      if (add_container == kCSBF_Single_Other) {
+        AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false,
+                      &c);
+      }
+      compressed.swap(c);
+    }
+  }
+
+  return compressed;
+}
+
+// Processes input until the decoder returns a status other than JXL_DEC_BOX.
+JxlDecoderStatus ProcessInputIgnoreBoxes(JxlDecoder* dec) {
+  for (;;) {
+    const JxlDecoderStatus result = JxlDecoderProcessInput(dec);
+    if (result != JXL_DEC_BOX) return result;
+  }
+}
+
+// Decodes one-shot with the API for non-streaming decoding tests.
+// use_callback: output through an image out callback instead of a buffer.
+// set_buffer_early: subscribe to JXL_DEC_FRAME and expect that status, instead
+//   of JXL_DEC_NEED_IMAGE_OUT_BUFFER, right before the output is set.
+// require_boxes: subscribe to JXL_DEC_BOX and skip over the box events.
+// expect_success: expect final JXL_DEC_SUCCESS, else JXL_DEC_NEED_MORE_INPUT.
+// icc: if non-null, receives the JXL_COLOR_PROFILE_TARGET_DATA ICC profile.
+std::vector<uint8_t> DecodeWithAPI(JxlDecoder* dec,
+                                   Span<const uint8_t> compressed,
+                                   const JxlPixelFormat& format,
+                                   bool use_callback, bool set_buffer_early,
+                                   bool use_resizable_runner,
+                                   bool require_boxes, bool expect_success,
+                                   PaddedBytes* icc = nullptr) {
+  JxlThreadParallelRunnerPtr runner_fixed;
+  JxlResizableParallelRunnerPtr runner_resizable;
+  JxlParallelRunner runner_fn;
+  void* runner;
+  if (use_resizable_runner) {
+    runner_resizable = JxlResizableParallelRunnerMake(nullptr);
+    runner = runner_resizable.get();
+    runner_fn = JxlResizableParallelRunner;
+  } else {
+    size_t hw_threads = JxlThreadParallelRunnerDefaultNumWorkerThreads();
+    runner_fixed =
+        JxlThreadParallelRunnerMake(nullptr, std::min<size_t>(hw_threads, 16));
+    runner = runner_fixed.get();
+    runner_fn = JxlThreadParallelRunner;
+  }
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, runner_fn, runner));
+
+  auto process_input =
+      require_boxes ? ProcessInputIgnoreBoxes : JxlDecoderProcessInput;
+
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderSubscribeEvents(
+          dec, JXL_DEC_BASIC_INFO | (set_buffer_early ? JXL_DEC_FRAME : 0) |
+                   JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FULL_IMAGE |
+                   (require_boxes ? JXL_DEC_BOX : 0) |
+                   (icc != nullptr ? JXL_DEC_COLOR_ENCODING : 0)));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, process_input(dec));
+  size_t buffer_size = 0;  // Stays 0 if the (non-fatal) query below fails.
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info = {};  // Zeroed: read below even if the query fails.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  if (use_resizable_runner) {
+    JxlResizableParallelRunnerSetThreads(
+        runner,
+        JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize));
+  }
+
+  std::vector<uint8_t> pixels(buffer_size);
+  size_t bytes_per_pixel = format.num_channels *
+                           test::GetDataBits(format.data_type) /
+                           jxl::kBitsPerByte;
+  size_t stride = bytes_per_pixel * info.xsize;
+  if (format.align > 1) {
+    stride = jxl::DivCeil(stride, format.align) * format.align;
+  }
+  auto callback = [&](size_t x, size_t y, size_t num_pixels,
+                      const void* pixels_row) {
+    memcpy(pixels.data() + stride * y + bytes_per_pixel * x, pixels_row,
+           num_pixels * bytes_per_pixel);
+  };
+
+  JxlDecoderStatus status = process_input(dec);
+  if (status == JXL_DEC_COLOR_ENCODING) {
+    size_t icc_size = 0;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetICCProfileSize(
+                  dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size));
+    icc->resize(icc_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc->data(), icc_size));
+    status = process_input(dec);
+  }
+
+  std::vector<uint8_t> preview;
+  if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) {
+    size_t preview_size = 0;  // Distinct name: no shadowing of buffer_size.
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderPreviewOutBufferSize(dec, &format, &preview_size));
+    preview.resize(preview_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(),
+                                            preview.size()));
+    EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, process_input(dec));
+    status = process_input(dec);
+  }
+
+  if (set_buffer_early) {
+    EXPECT_EQ(JXL_DEC_FRAME, status);
+  } else {
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, status);
+  }
+
+  if (use_callback) {
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetImageOutCallback(
+                  dec, &format,
+                  [](void* opaque, size_t x, size_t y, size_t xsize,
+                     const void* pixels_row) {
+                    auto cb = static_cast<decltype(&callback)>(opaque);
+                    (*cb)(x, y, xsize, pixels_row);
+                  },
+                  /*opaque=*/&callback));
+  } else {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+  }
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, process_input(dec));
+  // After the full image was output, JxlDecoderProcessInput should return
+  // success to indicate all is done, unless we requested boxes and the last
+  // box was not a terminal unbounded box, in which case it should ask for
+  // more input.
+  JxlDecoderStatus expected_status =
+      expect_success ? JXL_DEC_SUCCESS : JXL_DEC_NEED_MORE_INPUT;
+  EXPECT_EQ(expected_status, process_input(dec));
+  return pixels;
+}
+
+// One-shot decode with a decoder that is created and destroyed internally.
+std::vector<uint8_t> DecodeWithAPI(Span<const uint8_t> compressed,
+                                   const JxlPixelFormat& format,
+                                   bool use_callback, bool set_buffer_early,
+                                   bool use_resizable_runner,
+                                   bool require_boxes, bool expect_success) {
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  std::vector<uint8_t> pixels =
+      DecodeWithAPI(dec, compressed, format, use_callback, set_buffer_early,
+                    use_resizable_runner, require_boxes, expect_success);
+  JxlDecoderDestroy(dec);
+  return pixels;
+}
+
+}  // namespace
+}  // namespace jxl
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(DecodeTest, JxlSignatureCheckTest) {
+  // Pairs of (expected JxlSignatureCheck result, input bytes).
+  const std::vector<std::pair<int, std::vector<uint8_t>>> cases = {
+      // No JPEG XL signature starts with 'a'.
+      {JXL_SIG_INVALID, {'a'}},
+      {JXL_SIG_INVALID, {'a', 'b', 'c', 'd', 'e', 'f'}},
+
+      // An empty input cannot be classified yet.
+      {JXL_SIG_NOT_ENOUGH_BYTES, {}},
+      // Inputs starting with 0xff, the first byte of the codestream signature.
+      {JXL_SIG_NOT_ENOUGH_BYTES, {0xff}},  // Part of a signature.
+      {JXL_SIG_INVALID, {0xff, 0xD8}},     // JPEG-1
+      {JXL_SIG_CODESTREAM, {0xff, 0x0a}},
+
+      // Full signature box of a JPEG XL container file.
+      {JXL_SIG_CONTAINER,
+       {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0xA}},
+      // Ending with invalid byte.
+      {JXL_SIG_INVALID, {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0}},
+      // Truncated prefixes of the container signature.
+      {JXL_SIG_NOT_ENOUGH_BYTES,
+       {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87}},
+      {JXL_SIG_NOT_ENOUGH_BYTES, {0}},
+  };
+  for (const auto& entry : cases) {
+    const std::vector<uint8_t>& bytes = entry.second;
+    EXPECT_EQ(entry.first, JxlSignatureCheck(bytes.data(), bytes.size()))
+        << "Where test data is " << ::testing::PrintToString(bytes);
+  }
+}
+
+TEST(DecodeTest, DefaultAllocTest) {
+  JxlDecoder* const decoder = JxlDecoderCreate(/*memory_manager=*/nullptr);
+  EXPECT_NE(nullptr, decoder);
+  JxlDecoderDestroy(decoder);
+}
+
+TEST(DecodeTest, CustomAllocTest) {
+  // Tallies how often the decoder calls into the custom memory manager.
+  struct CalledCounters {
+    int allocs = 0;
+    int frees = 0;
+  } tally;
+  JxlMemoryManager manager;
+  manager.opaque = &tally;
+  manager.alloc = [](void* opaque, size_t size) {
+    ++static_cast<CalledCounters*>(opaque)->allocs;
+    return malloc(size);
+  };
+  manager.free = [](void* opaque, void* address) {
+    ++static_cast<CalledCounters*>(opaque)->frees;
+    free(address);
+  };
+  // Creating must allocate without freeing; destroying must free something.
+  JxlDecoder* const decoder = JxlDecoderCreate(&manager);
+  EXPECT_NE(nullptr, decoder);
+  EXPECT_LE(1, tally.allocs);
+  EXPECT_EQ(0, tally.frees);
+  JxlDecoderDestroy(decoder);
+  EXPECT_LE(1, tally.frees);
+}
+
+// TODO(lode): add multi-threaded test when multithreaded pixel decoding from
+// API is implemented.
+TEST(DecodeTest, DefaultParallelRunnerTest) {
+  JxlDecoder* const decoder = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, decoder);
+  const auto status = JxlDecoderSetParallelRunner(decoder, nullptr, nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS, status);
+  JxlDecoderDestroy(decoder);
+}
+
+// Creates the header of a JPEG XL file with various custom parameters for
+// testing.
+// xsize, ysize: image dimensions to store in the SizeHeader, max 512.
+// bits_per_sample: unsigned integer sample depth to set in the metadata.
+// orientation: image orientation to set in the metadata.
+// alpha_bits: if non-0, alpha extra channel bits to set in the metadata. Also
+//   gives the alpha channel the name "alpha_test".
+// xyb_encoded: value of the xyb_encoded flag to set in the metadata.
+// have_container: add box container format around the codestream.
+// metadata_default: if true, ImageMetadata is set to default and
+//   bits_per_sample, orientation, alpha_bits and xyb_encoded are ignored.
+// insert_extra_box: insert an extra box before the codestream box, making the
+//   header farther from the front than is ideal. Only used if have_container.
+// icc_profile: if non-empty, ICC profile to set as color encoding and write.
+std::vector<uint8_t> GetTestHeader(size_t xsize, size_t ysize,
+                                   size_t bits_per_sample, size_t orientation,
+                                   size_t alpha_bits, bool xyb_encoded,
+                                   bool have_container, bool metadata_default,
+                                   bool insert_extra_box,
+                                   const jxl::PaddedBytes& icc_profile) {
+  jxl::BitWriter writer;
+  jxl::BitWriter::Allotment allotment(&writer, 65536);  // Large enough
+
+  if (have_container) {
+    const std::vector<uint8_t> signature_box = {0,   0,   0,   0xc, 'J',  'X',
+                                                'L', ' ', 0xd, 0xa, 0x87, 0xa};
+    const std::vector<uint8_t> filetype_box = {
+        0,   0,   0, 0x14, 'f', 't', 'y', 'p', 'j', 'x',
+        'l', ' ', 0, 0,    0,   0,   'j', 'x', 'l', ' '};
+    const std::vector<uint8_t> extra_box_header = {0,   0,   0,   0xff,
+                                                   't', 'e', 's', 't'};
+    // Beginning of codestream box, with an arbitrary size certainly large
+    // enough to contain the header
+    const std::vector<uint8_t> codestream_box_header = {0,   0,   0,   0xff,
+                                                        'j', 'x', 'l', 'c'};
+
+    // Emits the given bytes, in order, as 8-bit writes.
+    const auto write_bytes = [&writer](const std::vector<uint8_t>& bytes) {
+      for (const uint8_t byte : bytes) {
+        writer.Write(8, byte);
+      }
+    };
+    write_bytes(signature_box);
+    write_bytes(filetype_box);
+    if (insert_extra_box) {
+      write_bytes(extra_box_header);
+      // Zero bytes following the 8-byte header, 255 bytes in total.
+      for (size_t i = 0; i < 255 - 8; i++) {
+        writer.Write(8, 0);
+      }
+    }
+    write_bytes(codestream_box_header);
+  }
+
+  // JXL signature
+  writer.Write(8, 0xff);
+  writer.Write(8, 0x0a);
+
+  // SizeHeader
+  jxl::CodecMetadata metadata;
+  EXPECT_TRUE(metadata.size.Set(xsize, ysize));
+  EXPECT_TRUE(WriteSizeHeader(metadata.size, &writer, 0, nullptr));
+
+  if (!metadata_default) {
+    metadata.m.SetUintSamples(bits_per_sample);
+    metadata.m.orientation = orientation;
+    metadata.m.SetAlphaBits(alpha_bits);
+    metadata.m.xyb_encoded = xyb_encoded;
+    if (alpha_bits != 0) {
+      metadata.m.extra_channel_info[0].name = "alpha_test";
+    }
+  }
+
+  if (!icc_profile.empty()) {
+    jxl::PaddedBytes copy = icc_profile;
+    EXPECT_TRUE(metadata.m.color_encoding.SetICC(std::move(copy)));
+  }
+
+  EXPECT_TRUE(jxl::Bundle::Write(metadata.m, &writer, 0, nullptr));
+  metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+  EXPECT_TRUE(jxl::Bundle::Write(metadata.transform_data, &writer, 0, nullptr));
+
+  if (!icc_profile.empty()) {
+    EXPECT_TRUE(metadata.m.color_encoding.WantICC());
+    EXPECT_TRUE(jxl::WriteICC(icc_profile, &writer, 0, nullptr));
+  }
+
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+  const auto bytes = writer.GetSpan();
+  return std::vector<uint8_t>(bytes.data(), bytes.data() + bytes.size());
+}
+
+TEST(DecodeTest, BasicInfoTest) {
+  // Element i of each array holds the header parameters of test sample i.
+  size_t xsize[2] = {50, 33};
+  size_t ysize[2] = {50, 77};
+  size_t bits_per_sample[2] = {8, 23};
+  size_t orientation[2] = {3, 5};
+  size_t alpha_bits[2] = {0, 8};
+  JXL_BOOL have_container[2] = {0, 1};
+  bool xyb_encoded = false;
+  std::vector<std::vector<uint8_t>> test_samples;
+  // Test with direct codestream
+  test_samples.push_back(GetTestHeader(
+      xsize[0], ysize[0], bits_per_sample[0], orientation[0], alpha_bits[0],
+      xyb_encoded, have_container[0], /*metadata_default=*/false,
+      /*insert_extra_box=*/false, {}));
+  // Test with container and different parameters
+  test_samples.push_back(GetTestHeader(
+      xsize[1], ysize[1], bits_per_sample[1], orientation[1], alpha_bits[1],
+      xyb_encoded, have_container[1], /*metadata_default=*/false,
+      /*insert_extra_box=*/false, {}));
+
+  for (size_t i = 0; i < test_samples.size(); ++i) {
+    const std::vector<uint8_t>& data = test_samples[i];
+    // Test decoding too small header first, until we reach the final byte.
+    for (size_t size = 0; size <= data.size(); ++size) {
+      // Test with a new decoder for each tested byte size.
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+      const uint8_t* next_in = data.data();
+      size_t avail_in = size;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+
+      JxlBasicInfo info = {};  // Zeroed: read below even if the query fails.
+      bool have_basic_info = !JxlDecoderGetBasicInfo(dec, &info);
+
+      if (size == data.size()) {
+        EXPECT_EQ(JXL_DEC_BASIC_INFO, status);
+
+        // All header bytes given so the decoder must have the basic info.
+        EXPECT_EQ(true, have_basic_info);
+        EXPECT_EQ(have_container[i], info.have_container);
+        EXPECT_EQ(alpha_bits[i], info.alpha_bits);
+        // Orientations 5..8 swap the dimensions
+        if (orientation[i] >= 5) {
+          EXPECT_EQ(xsize[i], info.ysize);
+          EXPECT_EQ(ysize[i], info.xsize);
+        } else {
+          EXPECT_EQ(xsize[i], info.xsize);
+          EXPECT_EQ(ysize[i], info.ysize);
+        }
+        // The API should set the orientation to identity by default since it
+        // already applies the transformation internally by default.
+        EXPECT_EQ(1u, info.orientation);
+
+        EXPECT_EQ(3u, info.num_color_channels);
+
+        if (alpha_bits[i] != 0) {
+          // Expect an extra channel
+          EXPECT_EQ(1u, info.num_extra_channels);
+          JxlExtraChannelInfo extra = {};  // Zeroed: read even on failure.
+          EXPECT_EQ(0, JxlDecoderGetExtraChannelInfo(dec, 0, &extra));
+          EXPECT_EQ(alpha_bits[i], extra.bits_per_sample);
+          EXPECT_EQ(JXL_CHANNEL_ALPHA, extra.type);
+          EXPECT_EQ(0, extra.alpha_premultiplied);
+          // Verify the name "alpha_test" given to the alpha channel
+          EXPECT_EQ(10u, extra.name_length);
+          char name[11] = {};  // Zeroed so it is always NUL-terminated.
+          EXPECT_EQ(0,
+                    JxlDecoderGetExtraChannelName(dec, 0, name, sizeof(name)));
+          EXPECT_EQ(std::string("alpha_test"), std::string(name));
+        } else {
+          EXPECT_EQ(0u, info.num_extra_channels);
+        }
+
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      } else {
+        // If we did not give the full header, the basic info should not be
+        // available. Allow a few bytes of slack due to some bits for default
+        // opsinmatrix/extension bits.
+        if (size + 2 < data.size()) {
+          EXPECT_EQ(false, have_basic_info);
+          EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status);
+        }
+      }
+
+      // Test that decoder doesn't allow setting a setting required at beginning
+      // unless it's reset
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+      JxlDecoderReset(dec);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+TEST(DecodeTest, BufferSizeTest) {
+  // Checks the image out buffer size reported once the basic info is known.
+  size_t xsize = 33;
+  size_t ysize = 77;
+  size_t bits_per_sample = 8;
+  size_t orientation = 1;
+  size_t alpha_bits = 8;
+  bool have_container = false;
+  bool xyb_encoded = false;
+  std::vector<uint8_t> header =
+      GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits,
+                    xyb_encoded, have_container, /*metadata_default=*/false,
+                    /*insert_extra_box=*/false, {});
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+  const uint8_t* next_in = header.data();
+  size_t avail_in = header.size();
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, status);
+
+  JxlBasicInfo info = {};  // Zeroed: read below even if the query fails.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  // 4 channels of JXL_TYPE_UINT8 samples: expect 4 bytes per pixel.
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+  size_t image_out_size = 0;  // Stays 0 if the (non-fatal) query below fails.
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &image_out_size));
+  EXPECT_EQ(xsize * ysize * 4, image_out_size);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, BasicInfoSizeHintTest) {
+  // Test on a file where the size hint is too small initially due to inserting
+  // a box before the codestream (something that is normally not recommended)
+  size_t xsize = 50;
+  size_t ysize = 50;
+  size_t bits_per_sample = 16;
+  size_t orientation = 1;
+  size_t alpha_bits = 0;
+  bool xyb_encoded = false;
+  std::vector<uint8_t> data = GetTestHeader(
+      xsize, ysize, bits_per_sample, orientation, alpha_bits, xyb_encoded,
+      /*have_container=*/true, /*metadata_default=*/false,
+      /*insert_extra_box=*/true, {});
+
+  JxlDecoderStatus status;
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+
+  size_t hint0 = JxlDecoderSizeHintBasicInfo(dec);
+  // Test that the test works as intended: we construct a file on purpose to
+  // be larger than the first hint by having that extra box.
+  EXPECT_LT(hint0, data.size());
+  const uint8_t* next_in = data.data();
+  // Do as if we have only as many bytes as indicated by the hint available
+  size_t avail_in = std::min(hint0, data.size());
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  status = JxlDecoderProcessInput(dec);
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status);
+  // Basic info cannot be available yet due to the extra inserted box.
+  EXPECT_EQ(false, !JxlDecoderGetBasicInfo(dec, nullptr));
+
+  size_t num_read = avail_in - JxlDecoderReleaseInput(dec);
+  EXPECT_LT(num_read, data.size());
+
+  size_t hint1 = JxlDecoderSizeHintBasicInfo(dec);
+  // The hint must be larger than the previous hint (taking already processed
+  // bytes into account, the hint is a hint for the next avail_in) since the
+  // decoder now knows there is a box in between.
+  EXPECT_GT(hint1 + num_read, hint0);
+  avail_in = std::min<size_t>(hint1, data.size() - num_read);
+  next_in += num_read;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  status = JxlDecoderProcessInput(dec);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, status);
+  JxlBasicInfo info = {};  // Zeroed: read below even if the query fails.
+  // We should have the basic info now: we only added one box in between and
+  // the decoder should have known its size, so its implementation can return
+  // a correct hint.
+  EXPECT_EQ(true, !JxlDecoderGetBasicInfo(dec, &info));
+
+  // Also test if the basic info is correct.
+  EXPECT_EQ(1, info.have_container);
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  EXPECT_EQ(orientation, info.orientation);
+  EXPECT_EQ(bits_per_sample, info.bits_per_sample);
+
+  JxlDecoderDestroy(dec);
+}
+
+std::vector<uint8_t> GetIccTestHeader(const jxl::PaddedBytes& icc_profile,
+                                      bool xyb_encoded) {
+  // Builds a bare-codestream header for a 50x50, 16-bit image without alpha
+  // and with orientation 1; only the ICC profile and the xyb_encoded flag are
+  // chosen by the caller.
+  constexpr size_t kSize = 50;
+  return GetTestHeader(
+      /*xsize=*/kSize, /*ysize=*/kSize, /*bits_per_sample=*/16,
+      /*orientation=*/1, /*alpha_bits=*/0, xyb_encoded,
+      /*have_container=*/false, /*metadata_default=*/false,
+      /*insert_extra_box=*/false, icc_profile);
+}
+
+// Tests the case where pixels and metadata ICC profile are the same
+TEST(DecodeTest, IccProfileTestOriginal) {
+  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  bool xyb_encoded = false;
+  std::vector<uint8_t> data = GetIccTestHeader(icc_profile, xyb_encoded);
+  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+
+  // Expect the opposite of xyb_encoded for uses_original_profile
+  JxlBasicInfo info = {};  // Zeroed: read below even if the query fails.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(JXL_TRUE, info.uses_original_profile);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  // the encoded color profile expected to be not available, since the image
+  // has an ICC profile instead
+  EXPECT_EQ(JXL_DEC_ERROR,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  size_t dec_profile_size = 0;  // Stays 0 if the (non-fatal) query fails.
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
+
+  // Check that can get return status with NULL size
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  // The profiles must be equal. This requires they have equal size, and if
+  // they do, we can get the profile and compare the contents.
+  EXPECT_EQ(icc_profile.size(), dec_profile_size);
+  if (icc_profile.size() == dec_profile_size) {
+    jxl::PaddedBytes icc_profile2(icc_profile.size());
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetColorAsICCProfile(
+                  dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                  icc_profile2.data(), icc_profile2.size()));
+    EXPECT_EQ(icc_profile, icc_profile2);
+  }
+
+  // the data is not xyb_encoded, so same result expected for the pixel data
+  // color profile
+  EXPECT_EQ(JXL_DEC_ERROR,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
+                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 &dec_profile_size));
+  EXPECT_EQ(icc_profile.size(), dec_profile_size);
+
+  JxlDecoderDestroy(dec);
+}
+
+// Tests the case where pixels and metadata ICC profile are different
+TEST(DecodeTest, IccProfileTestXybEncoded) {
+  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  bool xyb_encoded = true;
+  std::vector<uint8_t> data = GetIccTestHeader(icc_profile, xyb_encoded);
+  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+  JxlPixelFormat format_int = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+
+  // Expect the opposite of xyb_encoded for uses_original_profile
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(JXL_FALSE, info.uses_original_profile);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  // the encoded color profile expected to be not available, since the image
+  // has an ICC profile instead
+  EXPECT_EQ(JXL_DEC_ERROR,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  // Check that can get return status with NULL size
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  size_t dec_profile_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
+
+  // The profiles must be equal. This requires they have equal size, and if
+  // they do, we can get the profile and compare the contents.
+  EXPECT_EQ(icc_profile.size(), dec_profile_size);
+  if (icc_profile.size() == dec_profile_size) {
+    jxl::PaddedBytes icc_profile2(icc_profile.size());
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetColorAsICCProfile(
+                  dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                  icc_profile2.data(), icc_profile2.size()));
+    EXPECT_EQ(icc_profile, icc_profile2);
+  }
+
+  // Data is xyb_encoded, so the data profile is a different profile, encoded
+  // as structured profile.
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
+  JxlColorEncoding pixel_encoding;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries);
+  // The API returns LINEAR by default when the colorspace cannot be represented
+  // by enum values.
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
+
+  // Test the same but with integer format.
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetColorAsEncodedProfile(
+          dec, &format_int, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries);
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
+
+  // Test after setting the preferred color profile to non-linear sRGB:
+  // for XYB images with ICC profile, this setting is expected to take effect.
+  jxl::ColorEncoding temp_jxl_srgb = jxl::ColorEncoding::SRGB(false);
+  JxlColorEncoding pixel_encoding_srgb;
+  ConvertInternalToExternalColorEncoding(temp_jxl_srgb, &pixel_encoding_srgb);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_srgb));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function);
+
+  // The decoder can also output this as a generated ICC profile anyway, and
+  // we're certain that it will differ from the above defined profile since
+  // the sRGB data should not have swapped R/G/B primaries.
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
+                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 &dec_profile_size));
+  // We don't need to dictate exactly what size the generated ICC profile
+  // must be (since there are many ways to represent the same color space),
+  // but it should not be zero.
+  EXPECT_NE(0u, dec_profile_size);
+  jxl::PaddedBytes icc_profile2(dec_profile_size);
+  if (0 != dec_profile_size) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc_profile2.data(), icc_profile2.size()));
+    // expected not equal
+    EXPECT_NE(icc_profile, icc_profile2);
+  }
+
+  // Test setting another different preferred profile, to verify that the
+  // returned JXL_COLOR_PROFILE_TARGET_DATA ICC profile is correctly
+  // updated.
+
+  jxl::ColorEncoding temp_jxl_linear = jxl::ColorEncoding::LinearSRGB(false);
+  JxlColorEncoding pixel_encoding_linear;
+  ConvertInternalToExternalColorEncoding(temp_jxl_linear,
+                                         &pixel_encoding_linear);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_linear));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
+                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 &dec_profile_size));
+  EXPECT_NE(0u, dec_profile_size);
+  jxl::PaddedBytes icc_profile3(dec_profile_size);
+  if (0 != dec_profile_size) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc_profile3.data(), icc_profile3.size()));
+    // expected not equal to the previously set preferred profile.
+    EXPECT_NE(icc_profile2, icc_profile3);
+  }
+
+  JxlDecoderDestroy(dec);
+}
+
+// Test decoding ICC from partial files byte for byte.
+// This test must pass also if JXL_CRASH_ON_ERROR is enabled, that is, the
+// decoding of the ANS histogram and stream of the encoded ICC profile must also
+// handle the case of not enough input bytes with StatusCode::kNotEnoughBytes
+// rather than fatal error status codes.
+TEST(DecodeTest, ICCPartialTest) {
+  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  std::vector<uint8_t> data = GetIccTestHeader(icc_profile, false);
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  const uint8_t* next_in = data.data();  // first byte not yet consumed
+  size_t avail_in = 0;                   // bytes offered to the decoder
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING));
+
+  bool seen_basic_info = false;
+  bool seen_color_encoding = false;
+  size_t total_size = 0;  // number of bytes of `data` made available so far
+
+  for (;;) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+    size_t remaining = JxlDecoderReleaseInput(dec);
+    EXPECT_LE(remaining, avail_in);
+    next_in += avail_in - remaining;  // skip what the decoder consumed
+    avail_in = remaining;
+    if (status == JXL_DEC_NEED_MORE_INPUT) {
+      if (total_size >= data.size()) {
+        // End of partial codestream with codestream headers and ICC profile
+        // reached, it should not require more input since full image is not
+        // requested. Fail non-fatally so that the decoder is still destroyed.
+        ADD_FAILURE() << "decoder requested input past the end of the data";
+        break;
+      }
+      size_t increment = 1;  // grow the visible input one byte at a time
+      if (total_size + increment > data.size()) {
+        increment = data.size() - total_size;
+      }
+      total_size += increment;
+      avail_in += increment;
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      EXPECT_FALSE(seen_basic_info);
+      seen_basic_info = true;
+    } else if (status == JXL_DEC_COLOR_ENCODING) {
+      EXPECT_TRUE(seen_basic_info);
+      EXPECT_FALSE(seen_color_encoding);
+      seen_color_encoding = true;
+
+      // Sanity check that the ICC profile was decoded correctly
+      size_t dec_profile_size = 0;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderGetICCProfileSize(dec, &format,
+                                            JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                            &dec_profile_size));
+      EXPECT_EQ(icc_profile.size(), dec_profile_size);
+
+    } else if (status == JXL_DEC_SUCCESS) {
+      EXPECT_TRUE(seen_color_encoding);
+      break;
+    } else {
+      // We do not expect any other events or errors; fail without returning.
+      ADD_FAILURE() << "unexpected decoder status " << status;
+      break;
+    }
+  }
+
+  EXPECT_TRUE(seen_basic_info);
+  EXPECT_TRUE(seen_color_encoding);
+
+  JxlDecoderDestroy(dec);
+}
+
+struct PixelTestConfig {
+  // Input image definition.
+  bool grayscale;      // true: 1 color channel in the source, false: 3
+  bool include_alpha;  // true: the source has one additional alpha channel
+  size_t xsize;        // source width passed to the test image generator
+  size_t ysize;        // source height passed to the test image generator
+  jxl::PreviewMode preview_mode;  // copied into the codestream test params
+  bool add_intrinsic_size;        // copied into the codestream test params
+  // Output format.
+  JxlEndianness endianness;  // endianness field of the decode JxlPixelFormat
+  JxlDataType data_type;     // sample type of the decode JxlPixelFormat
+  uint32_t output_channels;  // channel count of the decode JxlPixelFormat
+  // Container options.
+  CodeStreamBoxFormat add_container;  // used as box_format when encoding
+  // Decoding mode.
+  bool use_callback;          // forwarded to jxl::DecodeWithAPI
+  bool set_buffer_early;      // forwarded to jxl::DecodeWithAPI
+  bool use_resizable_runner;  // forwarded to jxl::DecodeWithAPI
+  // Exif orientation, 1-8
+  JxlOrientation orientation;  // copied into the codestream test params
+  bool keep_orientation;  // if set, JxlDecoderSetKeepOrientation(JXL_TRUE)
+  size_t upsampling;      // used for both resampling and ec_resampling
+};
+
+class DecodeTestParam : public ::testing::TestWithParam<PixelTestConfig> {};  // PixelTest fixture
+
+// Encodes a generated image losslessly, decodes it as configured and compares.
+TEST_P(DecodeTestParam, PixelTest) {
+  PixelTestConfig config = GetParam();
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  if (config.keep_orientation) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetKeepOrientation(dec, JXL_TRUE));
+  }
+
+  size_t num_pixels = config.xsize * config.ysize;
+  uint32_t orig_channels =
+      (config.grayscale ? 1 : 3) + (config.include_alpha ? 1 : 0);
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(config.xsize, config.ysize, orig_channels, 0);
+  JxlPixelFormat format_orig = {orig_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN,
+                                0};
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.cparams.resampling = config.upsampling;
+  params.cparams.ec_resampling = config.upsampling;
+  params.box_format = config.add_container;
+  params.orientation = config.orientation;
+  params.preview_mode = config.preview_mode;
+  params.add_intrinsic_size = config.add_intrinsic_size;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), config.xsize,
+      config.ysize, orig_channels, params);
+
+  JxlPixelFormat format = {config.output_channels, config.data_type,
+                           config.endianness, 0};
+
+  // Orientations above 4 swap width and height unless orientation is kept.
+  bool swap_xy = !config.keep_orientation && (config.orientation > 4);
+  size_t xsize = swap_xy ? config.ysize : config.xsize;
+  size_t ysize = swap_xy ? config.xsize : config.ysize;
+  std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+      dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+      format, config.use_callback, config.set_buffer_early,
+      config.use_resizable_runner, /*require_boxes=*/false,
+      /*expect_success=*/true);
+  JxlDecoderReset(dec);
+  EXPECT_EQ(num_pixels * config.output_channels *
+                jxl::test::GetDataBits(config.data_type) / jxl::kBitsPerByte,
+            pixels2.size());
+
+  // If an orientation transformation is expected, to compare the pixels, also
+  // apply this transformation to the original pixels. ConvertToExternal is
+  // used to achieve this, with a temporary conversion to CodecInOut and back.
+  if (config.orientation > 1 && !config.keep_orientation) {
+    jxl::Span<const uint8_t> bytes(pixels.data(), pixels.size());
+    jxl::ColorEncoding color_encoding =
+        jxl::ColorEncoding::SRGB(config.grayscale);
+
+    jxl::CodecInOut io;
+    if (config.include_alpha) io.metadata.m.SetAlphaBits(16);
+    io.metadata.m.color_encoding = color_encoding;
+    io.SetSize(config.xsize, config.ysize);
+
+    EXPECT_TRUE(ConvertFromExternal(bytes, config.xsize, config.ysize,
+                                    color_encoding, 16, format_orig, nullptr,
+                                    &io.Main()));
+
+    pixels.assign(pixels.size(), 0);  // clear before writing the oriented copy
+    EXPECT_TRUE(ConvertToExternal(
+        io.Main(), 16,
+        /*float_out=*/false, orig_channels, JXL_BIG_ENDIAN,
+        xsize * 2 * orig_channels, nullptr, pixels.data(), pixels.size(),
+        /*out_callback=*/{},
+        static_cast<jxl::Orientation>(config.orientation)));
+  }
+  if (config.upsampling == 1) {
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format));
+  } else {
+    // resampling is of course not lossless, so as a rough check:
+    // count pixels that are more than off-by-25 in the 8-bit value of one of
+    // the channels
+    EXPECT_LE(
+        jxl::test::ComparePixels(
+            pixels.data(), pixels2.data(), xsize, ysize, format_orig, format,
+            50.0 * (config.data_type == JXL_TYPE_UINT8 ? 1.0 : 256.0)),
+        300u);
+  }
+
+  JxlDecoderDestroy(dec);
+}
+
+// Builds the list of PixelTestConfig cases used to instantiate PixelTest.
+// The order of the returned vector is the order in which cases are added.
+std::vector<PixelTestConfig> GeneratePixelTests() {
+  std::vector<PixelTestConfig> all_tests;
+  struct ChannelInfo {
+    bool grayscale;
+    bool include_alpha;
+    size_t output_channels;
+  };
+  ChannelInfo ch_info[] = {
+      {false, true, 4},   // RGBA -> RGBA
+      {true, false, 1},   // G -> G
+      {true, true, 1},    // GA -> G
+      {true, true, 2},    // GA -> GA
+      {false, false, 3},  // RGB -> RGB
+      {false, true, 3},   // RGBA -> RGB
+      {false, false, 4},  // RGB -> RGBA
+  };
+
+  struct OutputFormat {
+    JxlEndianness endianness;
+    JxlDataType data_type;
+  };
+  OutputFormat out_formats[] = {
+      {JXL_NATIVE_ENDIAN, JXL_TYPE_UINT8},
+      {JXL_LITTLE_ENDIAN, JXL_TYPE_UINT16},
+      {JXL_BIG_ENDIAN, JXL_TYPE_UINT16},
+      {JXL_NATIVE_ENDIAN, JXL_TYPE_FLOAT16},
+      {JXL_LITTLE_ENDIAN, JXL_TYPE_FLOAT},
+      {JXL_BIG_ENDIAN, JXL_TYPE_FLOAT},
+  };
+
+  auto make_test = [&](ChannelInfo ch, size_t xsize, size_t ysize,
+                       jxl::PreviewMode preview_mode, bool intrinsic_size,
+                       CodeStreamBoxFormat box, JxlOrientation orientation,
+                       bool keep_orientation, OutputFormat format,
+                       bool use_callback, bool set_buffer_early,
+                       bool resizable_runner, size_t upsampling) {
+    PixelTestConfig c;  // every field is assigned below
+    c.grayscale = ch.grayscale;
+    c.include_alpha = ch.include_alpha;
+    c.preview_mode = preview_mode;
+    c.add_intrinsic_size = intrinsic_size;
+    c.xsize = xsize;
+    c.ysize = ysize;
+    c.add_container = box;
+    c.output_channels = ch.output_channels;
+    c.data_type = format.data_type;
+    c.endianness = format.endianness;
+    c.use_callback = use_callback;
+    c.set_buffer_early = set_buffer_early;
+    c.use_resizable_runner = resizable_runner;
+    c.orientation = orientation;
+    c.keep_orientation = keep_orientation;
+    c.upsampling = upsampling;
+    all_tests.push_back(c);
+  };
+
+  // Test output formats and methods.
+  for (ChannelInfo ch : ch_info) {
+    for (int use_callback = 0; use_callback <= 1; use_callback++) {
+      for (size_t upsampling : {1, 2, 4, 8}) {
+        for (OutputFormat fmt : out_formats) {
+          make_test(ch, 301, 33, jxl::kNoPreview,
+                    /*add_intrinsic_size=*/false,
+                    CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
+                    /*keep_orientation=*/false, fmt, use_callback,
+                    /*set_buffer_early=*/false, /*resizable_runner=*/false,
+                    upsampling);
+        }
+      }
+    }
+  }
+  // Test codestream formats.
+  for (size_t box = 1; box < kCSBF_NUM_ENTRIES; ++box) {
+    make_test(ch_info[0], 77, 33, jxl::kNoPreview,
+              /*add_intrinsic_size=*/false,
+              static_cast<CodeStreamBoxFormat>(box), JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false,
+              /*set_buffer_early=*/false, /*resizable_runner=*/false, 1);
+  }
+  // Test previews.
+  for (int mode = 0; mode < jxl::kNumPreviewModes; mode++) {
+    make_test(ch_info[0], 77, 33, static_cast<jxl::PreviewMode>(mode),
+              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+              JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false, /*set_buffer_early=*/false,
+              /*resizable_runner=*/false, 1);
+  }
+  // Test intrinsic sizes.
+  for (bool add_intrinsic_size : {false, true}) {
+    make_test(ch_info[0], 55, 34, jxl::kNoPreview, add_intrinsic_size,
+              CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false, /*set_buffer_early=*/false,
+              /*resizable_runner=*/false, 1);
+  }
+  // Test setting buffers early.
+  make_test(ch_info[0], 300, 33, jxl::kNoPreview,
+            /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+            JXL_ORIENT_IDENTITY,
+            /*keep_orientation=*/false, out_formats[0],
+            /*use_callback=*/false, /*set_buffer_early=*/true,
+            /*resizable_runner=*/false, 1);
+
+  // Test using the resizable runner
+  for (size_t i = 0; i < 4; i++) {
+    make_test(ch_info[0], 300 << i, 33 << i, jxl::kNoPreview,
+              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+              JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false, /*set_buffer_early=*/false,
+              /*resizable_runner=*/true, 1);
+  }
+
+  // Test orientations.
+  for (int orientation = 2; orientation <= 8; ++orientation) {
+    for (int keep_orientation = 0; keep_orientation <= 1; keep_orientation++) {
+      for (int use_callback = 0; use_callback <= 1; use_callback++) {
+        for (ChannelInfo ch : ch_info) {
+          for (OutputFormat fmt : out_formats) {
+            make_test(ch, 280, 12, jxl::kNoPreview,
+                      /*add_intrinsic_size=*/false,
+                      CodeStreamBoxFormat::kCSBF_None,
+                      static_cast<JxlOrientation>(orientation),
+                      /*keep_orientation=*/keep_orientation, fmt,
+                      /*use_callback=*/use_callback, /*set_buffer_early=*/true,
+                      /*resizable_runner=*/false, 1);
+          }
+        }
+      }
+    }
+  }
+
+  return all_tests;
+}
+
+// Writes a compact label for `c`: size, channel layout and sample type,
+// followed by suffixes for the options that are enabled.
+std::ostream& operator<<(std::ostream& os, const PixelTestConfig& c) {
+  static const char* const kChannelTag[] = {"", "G", "GA", "RGB", "RGBA"};
+  const size_t source_channels =
+      (c.grayscale ? 1 : 3) + (c.include_alpha ? 1 : 0);
+  os << c.xsize << "x" << c.ysize << kChannelTag[source_channels] << "to"
+     << kChannelTag[c.output_channels];
+  switch (c.data_type) {
+    case JXL_TYPE_UINT8:
+      os << "u8";
+      break;
+    case JXL_TYPE_UINT16:
+      os << "u16";
+      break;
+    case JXL_TYPE_FLOAT:
+      os << "f32";
+      break;
+    case JXL_TYPE_FLOAT16:
+      os << "f16";
+      break;
+    default:
+      JXL_ASSERT(false);
+  }
+  // Endianness is only tagged for multi-byte samples; native adds nothing.
+  const bool multi_byte =
+      jxl::test::GetDataBits(c.data_type) > jxl::kBitsPerByte;
+  if (multi_byte && c.endianness == JXL_BIG_ENDIAN) {
+    os << "BE";
+  } else if (multi_byte && c.endianness == JXL_LITTLE_ENDIAN) {
+    os << "LE";
+  }
+  if (c.add_container != CodeStreamBoxFormat::kCSBF_None) {
+    os << "Box" << static_cast<size_t>(c.add_container);
+  }
+  if (c.preview_mode == jxl::kSmallPreview) os << "Preview";
+  if (c.preview_mode == jxl::kBigPreview) os << "BigPreview";
+  if (c.add_intrinsic_size) os << "IntrinicSize";
+  if (c.use_callback) os << "Callback";
+  if (c.set_buffer_early) os << "EarlyBuffer";
+  if (c.use_resizable_runner) os << "ResizableRunner";
+  if (c.orientation != 1) os << "O" << c.orientation;
+  if (c.keep_orientation) os << "Keep";
+  if (c.upsampling > 1) os << "x" << c.upsampling;
+  return os;
+}
+
+std::string PixelTestDescription(
+    const testing::TestParamInfo<DecodeTestParam::ParamType>& info) {
+  std::stringstream label;  // receives the streamed form of the test config
+  label << info.param;
+  return label.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeTest, DecodeTestParam,
+                                   testing::ValuesIn(GeneratePixelTests()),
+                                   PixelTestDescription);  // names each case
+
+// Lossless roundtrip with an ICC profile, decoded to several pixel formats.
+TEST(DecodeTest, PixelTestWithICCProfileLossless) {
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.add_icc_profile = true;
+  // NOTE(review): neither a container nor a preview is requested here; only
+  // the lossless mode, speed tier and ICC profile are set on `params`.
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  for (uint32_t channels = 3; channels <= 4; ++channels) {
+    {
+      JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+      std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          format, /*use_callback=*/false, /*set_buffer_early=*/false,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
+          /*expect_success=*/true);
+      JxlDecoderReset(dec);
+      EXPECT_EQ(num_pixels * channels, pixels2.size());
+      EXPECT_EQ(0u,
+                jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                         ysize, format_orig, format));
+    }
+    {
+      JxlPixelFormat format = {channels, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0};
+
+      // Use the callback and early-buffer path for one of the pixel formats.
+      std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          format, /*use_callback=*/true, /*set_buffer_early=*/true,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
+          /*expect_success=*/true);
+      JxlDecoderReset(dec);
+      EXPECT_EQ(num_pixels * channels * 2, pixels2.size());
+      EXPECT_EQ(0u,
+                jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                         ysize, format_orig, format));
+    }
+
+    {
+      JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+
+      std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          format, /*use_callback=*/false, /*set_buffer_early=*/false,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
+          /*expect_success=*/true);
+      JxlDecoderReset(dec);
+      EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+      EXPECT_EQ(0u,
+                jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                         ysize, format_orig, format));
+    }
+  }
+
+  JxlDecoderDestroy(dec);
+}
+
+// Roundtrip with an ICC profile, checked through the Butteraugli distance.
+TEST(DecodeTest, PixelTestWithICCProfileLossy) {
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  jxl::TestCodestreamParams params;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+      params);
+  uint32_t channels = 3;
+
+  JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+
+  jxl::PaddedBytes icc;  // receives the ICC profile reported by the decoder
+  std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+      dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+      format, /*use_callback=*/false, /*set_buffer_early=*/true,
+      /*use_resizable_runner=*/false, /*require_boxes=*/false,
+      /*expect_success=*/true, /*icc=*/&icc);
+  JxlDecoderReset(dec);
+  EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+
+  // The input pixels use the profile matching GetIccTestProfile, since we set
+  // add_icc_profile for CreateTestJXLCodestream to true.
+  jxl::ColorEncoding color_encoding0;
+  EXPECT_TRUE(color_encoding0.SetICC(GetIccTestProfile()));
+  jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
+  jxl::CodecInOut io0;
+  io0.SetSize(xsize, ysize);
+  EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                  /*bits_per_sample=*/16, format_orig,
+                                  /*pool=*/nullptr, &io0.Main()));
+
+  jxl::ColorEncoding color_encoding1;  // profile of the decoded pixels
+  EXPECT_TRUE(color_encoding1.SetICC(std::move(icc)));
+  jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
+  jxl::CodecInOut io1;
+  io1.SetSize(xsize, ysize);
+  EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
+                                  /*bits_per_sample=*/32, format,
+                                  /*pool=*/nullptr, &io1.Main()));
+
+  jxl::ButteraugliParams ba;
+  EXPECT_THAT(ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                                  /*distmap=*/nullptr, nullptr),
+              IsSlightlyBelow(0.79f));
+
+  JxlDecoderDestroy(dec);
+}
+
+std::string ColorDescription(JxlColorEncoding c) {
+  jxl::ColorEncoding internal;  // `c` converted to the internal representation
+  EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &internal));
+  return Description(internal);
+}
+
+std::string GetOrigProfile(JxlDecoder* dec) {
+  const JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_ORIGINAL;
+  JxlColorEncoding c;  // filled in by the decoder query below
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c));
+  return ColorDescription(c);
+}
+
+std::string GetDataProfile(JxlDecoder* dec) {
+  const JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_DATA;
+  JxlColorEncoding c;  // filled in by the decoder query below
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c));
+  return ColorDescription(c);
+}
+
+double ButteraugliDistance(size_t xsize, size_t ysize,
+                           const std::vector<uint8_t>& pixels_in,
+                           const jxl::ColorEncoding& color_in,
+                           float intensity_in,
+                           const std::vector<uint8_t>& pixels_out,
+                           const jxl::ColorEncoding& color_out,
+                           float intensity_out) {
+  // Both buffers hold 16-bit big-endian samples with as many channels as
+  // their color encoding reports. `load` tags a CodecInOut with the encoding
+  // and intensity target, then converts the samples into its main frame;
+  // a failed conversion is reported through EXPECT_TRUE.
+  const auto load = [xsize, ysize](const std::vector<uint8_t>& samples,
+                                   const jxl::ColorEncoding& color,
+                                   float intensity, jxl::CodecInOut* io) {
+    io->metadata.m.color_encoding = color;
+    io->metadata.m.SetIntensityTarget(intensity);
+    JxlPixelFormat layout = {static_cast<uint32_t>(color.Channels()),
+                             JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(samples.data(), samples.size()), xsize, ysize,
+        color, /*bits_per_sample=*/16, layout, /*pool=*/nullptr, &io->Main()));
+  };
+  jxl::CodecInOut in;
+  load(pixels_in, color_in, intensity_in, &in);
+  jxl::CodecInOut out;
+  load(pixels_out, color_out, intensity_out, &out);
+  // Compare using default-constructed Butteraugli parameters.
+  return ButteraugliDistance(in.frames, out.frames, jxl::ButteraugliParams(),
+                             jxl::GetJxlCms(), nullptr, nullptr);
+}
+
+class DecodeAllEncodingsTest  // fixture parameterized by a color encoding
+    : public ::testing::TestWithParam<jxl::test::ColorEncodingDescriptor> {};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(  // one case per AllEncodings() entry
+    DecodeAllEncodingsTestInstantiation, DecodeAllEncodingsTest,
+    ::testing::ValuesIn(jxl::test::AllEncodings()));
+TEST_P(DecodeAllEncodingsTest, PreserveOriginalProfileTest) {
+  size_t xsize = 123, ysize = 77;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
+  const auto& cdesc = GetParam();  // color encoding under test
+  jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(cdesc);
+  if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return;
+  std::string color_space_in = Description(c_in);
+  float intensity_in = c_in.tf.IsPQ() ? 10000 : 255;  // intensity target
+  printf("Testing input color space %s\n", color_space_in.c_str());
+  jxl::TestCodestreamParams params;
+  params.color_space = color_space_in;
+  params.intensity_target = intensity_in;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+      params);
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  EXPECT_FALSE(info.uses_original_profile);
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(GetOrigProfile(dec), color_space_in);
+  EXPECT_EQ(GetDataProfile(dec), color_space_in);  // no preferred profile set
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+  std::vector<uint8_t> out(pixels.size());  // same byte size as the input
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(dec, &format, out.data(), out.size()));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in,
+                                    out, c_in, intensity_in);
+  EXPECT_LT(dist, 1.29);  // Butteraugli distance of input vs. decoded output
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+  JxlDecoderDestroy(dec);
+}
+
+namespace {
+void SetPreferredColorProfileTest(
+    const jxl::test::ColorEncodingDescriptor& from) {
+  size_t xsize = 123, ysize = 77;
+  int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
+  jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(from);
+  if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return;
+  if (c_in.white_point != jxl::WhitePoint::kD65) return;  // D65 sources only
+  uint32_t num_channels = c_in.Channels();
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::string color_space_in = Description(c_in);
+  float intensity_in = c_in.tf.IsPQ() ? 10000 : 255;  // intensity target
+  jxl::TestCodestreamParams params;
+  params.color_space = color_space_in;
+  params.intensity_target = intensity_in;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  auto all_encodings = jxl::test::AllEncodings();
+  all_encodings.push_back(
+      {jxl::ColorSpace::kXYB, jxl::WhitePoint::kD65, jxl::Primaries::kCustom,
+       jxl::TransferFunction::kUnknown, jxl::RenderingIntent::kPerceptual});
+  for (const auto& c1 : all_encodings) {
+    jxl::ColorEncoding c_out = jxl::test::ColorEncodingFromDescriptor(c1);
+    float intensity_out = intensity_in;  // output keeps the input intensity
+    if (c_out.GetColorSpace() != jxl::ColorSpace::kXYB) {
+      if (c_out.rendering_intent != jxl::RenderingIntent::kRelative) {
+        continue;
+      }
+      if ((c_in.primaries == jxl::Primaries::k2100 &&
+           c_out.primaries != jxl::Primaries::k2100) ||
+          (c_in.primaries == jxl::Primaries::kP3 &&
+           c_out.primaries == jxl::Primaries::kSRGB)) {
+        // Converting to a narrower gamut does not work without gamut mapping.
+        continue;
+      }
+    }
+    if (c_out.tf.IsHLG() && intensity_out > 300) {
+      // The Linear->HLG OOTF function at this intensity level can push
+      // saturated colors out of gamut, so we would need gamut mapping in
+      // this case too.
+      continue;
+    }
+    std::string color_space_out = Description(c_out);
+    if (color_space_in == color_space_out) continue;  // nothing to convert
+    printf("Testing input color space %s with output color space %s\n",
+           color_space_in.c_str(), color_space_out.c_str());
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, data.data(), data.size()));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    EXPECT_EQ(xsize, info.xsize);
+    EXPECT_EQ(ysize, info.ysize);
+    EXPECT_FALSE(info.uses_original_profile);
+    EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(GetOrigProfile(dec), color_space_in);
+    EXPECT_EQ(GetDataProfile(dec), color_space_in);
+    JxlColorEncoding encoding_out;
+    EXPECT_TRUE(jxl::ParseDescription(color_space_out, &encoding_out));
+    if (c_out.GetColorSpace() == jxl::ColorSpace::kXYB &&
+        (c_in.primaries != jxl::Primaries::kSRGB || c_in.tf.IsPQ())) {
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+      JxlDecoderDestroy(dec);  // release before moving to the next encoding
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+    EXPECT_EQ(GetOrigProfile(dec), color_space_in);
+    EXPECT_EQ(GetDataProfile(dec), color_space_out);
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    JxlPixelFormat out_format = format;
+    out_format.num_channels = c_out.Channels();  // may differ from the input
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &out_format, &buffer_size));
+    std::vector<uint8_t> out(buffer_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &out_format, out.data(), out.size()));
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in,
+                                      out, c_out, intensity_out);
+    if (c_in.white_point == c_out.white_point) {
+      EXPECT_LT(dist, 1.29);
+    } else {
+      EXPECT_LT(dist, 4.0);  // looser bound when the white point changes
+    }
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+    JxlDecoderDestroy(dec);
+  }
+}
+}  // namespace
+
+TEST(DecodeTest, SetPreferredColorProfileTestFromGray) {
+  const jxl::test::ColorEncodingDescriptor gray_srgb{  // gray source encoding
+      jxl::ColorSpace::kGray, jxl::WhitePoint::kD65, jxl::Primaries::kSRGB,
+      jxl::TransferFunction::kSRGB, jxl::RenderingIntent::kRelative};
+  SetPreferredColorProfileTest(gray_srgb);
+}
+
+TEST_P(DecodeAllEncodingsTest, SetPreferredColorProfileTest) {
+  // Run the shared check with the source encoding given by the test parameter.
+  SetPreferredColorProfileTest(GetParam());
+}
+
+// Tests a lossy sRGB image without alpha channel, encoded with default
+// parameters and decoded to RGB8 (3 channels) and to RGBA8 (4 channels).
+TEST(DecodeTest, PixelTestOpaqueSrgbLossy) {
+  for (unsigned channels = 3; channels <= 4; channels++) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    size_t xsize = 123, ysize = 77;
+    size_t num_pixels = xsize * ysize;
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+    JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        jxl::TestCodestreamParams());
+
+    JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+    // Decode via the callback path, without setting the output buffer early.
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/true, /*set_buffer_early=*/false,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    JxlDecoderReset(dec);
+    EXPECT_EQ(num_pixels * channels, pixels2.size());
+    // Load the 16-bit original (io0) and the 8-bit decoded result (io1).
+    jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
+    jxl::CodecInOut io0;
+    io0.SetSize(xsize, ysize);
+    EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                    /*bits_per_sample=*/16, format_orig,
+                                    /*pool=*/nullptr, &io0.Main()));
+
+    jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
+                                    /*bits_per_sample=*/8, format,
+                                    /*pool=*/nullptr, &io1.Main()));
+    // The lossy roundtrip must stay within the Butteraugli distance bound.
+    jxl::ButteraugliParams ba;
+    EXPECT_THAT(
+        ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                            /*distmap=*/nullptr, nullptr),
+        IsSlightlyBelow(0.7f));
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+// Opaque image encoded with noise enabled, decoded to RGB8 and RGBA8.
+TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
+  for (unsigned channels = 3; channels <= 4; channels++) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    size_t xsize = 512, ysize = 300;
+    size_t num_pixels = xsize * ysize;
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+    JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    jxl::TestCodestreamParams params;
+    params.cparams.noise = jxl::Override::kOn;
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+
+    JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+    // Decode into a buffer (no callback) that is set early.
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    JxlDecoderReset(dec);
+    EXPECT_EQ(num_pixels * channels, pixels2.size());
+    // Load the 16-bit original (io0) and the 8-bit decoded result (io1).
+    jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
+    jxl::CodecInOut io0;
+    io0.SetSize(xsize, ysize);
+    EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                    /*bits_per_sample=*/16, format_orig,
+                                    /*pool=*/nullptr, &io0.Main()));
+
+    jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
+                                    /*bits_per_sample=*/8, format,
+                                    /*pool=*/nullptr, &io1.Main()));
+    // This test uses a looser distance bound (1.7) than the noise-free one.
+    jxl::ButteraugliParams ba;
+    EXPECT_THAT(
+        ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                            /*distmap=*/nullptr, nullptr),
+        IsSlightlyBelow(1.7f));
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+// ProcessInput must return NEED_MORE_INPUT before SetInput and after release.
+TEST(DecodeTest, ProcessEmptyInputWithBoxes) {
+  size_t xsize = 123, ysize = 77;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  uint32_t channels = 3;
+  JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    jxl::TestCodestreamParams params;
+    params.box_format = (CodeStreamBoxFormat)i;
+    printf("Testing empty input with box format %d\n", (int)params.box_format);
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+    const int events =
+        JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_COLOR_ENCODING;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+    EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    const size_t remaining = JxlDecoderReleaseInput(dec);
+    EXPECT_LE(remaining, compressed.size());
+    EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+    JxlDecoderDestroy(dec);
+  }
+}
+
+// Without require_boxes, trailing bytes and trailing boxes stay unconsumed.
+TEST(DecodeTest, ExtraBytesAfterCompressedStream) {
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    CodeStreamBoxFormat box_format = (CodeStreamBoxFormat)i;
+    if (box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+    printf("Testing with box format %d\n", (int)box_format);
+    size_t last_unknown_box_size = 0;
+    if (box_format == kCSBF_Single_Other) {
+      last_unknown_box_size = unk1_box_size + 8;
+    } else if (box_format == kCSBF_Multi_Other_Terminated) {
+      last_unknown_box_size = unk3_box_size + 8;
+    } else if (box_format == kCSBF_Multi_Last_Empty_Other) {
+      // If boxes are not required, the decoder won't consume the last empty
+      // jxlp box.
+      last_unknown_box_size = 12 + unk3_box_size + 8;
+    }
+    jxl::TestCodestreamParams params;
+    params.box_format = box_format;
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+    // Add some more bytes after compressed data.
+    compressed.push_back(0);
+    compressed.push_back(1);
+    compressed.push_back(2);
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    uint32_t channels = 3;
+    JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    size_t unconsumed_bytes = JxlDecoderReleaseInput(dec);
+    EXPECT_EQ(last_unknown_box_size + 3, unconsumed_bytes);
+    EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+    JxlDecoderDestroy(dec);
+  }
+}
+
+// With require_boxes, only the 3 appended bytes are expected to be left over.
+TEST(DecodeTest, ExtraBytesAfterCompressedStreamRequireBoxes) {
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    CodeStreamBoxFormat box_format = (CodeStreamBoxFormat)i;
+    if (box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+    printf("Testing with box format %d\n", (int)box_format);
+    bool expect_success = (box_format == kCSBF_None ||
+                           box_format == kCSBF_Single_Zero_Terminated ||
+                           box_format == kCSBF_Multi_Zero_Terminated);
+    jxl::TestCodestreamParams params;
+    params.box_format = box_format;
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+    // Add some more bytes after compressed data.
+    compressed.push_back(0);
+    compressed.push_back(1);
+    compressed.push_back(2);
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    uint32_t channels = 3;
+    JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        /*use_resizable_runner=*/false, /*require_boxes=*/true, expect_success);
+    size_t unconsumed_bytes = JxlDecoderReleaseInput(dec);
+    EXPECT_EQ(3u, unconsumed_bytes);
+    EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+    JxlDecoderDestroy(dec);
+  }
+}
+
+// Decodes two concatenated streams, one decoder each, for all format pairs.
+TEST(DecodeTest, ConcatenatedCompressedStreams) {
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    CodeStreamBoxFormat first_box_format = (CodeStreamBoxFormat)i;
+    if (first_box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+    jxl::TestCodestreamParams params1;
+    params1.box_format = first_box_format;
+    jxl::PaddedBytes compressed1 = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params1);
+    for (int j = 0; j < kCSBF_NUM_ENTRIES; ++j) {
+      CodeStreamBoxFormat second_box_format = (CodeStreamBoxFormat)j;
+      if (second_box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+      printf("Testing with box format pair %d, %d\n", (int)first_box_format,
+             (int)second_box_format);
+      jxl::TestCodestreamParams params2;
+      params2.box_format = second_box_format;
+      jxl::PaddedBytes compressed2 = jxl::CreateTestJXLCodestream(
+          jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+          3, params2);
+      jxl::PaddedBytes concat;
+      concat.append(compressed1);
+      concat.append(compressed2);
+      uint32_t channels = 3;
+      JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+      size_t remaining = concat.size();
+      for (int part = 0; part < 2; ++part) {
+        printf("  Decoding part %d\n", part + 1);
+        JxlDecoder* dec = JxlDecoderCreate(nullptr);
+        size_t pos = concat.size() - remaining;
+        bool expect_success =
+            (part == 0 || second_box_format == kCSBF_None ||
+             second_box_format == kCSBF_Single_Zero_Terminated ||
+             second_box_format == kCSBF_Multi_Zero_Terminated);
+        std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+            dec, jxl::Span<const uint8_t>(concat.data() + pos, remaining),
+            format, /*use_callback=*/false, /*set_buffer_early=*/true,
+            /*use_resizable_runner=*/false, /*require_boxes=*/true,
+            expect_success);
+        EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+        remaining = JxlDecoderReleaseInput(dec);
+        JxlDecoderDestroy(dec);
+      }
+      EXPECT_EQ(0u, remaining);
+    }
+  }
+}
+
+void TestPartialStream(bool reconstructible_jpeg) {  // incremental-input test
+  size_t xsize = 123, ysize = 77;
+  uint32_t channels = 4;
+  if (reconstructible_jpeg) {
+    channels = 3;
+  }
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, channels, 0);
+  JxlPixelFormat format_orig = {channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  jxl::TestCodestreamParams params;
+  if (reconstructible_jpeg) {
+    params.cparams.color_transform = jxl::ColorTransform::kNone;
+  } else {
+    // Lossless to verify pixels exactly after roundtrip.
+    params.cparams.SetLossless();
+  }
+  // Destination for decoded pixels; compared with `pixels` in the pixel case.
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+  // JPEG output buffer; doubled whenever the decoder asks for more output.
+  jxl::PaddedBytes jpeg_output(64);
+  size_t used_jpeg_output = 0;
+  // One codestream (and, for JPEG, one reference JPEG) per box format.
+  std::vector<jxl::PaddedBytes> codestreams(kCSBF_NUM_ENTRIES);
+  std::vector<jxl::PaddedBytes> jpeg_codestreams(kCSBF_NUM_ENTRIES);
+  for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    params.box_format = (CodeStreamBoxFormat)i;
+    if (reconstructible_jpeg) {
+      params.jpeg_codestream = &jpeg_codestreams[i];
+    }
+    codestreams[i] = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        channels, params);
+  }
+
+  // Test multiple step sizes, to test different combinations of the streaming
+  // box parsing.
+  std::vector<size_t> increments = {1, 3, 17, 23, 120, 700, 1050};
+
+  for (size_t index = 0; index < increments.size(); index++) {
+    for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+      if (reconstructible_jpeg &&
+          (CodeStreamBoxFormat)i == CodeStreamBoxFormat::kCSBF_None) {
+        continue;
+      }
+      const jxl::PaddedBytes& data = codestreams[i];
+      const uint8_t* next_in = data.data();
+      size_t avail_in = 0;
+
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(
+                    dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE |
+                             JXL_DEC_JPEG_RECONSTRUCTION));
+
+      bool seen_basic_info = false;
+      bool seen_full_image = false;
+      bool seen_jpeg_recon = false;
+      // Number of bytes of `data` made available to the decoder so far.
+      size_t total_size = 0;
+
+      for (;;) {
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+        JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+        size_t remaining = JxlDecoderReleaseInput(dec);
+        EXPECT_LE(remaining, avail_in);
+        next_in += avail_in - remaining;
+        avail_in = remaining;
+        if (status == JXL_DEC_NEED_MORE_INPUT) {
+          if (total_size >= data.size()) {
+            // End of test data reached, it should have successfully decoded the
+            // image now.
+            FAIL();
+            break;
+          }
+
+          size_t increment = increments[index];
+          // End of the file reached, should be the final test.
+          if (total_size + increment > data.size()) {
+            increment = data.size() - total_size;
+          }
+          total_size += increment;
+          avail_in += increment;
+        } else if (status == JXL_DEC_BASIC_INFO) {
+          // This event should happen exactly once
+          EXPECT_FALSE(seen_basic_info);
+          if (seen_basic_info) break;
+          seen_basic_info = true;
+          JxlBasicInfo info;
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+          EXPECT_EQ(info.xsize, xsize);
+          EXPECT_EQ(info.ysize, ysize);
+        } else if (status == JXL_DEC_JPEG_RECONSTRUCTION) {
+          EXPECT_FALSE(seen_basic_info);
+          EXPECT_FALSE(seen_full_image);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetJPEGBuffer(dec, jpeg_output.data(),
+                                            jpeg_output.size()));
+          seen_jpeg_recon = true;
+        } else if (status == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+          EXPECT_TRUE(seen_jpeg_recon);
+          used_jpeg_output =
+              jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec);
+          jpeg_output.resize(jpeg_output.size() * 2);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetJPEGBuffer(
+                        dec, jpeg_output.data() + used_jpeg_output,
+                        jpeg_output.size() - used_jpeg_output));
+        } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetImageOutBuffer(
+                        dec, &format_orig, pixels2.data(), pixels2.size()));
+        } else if (status == JXL_DEC_FULL_IMAGE) {
+          // This event should happen exactly once
+          EXPECT_FALSE(seen_full_image);
+          if (seen_full_image) break;
+          // This event should happen after basic info
+          EXPECT_TRUE(seen_basic_info);
+          seen_full_image = true;
+          if (reconstructible_jpeg) {
+            used_jpeg_output =
+                jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec);
+            EXPECT_EQ(used_jpeg_output, jpeg_codestreams[i].size());
+            EXPECT_EQ(0, memcmp(jpeg_output.data(), jpeg_codestreams[i].data(),
+                                used_jpeg_output));
+          } else {
+            EXPECT_EQ(pixels, pixels2);
+          }
+        } else if (status == JXL_DEC_SUCCESS) {
+          EXPECT_TRUE(seen_full_image);
+          break;
+        } else {
+          // We do not expect any other events or errors
+          FAIL();
+          break;
+        }
+      }
+
+      // Ensure the decoder emitted the basic info and full image events
+      EXPECT_TRUE(seen_basic_info);
+      EXPECT_TRUE(seen_full_image);
+
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+// Tests the return status when trying to decode pixels from an incomplete
+// file: it should be JXL_DEC_NEED_MORE_INPUT, not an error.
+TEST(DecodeTest, PixelPartialTest) { TestPartialStream(false); }
+
+#if JPEGXL_ENABLE_JPEG
+// Tests the return status when decoding JPEG bytes from an incomplete file.
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGPartialTest)) {
+  TestPartialStream(true);
+}
+#endif  // JPEGXL_ENABLE_JPEG
+
+// The DC event still exists, but it is deprecated and no longer implemented.
+TEST(DecodeTest, DCNotGettableTest) {
+  // 1x1 pixel JXL image
+  std::string compressed(
+      "\377\n\0\20\260\23\0H\200("
+      "\0\334\0U\17\0\0\250P\31e\334\340\345\\\317\227\37:,"
+      "\246m\\gh\253m\vK\22E\306\261I\252C&pH\22\353 "
+      "\363\6\22\bp\0\200\237\34\231W2d\255$\1",
+      68);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(
+                dec, reinterpret_cast<const uint8_t*>(compressed.data()),
+                compressed.size()));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+
+  // Since the image is only 1x1 pixel, there is only 1 group, the decoder is
+  // unable to get DC size from this, and will not return the DC at all. Since
+  // no full image is requested either, it is expected to return success.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, PreviewTest) {  // decodes only the preview, for both modes
+  size_t xsize = 77, ysize = 120;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  for (jxl::PreviewMode mode : {jxl::kSmallPreview, jxl::kBigPreview}) {
+    jxl::TestCodestreamParams params;
+    params.preview_mode = mode;
+
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+
+    JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_PREVIEW_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+    // Build the expected preview from the original pixels for comparison.
+    jxl::ColorEncoding c_srgb = jxl::ColorEncoding::SRGB(false);
+    jxl::CodecInOut io0;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        c_srgb, /*bits_per_sample=*/16, format_orig, /*pool=*/nullptr,
+        &io0.Main()));
+    GeneratePreview(params.preview_mode, &io0.Main());
+
+    size_t xsize_preview = io0.Main().xsize();
+    size_t ysize_preview = io0.Main().ysize();
+    EXPECT_EQ(xsize_preview, info.preview.xsize);
+    EXPECT_EQ(ysize_preview, info.preview.ysize);
+    EXPECT_EQ(xsize_preview * ysize_preview * 3, buffer_size);
+
+    EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    std::vector<uint8_t> preview(buffer_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(),
+                                            preview.size()));
+
+    EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+    // Load the decoded 8-bit preview so it can be compared with io0.
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(preview.data(), preview.size()), xsize_preview,
+        ysize_preview, c_srgb,
+        /*bits_per_sample=*/8, format,
+        /*pool=*/nullptr, &io1.Main()));
+
+    jxl::ButteraugliParams ba;
+    // TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for
+    // tests) if xsize or ysize is < 8, no matter how different the images, a
+    // tiny size that could happen for a preview. ButteraugliDiffmap does
+    // support smaller than 8x8, but jxl's ButteraugliDistance does not. Perhaps
+    // move butteraugli's <8x8 handling from ButteraugliDiffmap to
+    // ButteraugliComparator::Diffmap in butteraugli.cc.
+    EXPECT_LE(ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                                  /*distmap=*/nullptr, nullptr),
+              mode == jxl::kSmallPreview ? 0.7f : 1.2f);
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, AlignTest) {  // decoding with a non-zero `align` in the format
+  size_t xsize = 123, ysize = 77;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // The source is encoded with 4 channels; it is decoded below with 3.
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  size_t align = 17;
+  JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align};
+  // On purpose not using jxl::RoundUpTo to test it independently.
+  size_t expected_line_bytes = (1 * 3 * xsize + align - 1) / align * align;
+  // Check both the buffer (0) and the callback (1) output paths.
+  for (int use_callback = 0; use_callback <= 1; ++use_callback) {
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        jxl::Span<const uint8_t>(compressed.data(), compressed.size()), format,
+        use_callback, /*set_buffer_early=*/false,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    EXPECT_EQ(expected_line_bytes * ysize, pixels2.size());
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format));
+  }
+}
+
+TEST(DecodeTest, AnimationTest) {  // two-frame lossless animation roundtrip
+  size_t xsize = 123, ysize = 77;
+  static const size_t num_frames = 2;
+  std::vector<uint8_t> frames[2];
+  frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1);
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Encoder-side container: 16-bit samples, sRGB, animation enabled.
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+  // Give every frame a distinct duration so the frame headers can be checked.
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Decode and test the animation frames
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      nullptr, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  // Expected per frame: FRAME, then NEED_IMAGE_OUT_BUFFER, then FULL_IMAGE.
+  for (size_t i = 0; i < num_frames; ++i) {
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+    EXPECT_EQ(0u, frame_header.name_length);
+    // For now, test with empty name, there's currently no easy way to encode
+    // a jxl file with a frame name because ImageBundle doesn't have a
+    // jxl::FrameHeader to set the name in. We can test the null termination
+    // character though.
+    char name;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameName(dec, &name, 1));
+    EXPECT_EQ(0, name);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, AnimationTestStreaming) {
+  // Encodes a lossless two-frame animation, then decodes it while feeding the
+  // input in step_size-byte increments, checking event order and pixels.
+  size_t xsize = 123, ysize = 77;
+  static const size_t num_frames = 2;
+  std::vector<uint8_t> frames[num_frames];
+  frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1);
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Decode and test the animation frames, adding step_size bytes per refill.
+  const size_t step_size = 16;
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = 0;
+  size_t frame_headers_seen = 0;
+  size_t frames_seen = 0;
+  bool seen_basic_info = false;
+  void* runner = JxlThreadParallelRunnerCreate(
+      nullptr, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  std::vector<uint8_t> frames2[num_frames];
+  for (size_t i = 0; i < num_frames; ++i) {
+    frames2[i].resize(frames[i].size());
+  }
+  size_t total_in = 0;
+  size_t loop_count = 0;
+  // Failures in the loop use ADD_FAILURE() + break instead of FAIL(): FAIL()
+  // returns from the test body and would skip destroying dec and runner.
+  for (;;) {
+    if (loop_count++ > compressed.size()) {
+      fprintf(stderr, "Too many loops\n");
+      ADD_FAILURE();
+      break;
+    }
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    auto status = JxlDecoderProcessInput(dec);
+    size_t remaining = JxlDecoderReleaseInput(dec);
+    EXPECT_LE(remaining, avail_in);
+    next_in += avail_in - remaining;
+    avail_in = remaining;
+
+    if (status == JXL_DEC_SUCCESS) {
+      break;
+    } else if (status == JXL_DEC_ERROR) {
+      ADD_FAILURE();
+      break;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+      if (total_in >= compressed.size()) {
+        fprintf(stderr, "Already gave all input data\n");
+        ADD_FAILURE();
+        break;
+      }
+      size_t amount = step_size;
+      if (total_in + amount > compressed.size()) {
+        amount = compressed.size() - total_in;
+      }
+      avail_in += amount;
+      total_in += amount;
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      if (frames_seen >= num_frames) {
+        ADD_FAILURE() << "Decoder requested more output buffers than frames";
+        break;
+      }
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                     dec, &format, frames2[frames_seen].data(),
+                                     frames2[frames_seen].size()));
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      EXPECT_EQ(false, seen_basic_info);
+      seen_basic_info = true;
+      JxlBasicInfo info;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+      EXPECT_EQ(xsize, info.xsize);
+      EXPECT_EQ(ysize, info.ysize);
+    } else if (status == JXL_DEC_FRAME) {
+      EXPECT_EQ(true, seen_basic_info);
+      frame_headers_seen++;
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      frames_seen++;
+      EXPECT_EQ(frame_headers_seen, frames_seen);
+    } else {
+      fprintf(stderr, "Unexpected status: %d\n", static_cast<int>(status));
+      ADD_FAILURE();
+      break;
+    }
+  }
+
+  EXPECT_EQ(true, seen_basic_info);
+  EXPECT_EQ(num_frames, frames_seen);
+  EXPECT_EQ(num_frames, frame_headers_seen);
+  for (size_t i = 0; i < num_frames; ++i) {
+    EXPECT_EQ(frames[i], frames2[i]);
+  }
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, ExtraChannelTest) {
+  // Losslessly encodes a 4-channel 16-bit image, decodes the color channels
+  // and extra channel 0 into separate 8-bit buffers, and compares both with
+  // the source pixels.
+  size_t xsize = 55, ysize = 257;
+  JxlPixelFormat format_src = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> src = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();  // Pixels must survive the roundtrip exactly.
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::PaddedBytes codestream = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(src.data(), src.size()), xsize, ysize, 4,
+      params);
+
+  // Decode as 3-channel 8-bit output; the align field is set to 17.
+  JxlPixelFormat format_out = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 17};
+  JxlDecoder* decoder = JxlDecoderCreate(nullptr);
+  const int events = JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(decoder, events));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(decoder, codestream.data(), codestream.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(decoder));
+
+  JxlBasicInfo basic_info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(decoder, &basic_info));
+  EXPECT_EQ(1u, basic_info.num_extra_channels);
+  EXPECT_EQ(JXL_FALSE, basic_info.alpha_premultiplied);
+
+  JxlExtraChannelInfo channel_info;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetExtraChannelInfo(decoder, 0, &channel_info));
+  EXPECT_EQ(0, channel_info.type);
+
+  // Query both buffer sizes once the decoder asks for an output buffer.
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(decoder));
+  size_t color_bytes;
+  size_t alpha_bytes;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(decoder, &format_out, &color_bytes));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderExtraChannelBufferSize(
+                                 decoder, &format_out, &alpha_bytes, 0));
+
+  std::vector<uint8_t> color_out(color_bytes);
+  std::vector<uint8_t> alpha_out(alpha_bytes);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(decoder, &format_out, color_out.data(),
+                                        color_out.size()));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetExtraChannelBuffer(decoder, &format_out,
+                                            alpha_out.data(), alpha_out.size(),
+                                            0));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(decoder));
+  // After the full image was output, the next JxlDecoderProcessInput call
+  // must return success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(decoder));
+  JxlDecoderDestroy(decoder);
+
+  EXPECT_EQ(0u, jxl::test::ComparePixels(src.data(), color_out.data(), xsize,
+                                         ysize, format_src, format_out));
+
+  // Expected alpha plane: every source pixel is 8 bytes (4 channels of 2
+  // bytes) and its last two bytes hold the alpha sample.
+  std::vector<uint8_t> alpha_src(src.size() / 4);
+  for (size_t px = 0; px * 8 < src.size(); ++px) {
+    alpha_src[2 * px + 0] = src[8 * px + 6];
+    alpha_src[2 * px + 1] = src[8 * px + 7];
+  }
+  JxlPixelFormat format_src_alpha = format_src;
+  JxlPixelFormat format_out_alpha = format_out;
+  format_src_alpha.num_channels = 1;
+  format_out_alpha.num_channels = 1;
+  EXPECT_EQ(0u, jxl::test::ComparePixels(alpha_src.data(), alpha_out.data(),
+                                         xsize, ysize, format_src_alpha,
+                                         format_out_alpha));
+}
+
+TEST(DecodeTest, SkipCurrentFrameTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 7;
+  std::vector<uint8_t> frames[num_frames];
+  for (size_t i = 0; i < num_frames; i++) {
+    frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i);
+  }
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Exercises JxlDecoderSkipCurrentFrame at each stage of frame decoding.
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+    if (i & 1) {
+      // Mark odd-indexed frames as referenceable, even-indexed ones not.
+      bundle.use_for_next_frame = true;
+    }
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+  // Encode with a custom progressive mode built from four pass definitions.
+  jxl::CompressParams cparams;
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  jxl::PassDefinition passes[] = {{2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 0, 1}};
+  jxl::ProgressiveMode progressive_mode{passes};
+  enc_state.progressive_splitter.SetProgressiveMode(progressive_mode);
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+  // Decode from a single input buffer, subscribing to progression events too.
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
+                                               JXL_DEC_FRAME_PROGRESSION |
+                                               JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kLastPasses));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  // Frames 2..5 are skipped at successive stages; the others decode fully.
+  for (size_t i = 0; i < num_frames; ++i) {
+    printf("Decoding frame %d\n", (int)i);
+    EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec));
+    std::vector<uint8_t> pixels(buffer_size);
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec));
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+    if (i == 2) {  // Skip right after the output buffer was set.
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(8, JxlDecoderGetIntendedDownsamplingRatio(dec));
+    if (i == 3) {  // Skip after the first progression event (ratio 8).
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(4, JxlDecoderGetIntendedDownsamplingRatio(dec));
+    if (i == 4) {  // Skip after the second progression event (ratio 4).
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(2, JxlDecoderGetIntendedDownsamplingRatio(dec));
+    if (i == 5) {  // Skip after the third progression event (ratio 2).
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, SkipFrameTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 16;
+  std::vector<uint8_t> frames[num_frames];
+  for (size_t i = 0; i < num_frames; i++) {
+    frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i);
+  }
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Exercises JxlDecoderSkipFrames, both between frames and within a frame.
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+    if (i & 1) {
+      // Mark odd-indexed frames as referenceable, even-indexed ones not.
+      bundle.use_for_next_frame = true;
+    }
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Decode and test the animation frames. Before frame index 3 is decoded,
+  // 5 frames are skipped, so decoding continues with frame index 8.
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    if (i == 3) {
+      JxlDecoderSkipFrames(dec, 5);
+      i += 5;  // The checks below now refer to frame index 8.
+    }
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  // Test rewinding the decoder and skipping different frames: 3 frames are
+  // skipped while frame index 9 is processed, so index 13 is checked next.
+  JxlDecoderRewind(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    int test_skipping = (i == 9) ? 3 : 0;
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
+    // should only skip the next frame, not the currently processed one.
+    if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+
+    if (test_skipping) i += test_skipping;  // Step over the skipped frames.
+  }
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, SkipFrameWithBlendingTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 16;
+  std::vector<uint8_t> frames[num_frames];
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Exercises JxlDecoderSkipFrames when frames blend with earlier frames.
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  // The first 5 rendered frames are each preceded by an internal frame.
+  for (size_t i = 0; i < num_frames; ++i) {
+    if (i < 5) {
+      std::vector<uint8_t> frame_internal =
+          jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2 + 1);
+      // An internal frame with 0 duration, and use_for_next_frame, this is a
+      // frame that is not rendered and not output by the API, but on which the
+      // rendered frames depend
+      jxl::ImageBundle bundle_internal(&io.metadata.m);
+      EXPECT_TRUE(ConvertFromExternal(
+          jxl::Span<const uint8_t>(frame_internal.data(),
+                                   frame_internal.size()),
+          xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle_internal));
+      bundle_internal.duration = 0;
+      bundle_internal.use_for_next_frame = true;
+      io.frames.push_back(std::move(bundle_internal));
+    }
+
+    std::vector<uint8_t> frame =
+        jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2);
+    // Actual rendered frame
+    frame_durations[i] = 5 + i;
+    jxl::ImageBundle bundle(&io.metadata.m);
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frame.data(), frame.size()), xsize, ysize,
+        jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    // Create some variation in which frames depend on which.
+    if (i != 3 && i != 9 && i != 10) {
+      bundle.use_for_next_frame = true;
+    }
+    if (i != 12) {
+      bundle.blend = true;
+      // Choose a blend mode that depends on the pixels of the saved frame and
+      // doesn't use alpha
+      bundle.blendmode = jxl::BlendMode::kMul;
+    }
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Independently decode all frames without any skipping, to create the
+  // expected blended frames, for the actual tests below to compare with.
+  {
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    void* runner = JxlThreadParallelRunnerCreate(
+        NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                   dec, JxlThreadParallelRunner, runner));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    for (size_t i = 0; i < num_frames; ++i) {
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+      frames[i].resize(xsize * ysize * 6);  // 3 channels, 2 bytes per sample.
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
+                                            frames[i].size()));
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    }
+
+    // After all frames were decoded, JxlDecoderProcessInput should return
+    // success to indicate all is done.
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+    JxlThreadParallelRunnerDestroy(runner);
+    JxlDecoderDestroy(dec);
+  }
+  // First pass: decode frames 0..8 in order, then rewind before the end.
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+
+    // Test rewinding mid-way, not decoding all frames.
+    if (i == 8) {
+      break;
+    }
+  }
+  // Second pass after rewinding: 5 frames are skipped before frame index 3.
+  JxlDecoderRewind(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    if (i == 3) {
+      JxlDecoderSkipFrames(dec, 5);
+      i += 5;  // The checks below now refer to frame index 8.
+    }
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  // Test rewinding the decoder and skipping different frames: 3 frames are
+  // skipped while frame index 9 is processed, so index 13 is checked next.
+  JxlDecoderRewind(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    int test_skipping = (i == 9) ? 3 : 0;
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
+    // should only skip the next frame, not the currently processed one.
+    if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+
+    if (test_skipping) i += test_skipping;  // Step over the skipped frames.
+  }
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 16;
+  std::vector<uint8_t> frames[num_frames + 5];
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames + 5);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations_c;
+  std::vector<uint32_t> frame_durations_nc;
+  std::vector<uint32_t> frame_xsize, frame_ysize, frame_x0, frame_y0;
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    size_t cropxsize = 1 + xsize * 2 / (i + 1);
+    size_t cropysize = 1 + ysize * 3 / (i + 2);
+    int cropx0 = i * 3 - 8;
+    int cropy0 = i * 4 - 7;
+    if (i < 5) {
+      std::vector<uint8_t> frame_internal =
+          jxl::test::GetSomeTestImage(xsize / 2, ysize / 2, 4, i * 2 + 1);
+      // An internal frame with 0 duration, and use_for_next_frame, this is a
+      // frame that is not rendered and not output by default by the API, but on
+      // which the rendered frames depend
+      jxl::ImageBundle bundle_internal(&io.metadata.m);
+      EXPECT_TRUE(ConvertFromExternal(
+          jxl::Span<const uint8_t>(frame_internal.data(),
+                                   frame_internal.size()),
+          xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle_internal));
+      bundle_internal.duration = 0;
+      bundle_internal.use_for_next_frame = true;
+      bundle_internal.origin = {13, 17};
+      io.frames.push_back(std::move(bundle_internal));
+      frame_durations_nc.push_back(0);
+      frame_xsize.push_back(xsize / 2);
+      frame_ysize.push_back(ysize / 2);
+      frame_x0.push_back(13);
+      frame_y0.push_back(17);
+    }
+
+    std::vector<uint8_t> frame =
+        jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
+    // Actual rendered frame
+    jxl::ImageBundle bundle(&io.metadata.m);
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
+        cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = 5 + i;
+    frame_durations_nc.push_back(5 + i);
+    frame_durations_c.push_back(5 + i);
+    frame_xsize.push_back(cropxsize);
+    frame_ysize.push_back(cropysize);
+    frame_x0.push_back(cropx0);
+    frame_y0.push_back(cropy0);
+    bundle.origin = {cropx0, cropy0};
+    // Create some variation in which frames depend on which.
+    if (i != 3 && i != 9 && i != 10) {
+      bundle.use_for_next_frame = true;
+    }
+    if (i != 12) {
+      bundle.blend = true;
+      bundle.blendmode = jxl::BlendMode::kBlend;
+    }
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+  // try both with and without coalescing
+  for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
+    // Independently decode all frames without any skipping, to create the
+    // expected blended frames, for the actual tests below to compare with.
+    {
+      JxlDecoder* dec = JxlDecoderCreate(NULL);
+      const uint8_t* next_in = compressed.data();
+      size_t avail_in = compressed.size();
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
+      void* runner = JxlThreadParallelRunnerCreate(
+          NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                     dec, JxlThreadParallelRunner, runner));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+      for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
+        EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+        if (coalescing) {
+          EXPECT_EQ(xsize * ysize * 8, buffer_size);
+        } else {
+          EXPECT_EQ(frame_xsize[i] * frame_ysize[i] * 8, buffer_size);
+        }
+        frames[i].resize(buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
+                                              frames[i].size()));
+        EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      }
+
+      // After all frames were decoded, JxlDecoderProcessInput should return
+      // success to indicate all is done.
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      JxlThreadParallelRunnerDestroy(runner);
+      JxlDecoderDestroy(dec);
+    }
+
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
+    void* runner = JxlThreadParallelRunnerCreate(
+        NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                   dec, JxlThreadParallelRunner, runner));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
+                                            JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+    for (size_t i = 0; i < num_frames; ++i) {
+      EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      std::vector<uint8_t> pixels(buffer_size);
+
+      JxlFrameHeader frame_header;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+      EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
+                frame_header.duration);
+
+      EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
+                                            pixels.size()));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      if (coalescing) {
+        EXPECT_EQ(frame_header.layer_info.xsize, xsize);
+      } else {
+        EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
+      }
+      if (coalescing) {
+        EXPECT_EQ(frame_header.layer_info.ysize, ysize);
+      } else {
+        EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
+      }
+      EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                             frame_header.layer_info.xsize,
+                                             frame_header.layer_info.ysize,
+                                             format, format));
+
+      // Test rewinding mid-way, not decoding all frames.
+      if (i == 8) {
+        break;
+      }
+    }
+
+    JxlDecoderRewind(dec);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+    for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
+      if (i == 3) {
+        JxlDecoderSkipFrames(dec, 5);
+        i += 5;
+      }
+
+      EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      std::vector<uint8_t> pixels(buffer_size);
+
+      JxlFrameHeader frame_header;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+      EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
+                frame_header.duration);
+
+      EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
+                frame_header.is_last);
+
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
+                                            pixels.size()));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      if (coalescing) {
+        EXPECT_EQ(frame_header.layer_info.xsize, xsize);
+        EXPECT_EQ(frame_header.layer_info.ysize, ysize);
+        EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
+        EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
+      } else {
+        EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
+        EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
+        EXPECT_EQ(frame_header.layer_info.crop_x0, frame_x0[i]);
+        EXPECT_EQ(frame_header.layer_info.crop_y0, frame_y0[i]);
+        EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
+                  i != 12 + 5 && frame_header.duration != 0
+                      ? 2
+                      : 0);  // kBlend or the default kReplace
+      }
+      EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                             frame_header.layer_info.xsize,
+                                             frame_header.layer_info.ysize,
+                                             format, format));
+    }
+
+    // After all frames were decoded, JxlDecoderProcessInput should return
+    // success to indicate all is done.
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+    // Test rewinding the decoder and skipping different frames
+
+    JxlDecoderRewind(dec);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+    for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
+      int test_skipping = (i == 9) ? 3 : 0;
+
+      EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      std::vector<uint8_t> pixels(buffer_size);
+
+      // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
+      // should only skip the next frame, not the currently processed one.
+      if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
+
+      JxlFrameHeader frame_header;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+      EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
+                frame_header.duration);
+
+      EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
+                frame_header.is_last);
+
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
+                                            pixels.size()));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                             frame_header.layer_info.xsize,
+                                             frame_header.layer_info.ysize,
+                                             format, format));
+
+      if (test_skipping) i += test_skipping;
+    }
+
+    JxlThreadParallelRunnerDestroy(runner);
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, OrientedCroppedFrameTest) {
+  const auto test = [](bool keep_orientation, uint32_t orientation,
+                       uint32_t resampling) {
+    size_t xsize = 90, ysize = 120;  // encoded size; o* below = decoded size
+    JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    size_t oxsize = (!keep_orientation && orientation > 4 ? ysize : xsize);
+    size_t oysize = (!keep_orientation && orientation > 4 ? xsize : ysize);
+    jxl::CodecInOut io;
+    io.SetSize(xsize, ysize);
+    io.metadata.m.SetUintSamples(16);
+    io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+    io.metadata.m.orientation = orientation;
+    io.frames.clear();
+    io.SetSize(xsize, ysize);
+
+    for (size_t i = 0; i < 3; ++i) {  // three layers, each with its own crop
+      size_t cropxsize = 1 + xsize * 2 / (i + 1);  // 181, 91, 61
+      size_t cropysize = 1 + ysize * 3 / (i + 2);  // 181, 121, 91
+      int cropx0 = i * 3 - 8;  // -8, -5, -2 (size_t math wraps, then int)
+      int cropy0 = i * 4 - 7;  // -7, -3, 1
+
+      std::vector<uint8_t> frame =
+          jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
+      jxl::ImageBundle bundle(&io.metadata.m);
+      EXPECT_TRUE(ConvertFromExternal(
+          jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
+          cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle));
+      bundle.origin = {cropx0, cropy0};
+      bundle.use_for_next_frame = true;
+      io.frames.push_back(std::move(bundle));
+    }
+
+    jxl::CompressParams cparams;
+    cparams
+        .SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+    cparams.speed_tier = jxl::SpeedTier::kThunder;
+    cparams.resampling = resampling;
+    jxl::AuxOut aux_out;
+    jxl::PaddedBytes compressed;
+    jxl::PassesEncoderState enc_state;
+    EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                                jxl::GetJxlCms(), &aux_out, nullptr));
+
+    // 0 is merged frame as decoded with coalescing enabled (default)
+    // 1-3 are non-coalesced frames as decoded with coalescing disabled
+    // 4 is the manually merged frame
+    std::vector<uint8_t> frames[5];
+    frames[4].resize(xsize * ysize * 8, 0);  // 8 bytes/pixel: 4 x uint16
+
+    // Decode once with coalescing enabled, then once with it disabled.
+    for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
+      // Decode the coalesced image into frames[0], or the separate layers
+      // into frames[1..3] while merging each of them by hand into frames[4].
+      {
+        JxlDecoder* dec = JxlDecoderCreate(NULL);
+        const uint8_t* next_in = compressed.data();
+        size_t avail_in = compressed.size();
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetKeepOrientation(dec, keep_orientation));
+        void* runner = JxlThreadParallelRunnerCreate(
+            NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                       dec, JxlThreadParallelRunner, runner));
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+        for (size_t i = (coalescing ? 0 : 1); i < (coalescing ? 1 : 4); ++i) {
+          EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+          JxlFrameHeader frame_header;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderGetFrameHeader(dec, &frame_header));
+          size_t buffer_size;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+          if (coalescing) {
+            EXPECT_EQ(xsize * ysize * 8, buffer_size);
+          } else {
+            EXPECT_EQ(frame_header.layer_info.xsize *
+                          frame_header.layer_info.ysize * 8,
+                      buffer_size);
+          }
+          frames[i].resize(buffer_size);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
+                                                frames[i].size()));
+          EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+          EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
+                    JXL_BLEND_REPLACE);
+          if (coalescing) {
+            EXPECT_EQ(frame_header.layer_info.xsize, oxsize);
+            EXPECT_EQ(frame_header.layer_info.ysize, oysize);
+            EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
+            EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
+          } else {
+            // Manually copy this layer into frames[4], clipped to the output.
+            int x0 = frame_header.layer_info.crop_x0;
+            int y0 = frame_header.layer_info.crop_y0;
+            int w = frame_header.layer_info.xsize;
+            int h = frame_header.layer_info.ysize;
+            for (int y = 0; y < static_cast<int>(oysize); y++) {
+              if (y < y0 || y >= y0 + h) continue;  // row not in this layer
+              // pointers do whole 16-bit RGBA pixels at a time
+              uint64_t* row_merged = static_cast<uint64_t*>(
+                  (void*)(frames[4].data() + y * oxsize * 8));
+              uint64_t* row_layer = static_cast<uint64_t*>(
+                  (void*)(frames[i].data() + (y - y0) * w * 8));
+              for (int x = 0; x < static_cast<int>(oxsize); x++) {
+                if (x < x0 || x >= x0 + w) continue;  // column not in layer
+                row_merged[x] = row_layer[x - x0];  // plain replace, no blend
+              }
+            }
+          }
+        }
+
+        // After all frames were decoded, JxlDecoderProcessInput should return
+        // success to indicate all is done.
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+        JxlThreadParallelRunnerDestroy(runner);
+        JxlDecoderDestroy(dec);
+      }
+    }
+
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[0].data(), frames[4].data(),
+                                           oxsize, oysize, format, format));
+  };
+
+  for (bool keep_orientation : {true, false}) {
+    for (uint32_t orientation = 1; orientation <= 8; orientation++) {
+      for (uint32_t resampling : {1, 2, 4, 8}) {
+        SCOPED_TRACE(testing::Message()
+                     << "keep_orientation: " << keep_orientation << ", "
+                     << "orientation: " << orientation << ", "
+                     << "resampling: " << resampling);
+        test(keep_orientation, orientation, resampling);
+      }
+    }
+  }
+}
+
+struct FramePositions {  // File offsets of one frame (see AnalyzeCodestream).
+  size_t frame_start;  // where the frame header begins
+  size_t header_end;  // end of the frame header, rounded up to a whole byte
+  size_t toc_end;  // end of the TOC, i.e. where the sections begin
+  std::vector<size_t> section_end;  // end of section i, one per TOC entry
+};
+
+struct StreamPositions {  // File offsets filled in by AnalyzeCodestream.
+  size_t codestream_start;  // first codestream byte; 0 for a bare codestream
+  size_t codestream_end;  // file offset matching the end of the codestream
+  size_t basic_info;  // after size header + image metadata (whole bytes)
+  size_t jbrd_end = 0;  // end of the jbrd box; stays 0 if there is none
+  std::vector<size_t> box_start;  // start of every box; empty if unboxed
+  std::vector<FramePositions> frames;  // all frames in order, incl. preview
+};
+
+void AnalyzeCodestream(const jxl::PaddedBytes& data,
+                       StreamPositions* streampos) {  // fills *streampos
+  // Unbox data to codestream and mark where it is broken up by boxes.
+  std::vector<uint8_t> codestream;
+  std::vector<std::pair<size_t, size_t>> breakpoints;  // {cs pos, bytes to add}
+  bool codestream_end = false;
+  ASSERT_LE(2, data.size());
+  if (data[0] == 0xff && data[1] == 0x0a) {  // bare codestream, no boxes
+    codestream = std::vector<uint8_t>(data.begin(), data.end());
+    streampos->codestream_start = 0;
+  } else {
+    const uint8_t* in = data.data();
+    size_t pos = 0;
+    while (pos < data.size()) {
+      ASSERT_LE(pos + 8, data.size());
+      streampos->box_start.push_back(pos);
+      size_t box_size = LoadBE32(in + pos);
+      if (box_size == 0) box_size = data.size() - pos;  // 0: runs to file end
+      ASSERT_LE(pos + box_size, data.size());
+      if (memcmp(in + pos + 4, "jxlc", 4) == 0) {
+        EXPECT_TRUE(codestream.empty());
+        streampos->codestream_start = pos + 8;
+        codestream.insert(codestream.end(), in + pos + 8, in + pos + box_size);
+        codestream_end = true;
+      } else if (memcmp(in + pos + 4, "jxlp", 4) == 0) {
+        codestream_end = (LoadBE32(in + pos + 8) & 0x80000000);  // top bit set
+        if (codestream.empty()) {
+          streampos->codestream_start = pos + 12;
+        } else if (box_size > 12 || !codestream_end) {
+          breakpoints.push_back({codestream.size(), 12});  // jxlp header bytes
+        }
+        codestream.insert(codestream.end(), in + pos + 12, in + pos + box_size);
+      } else if (memcmp(in + pos + 4, "jbrd", 4) == 0) {
+        EXPECT_TRUE(codestream.empty());
+        streampos->jbrd_end = pos + box_size;
+      } else if (!codestream.empty() && !codestream_end) {
+        breakpoints.push_back({codestream.size(), box_size});  // other box
+      }
+      pos += box_size;
+    }
+    ASSERT_EQ(pos, data.size());
+  }
+  // Translate codestream positions to boxed stream positions.
+  size_t offset = streampos->codestream_start;
+  size_t bp = 0;
+  auto add_offset = [&](size_t pos) {  // stateful: call with non-decreasing pos
+    while (bp < breakpoints.size() && pos >= breakpoints[bp].first) {
+      offset += breakpoints[bp++].second;
+    }
+    return pos + offset;
+  };
+  // Analyze the unboxed codestream.
+  jxl::BitReader br(
+      jxl::Span<const uint8_t>(codestream.data(), codestream.size()));
+  ASSERT_EQ(br.ReadFixedBits<16>(), 0x0AFF);  // signature bytes ff 0a
+  jxl::CodecMetadata metadata;
+  EXPECT_TRUE(ReadSizeHeader(&br, &metadata.size));
+  EXPECT_TRUE(ReadImageMetadata(&br, &metadata.m));
+  streampos->basic_info =
+      add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte);
+  metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+  EXPECT_TRUE(jxl::Bundle::Read(&br, &metadata.transform_data));
+  EXPECT_TRUE(br.JumpToByteBoundary());
+  bool has_preview = metadata.m.have_preview;
+  while (br.TotalBitsConsumed() < br.TotalBytes() * jxl::kBitsPerByte) {
+    FramePositions p;
+    p.frame_start = add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte);
+    jxl::FrameHeader frame_header(&metadata);
+    if (has_preview) {  // only the first frame is treated as the preview
+      frame_header.nonserialized_is_preview = true;
+      has_preview = false;
+    }
+    EXPECT_TRUE(ReadFrameHeader(&br, &frame_header));
+    p.header_end =
+        add_offset(jxl::DivCeil(br.TotalBitsConsumed(), jxl::kBitsPerByte));
+    jxl::FrameDimensions frame_dim = frame_header.ToFrameDimensions();
+    uint64_t groups_total_size;
+    const size_t toc_entries = jxl::NumTocEntries(
+        frame_dim.num_groups, frame_dim.num_dc_groups,
+        frame_header.passes.num_passes, /*has_ac_global=*/true);
+    std::vector<uint64_t> section_offsets;
+    std::vector<uint32_t> section_sizes;
+    EXPECT_TRUE(ReadGroupOffsets(toc_entries, &br, &section_offsets,
+                                 &section_sizes, &groups_total_size));
+    EXPECT_EQ(br.TotalBitsConsumed() % jxl::kBitsPerByte, 0);
+    size_t sections_start = br.TotalBitsConsumed() / jxl::kBitsPerByte;
+    p.toc_end = add_offset(sections_start);
+    for (size_t i = 0; i < toc_entries; ++i) {
+      size_t end = sections_start + section_offsets[i] + section_sizes[i];
+      p.section_end.push_back(add_offset(end));
+    }
+    br.SkipBits(groups_total_size * jxl::kBitsPerByte);  // skip section data
+    streampos->frames.push_back(p);
+  }
+  streampos->codestream_end = add_offset(codestream.size());
+  EXPECT_EQ(br.TotalBitsConsumed(), br.TotalBytes() * jxl::kBitsPerByte);
+  EXPECT_TRUE(br.Close());
+}
+
+enum ExpectedFlushState { NO_FLUSH, SAME_FLUSH, NEW_FLUSH };  // see Breakpoint
+struct Breakpoint {  // One input step checked by VerifyProgression.
+  size_t file_pos;  // the decoder is given the file bytes up to this offset
+  ExpectedFlushState expect_flush;  // flush must fail / not improve / improve
+};
+
+void VerifyProgression(size_t xsize, size_t ysize, uint32_t num_channels,
+                       const std::vector<uint8_t>& pixels,
+                       const jxl::PaddedBytes& data,
+                       std::vector<Breakpoint> breakpoints) {  // non-empty
+  // Size large enough for multiple groups, required to have progressive stages.
+  ASSERT_LT(256, xsize);
+  ASSERT_LT(256, ysize);
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  int bp = 0;  // index of the breakpoint whose input is currently available
+  const uint8_t* next_in = data.data();
+  size_t avail_in = breakpoints[bp].file_pos;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  double prev_dist = 1.0;  // distance at the last NEW_FLUSH; must shrink
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+    printf("bp: %d  status: 0x%x\n", bp, (int)status);
+    if (status == JXL_DEC_BASIC_INFO) {
+      JxlBasicInfo info;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+      EXPECT_EQ(info.xsize, xsize);
+      EXPECT_EQ(info.ysize, ysize);
+      // Output buffer/callback not yet set
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      EXPECT_EQ(pixels2.size(), buffer_size);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                            pixels2.size()));
+    } else if (status == JXL_DEC_FRAME) {
+      // Nothing to do.
+    } else if (status == JXL_DEC_SUCCESS) {
+      EXPECT_EQ(bp + 1, breakpoints.size());  // every breakpoint was used
+      break;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT ||
+               status == JXL_DEC_FULL_IMAGE) {
+      if (breakpoints[bp].expect_flush == NO_FLUSH) {
+        EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+      } else {
+        if (status != JXL_DEC_FULL_IMAGE) {  // no explicit flush on full image
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+        }
+        double dist = jxl::test::DistanceRMS(pixels2.data(), pixels.data(),
+                                             xsize, ysize, format);
+        if (breakpoints[bp].expect_flush == NEW_FLUSH) {
+          EXPECT_LT(dist, prev_dist);
+          prev_dist = dist;
+        } else {
+          EXPECT_EQ(dist, prev_dist);  // SAME_FLUSH: exactly the same distance
+        }
+      }
+      if (status == JXL_DEC_FULL_IMAGE) {
+        EXPECT_EQ(bp + 1, breakpoints.size());
+        continue;  // no new input is supplied; just process again
+      }
+      ASSERT_LT(++bp, breakpoints.size());
+      next_in += avail_in - JxlDecoderReleaseInput(dec);  // skip consumed bytes
+      avail_in = breakpoints[bp].file_pos - (next_in - data.data());
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    } else {
+      printf("Unexpected status: 0x%x\n", (int)status);
+      FAIL();  // unexpected returned status
+    }
+  }
+  JxlDecoderDestroy(dec);  // NOTE: skipped if ASSERT_LT/FAIL above returns early
+}
+
+TEST(DecodeTest, ProgressionTest) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.progressive_dc = 1;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  StreamPositions streampos;
+  AnalyzeCodestream(data, &streampos);
+  const std::vector<FramePositions>& fp = streampos.frames;
+  // We have preview, dc frame and regular frame.
+  EXPECT_EQ(3, fp.size());
+  EXPECT_EQ(7, fp[2].section_end.size());  // the 7 sections listed below
+  EXPECT_EQ(data.size(), fp[2].section_end[6]);  // last section ends the file
+  std::vector<Breakpoint> breakpoints{  // {bytes available, expected flush}
+      {fp[0].frame_start, NO_FLUSH},           // headers
+      {fp[1].frame_start, NO_FLUSH},           // preview
+      {fp[2].frame_start, NO_FLUSH},           // dc frame
+      {fp[2].section_end[0], NO_FLUSH},        // DC global
+      {fp[2].section_end[1] - 1, NO_FLUSH},    // partial DC group
+      {fp[2].section_end[1], NEW_FLUSH},       // DC group
+      {fp[2].section_end[2], SAME_FLUSH},      // AC global
+      {fp[2].section_end[3], NEW_FLUSH},       // AC group 0
+      {fp[2].section_end[4] - 1, SAME_FLUSH},  // partial AC group 1
+      {fp[2].section_end[4], NEW_FLUSH},       // AC group 1
+      {fp[2].section_end[5], NEW_FLUSH},       // AC group 2
+      {data.size() - 1, SAME_FLUSH},           // partial AC group 3
+      {data.size(), NEW_FLUSH}};               // full image
+  VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints);
+}
+
+TEST(DecodeTest, ProgressionTestLosslessAlpha) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.cparams.responsive = 1;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  StreamPositions streampos;
+  AnalyzeCodestream(data, &streampos);
+  const std::vector<FramePositions>& fp = streampos.frames;
+  // Only one frame here: no preview_mode or progressive_dc is requested.
+  EXPECT_EQ(1, fp.size());
+  EXPECT_EQ(7, fp[0].section_end.size());  // the 7 sections listed below
+  EXPECT_EQ(data.size(), fp[0].section_end[6]);  // last section ends the file
+  std::vector<Breakpoint> breakpoints{  // {bytes available, expected flush}
+      {fp[0].frame_start, NO_FLUSH},           // headers
+      {fp[0].section_end[0] - 1, NO_FLUSH},    // partial DC global
+      {fp[0].section_end[0], NEW_FLUSH},       // DC global
+      {fp[0].section_end[1], SAME_FLUSH},      // DC group
+      {fp[0].section_end[2], SAME_FLUSH},      // AC global
+      {fp[0].section_end[3], NEW_FLUSH},       // AC group 0
+      {fp[0].section_end[4] - 1, SAME_FLUSH},  // partial AC group 1
+      {fp[0].section_end[4], NEW_FLUSH},       // AC group 1
+      {fp[0].section_end[5], NEW_FLUSH},       // AC group 2
+      {data.size() - 1, SAME_FLUSH},           // partial AC group 3
+      {data.size(), NEW_FLUSH}};               // full image
+  VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints);
+}
+
+void VerifyFilePosition(size_t expected_pos, const jxl::PaddedBytes& data,
+                        JxlDecoder* dec) {
+  size_t remaining = JxlDecoderReleaseInput(dec);  // bytes not yet consumed
+  size_t pos = data.size() - remaining;  // assumes input ran to end of data
+  EXPECT_EQ(expected_pos, pos);
+  EXPECT_EQ(JXL_DEC_SUCCESS,  // give the unconsumed tail back to the decoder
+            JxlDecoderSetInput(dec, data.data() + pos, remaining));
+}
+
+TEST(DecodeTest, InputHandlingTestOneShot) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {  // every box format
+    printf("Testing with box format %d\n", i);
+    jxl::TestCodestreamParams params;
+    params.cparams.progressive_dc = 1;
+    params.preview_mode = jxl::kSmallPreview;
+    params.box_format = (CodeStreamBoxFormat)i;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        num_channels, params);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    StreamPositions streampos;
+    AnalyzeCodestream(data, &streampos);
+    const std::vector<FramePositions>& fp = streampos.frames;
+    // We have preview, dc frame and regular frame.
+    EXPECT_EQ(3, fp.size());
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    int kNumEvents = 6;
+    int events[] = {
+        JXL_DEC_BASIC_INFO, JXL_DEC_COLOR_ENCODING, JXL_DEC_PREVIEW_IMAGE,
+        JXL_DEC_FRAME,      JXL_DEC_FULL_IMAGE,     JXL_DEC_FRAME_PROGRESSION,
+    };
+    size_t end_positions[] = {  // expected final file position of run j
+        streampos.basic_info,     fp[0].frame_start,
+        fp[1].frame_start,        fp[2].toc_end,
+        streampos.codestream_end, streampos.codestream_end};
+    int events_wanted = 0;  // run j subscribes to events[0..j]
+    for (int j = 0; j < kNumEvents; ++j) {
+      events_wanted |= events[j];
+      size_t end_pos = end_positions[j];
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, data.data(), data.size()));
+      EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(streampos.basic_info, data, dec);
+      if (j >= 1) {  // JXL_DEC_COLOR_ENCODING is subscribed
+        EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].frame_start, data, dec);
+      }
+      if (j >= 2) {  // JXL_DEC_PREVIEW_IMAGE is subscribed
+        EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_GE(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(),
+                                                buffer_size));
+        EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].frame_start, data, dec);
+      }
+      if (j >= 3) {  // JXL_DEC_FRAME is subscribed
+        EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[2].toc_end, data, dec);
+        if (j >= 5) {  // JXL_DEC_FRAME_PROGRESSION is subscribed
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kDC));
+        }
+      }
+      if (j >= 4) {  // JXL_DEC_FULL_IMAGE is subscribed
+        EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[2].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_EQ(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                              pixels2.size()));
+        if (j >= 5) {  // JXL_DEC_FRAME_PROGRESSION is subscribed
+          EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+          VerifyFilePosition(fp[2].section_end[1], data, dec);
+        }
+        EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(streampos.codestream_end, data, dec);
+      }
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(end_pos, data, dec);
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+#if JPEGXL_ENABLE_JPEG
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(InputHandlingTestJPEGOneshot)) {
+  size_t xsize = 123;
+  size_t ysize = 77;
+  size_t channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0);
+  for (int i = 1; i < kCSBF_NUM_ENTRIES; ++i) {  // NOTE: format 0 is skipped
+    printf("Testing with box format %d\n", i);
+    jxl::PaddedBytes jpeg_codestream;
+    jxl::TestCodestreamParams params;
+    params.cparams.color_transform = jxl::ColorTransform::kNone;
+    params.jpeg_codestream = &jpeg_codestream;
+    params.preview_mode = jxl::kSmallPreview;
+    params.box_format = (CodeStreamBoxFormat)i;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        channels, params);
+    JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    StreamPositions streampos;
+    AnalyzeCodestream(data, &streampos);
+    const std::vector<FramePositions>& fp = streampos.frames;
+    // We have preview and regular frame.
+    EXPECT_EQ(2, fp.size());
+    EXPECT_LT(0, streampos.jbrd_end);  // a jbrd box must have been found
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    int kNumEvents = 6;
+    int events[] = {JXL_DEC_BASIC_INFO,     JXL_DEC_JPEG_RECONSTRUCTION,
+                    JXL_DEC_COLOR_ENCODING, JXL_DEC_PREVIEW_IMAGE,
+                    JXL_DEC_FRAME,          JXL_DEC_FULL_IMAGE};
+    size_t end_positions[] = {streampos.basic_info, streampos.basic_info,
+                              fp[0].frame_start,    fp[1].frame_start,
+                              fp[1].toc_end,        streampos.codestream_end};
+    int events_wanted = 0;  // run j subscribes to events[0..j]
+    for (int j = 0; j < kNumEvents; ++j) {
+      printf("j = %d\n", j);
+      events_wanted |= events[j];
+      size_t end_pos = end_positions[j];
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, data.data(), data.size()));
+      if (j >= 1) {  // JXL_DEC_JPEG_RECONSTRUCTION: expected before basic info
+        EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(streampos.jbrd_end, data, dec);
+      }
+      EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(streampos.basic_info, data, dec);
+      if (j >= 2) {  // JXL_DEC_COLOR_ENCODING is subscribed
+        EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].frame_start, data, dec);
+      }
+      if (j >= 3) {  // JXL_DEC_PREVIEW_IMAGE is subscribed
+        EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_GE(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(),
+                                                buffer_size));
+        EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].frame_start, data, dec);
+      }
+      if (j >= 4) {  // JXL_DEC_FRAME is subscribed
+        EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].toc_end, data, dec);
+      }
+      if (j >= 5) {  // JXL_DEC_FULL_IMAGE is subscribed
+        EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_EQ(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                              pixels2.size()));
+        EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(streampos.codestream_end, data, dec);
+      }
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(end_pos, data, dec);
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+#endif  // JPEGXL_ENABLE_JPEG
+
+TEST(DecodeTest, InputHandlingTestStreaming) {
+  // Streams each box format in growing windows; checks the offset per event.
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    printf("Testing with box format %d\n", i);
+    fflush(stdout);
+    jxl::TestCodestreamParams params;
+    params.cparams.progressive_dc = 1;
+    params.box_format = (CodeStreamBoxFormat)i;
+    params.preview_mode = jxl::kSmallPreview;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        num_channels, params);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    StreamPositions streampos;
+    AnalyzeCodestream(data, &streampos);
+    const std::vector<FramePositions>& fp = streampos.frames;
+    // Expect preview, DC frame and regular frame; frames[2] is indexed below.
+    ASSERT_EQ(3u, fp.size());
+    std::vector<uint8_t> pixels2(pixels.size());
+    int events_wanted =
+        (JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_PREVIEW_IMAGE |
+         JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION |
+         JXL_DEC_BOX);
+    for (size_t increment : {1, 7, 27, 1024}) {
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted));
+      size_t file_pos = 0;
+      size_t box_index = 0;
+      size_t avail_in = 0;  // the first call is given no bytes at all
+      for (;;) {
+        const uint8_t* next_in = data.data() + file_pos;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+        JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+        size_t remaining = JxlDecoderReleaseInput(dec);
+        size_t consumed = avail_in - remaining;
+        file_pos += consumed;
+        avail_in += increment;  // widen the window for the next call
+        avail_in = std::min<size_t>(avail_in, data.size() - file_pos);
+        if (status == JXL_DEC_BASIC_INFO) {
+          EXPECT_EQ(file_pos, streampos.basic_info);
+        } else if (status == JXL_DEC_COLOR_ENCODING) {
+          EXPECT_EQ(file_pos, streampos.frames[0].frame_start);
+        } else if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) {
+          EXPECT_EQ(file_pos, streampos.frames[0].toc_end);
+          size_t buffer_size;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+          EXPECT_GE(pixels2.size(), buffer_size);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(),
+                                                  buffer_size));
+        } else if (status == JXL_DEC_PREVIEW_IMAGE) {
+          EXPECT_EQ(file_pos, streampos.frames[1].frame_start);
+        } else if (status == JXL_DEC_FRAME) {
+          EXPECT_EQ(file_pos, streampos.frames[2].toc_end);
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kDC));
+        } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+          EXPECT_EQ(file_pos, streampos.frames[2].toc_end);
+          size_t buffer_size;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+          EXPECT_EQ(pixels2.size(), buffer_size);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                                pixels2.size()));
+        } else if (status == JXL_DEC_FRAME_PROGRESSION) {
+          EXPECT_EQ(file_pos, streampos.frames[2].section_end[1]);
+        } else if (status == JXL_DEC_FULL_IMAGE) {
+          EXPECT_EQ(file_pos, streampos.codestream_end);
+        } else if (status == JXL_DEC_SUCCESS) {
+          EXPECT_EQ(file_pos, streampos.codestream_end);
+          break;
+        } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+          EXPECT_LT(remaining, 12);
+          if ((i == kCSBF_None && file_pos >= 2) ||
+              (box_index > 0 && box_index < streampos.box_start.size() &&
+               file_pos >= streampos.box_start[box_index - 1] + 12 &&
+               file_pos < streampos.box_start[box_index])) {
+            EXPECT_EQ(remaining, 0);
+          }
+          if (file_pos == data.size()) break;  // nothing left to supply
+        } else if (status == JXL_DEC_BOX) {
+          ASSERT_LT(box_index, streampos.box_start.size());
+          EXPECT_EQ(file_pos, streampos.box_start[box_index++]);
+        } else {
+          printf("Unexpected status: 0x%x\n", (int)status);
+          FAIL();
+        }
+      }
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+TEST(DecodeTest, FlushTest) {
+  // Checks JxlDecoderFlushImage after feeding all but the last byte.
+  // The size gives multiple groups, which progressive stages require.
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Output buffer for the decoder, sized like the source pixel data.
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() - 1;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  // Crude test of actual pixel data: pixel threshold of about 4% (2560/65535).
+  // Up to 29000 pixels may be above the threshold.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            29000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+  // Hand back the unconsumed tail together with the withheld last byte.
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  // Same threshold, but fewer pixels may exceed it in the final lossy image.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            11000u);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, FlushTestImageOutCallback) {
+  // Checks JxlDecoderFlushImage when output goes through an image callback.
+  // The size gives multiple groups, which progressive stages require.
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+  // Copies each delivered run of pixels into pixels2 (2 bytes per sample).
+  size_t bytes_per_pixel = format.num_channels * 2;
+  size_t stride = bytes_per_pixel * xsize;
+  auto callback = [&](size_t x, size_t y, size_t num_pixels,
+                      const void* pixels_row) {
+    memcpy(pixels2.data() + stride * y + bytes_per_pixel * x, pixels_row,
+           num_pixels * bytes_per_pixel);
+  };
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() - 1;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output callback not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+  // The captureless lambda forwards to `callback`, which is passed as opaque.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutCallback(
+                                 dec, &format,
+                                 [](void* opaque, size_t x, size_t y,
+                                    size_t xsize, const void* pixels_row) {
+                                   auto cb =
+                                       static_cast<decltype(&callback)>(opaque);
+                                   (*cb)(x, y, xsize, pixels_row);
+                                 },
+                                 /*opaque=*/&callback));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  // Crude test of actual pixel data: pixel threshold of about 4% (2560/65535).
+  // Up to 29000 pixels may be above the threshold.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            29000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+  // Hand back the unconsumed tail together with the withheld last byte.
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  // Same threshold, but fewer pixels may exceed it in the final lossy image.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            11000u);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, FlushTestLossyProgressiveAlpha) {
+  // Flush test on a 4-channel image, feeding all but the last byte first.
+  // The size gives multiple groups, which progressive stages require.
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() - 1;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+  // Crude check: up to 30000 pixels may be above the 2560 threshold.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            30000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  // Fewer pixels may exceed the threshold once the whole stream is decoded.
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            11000u);
+
+  JxlDecoderDestroy(dec);
+}
+TEST(DecodeTest, FlushTestLossyProgressiveAlphaUpsampling) {
+  size_t xsize = 533, ysize = 401;  // flush test with resampling enabled
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.resampling = 2;
+  params.cparams.ec_resampling = 4;  // presumably extra channels (alpha here)
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work. Here the first two thirds are fed.
+  size_t first_part = data.size() * 2 / 3;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+  // Crude check: up to 125000 pixels may be above the 2560 threshold.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            125000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  // Fewer pixels may exceed the threshold once the whole stream is decoded.
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            70000u);
+
+  JxlDecoderDestroy(dec);
+}
+TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {
+  // Flush test on a lossless 4-channel stream, feeding the first half first.
+  // The size gives multiple groups, which progressive stages require.
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.cparams.responsive = 1;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() / 2;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+  // The flushed partial image may exceed the threshold in up to 2700 pixels.
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            2700u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  // Lossless: no differing pixels are tolerated in the final image.
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format),
+            0u);
+
+  JxlDecoderDestroy(dec);
+}
+
+class DecodeProgressiveTest : public ::testing::TestWithParam<int> {};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeProgressiveTestInstantiation,
+                                   DecodeProgressiveTest,
+                                   ::testing::Range(0, 8));
+TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
+  const int params = GetParam();  // bits: single group, lossless, 4 channels
+  int single_group = params & 1;
+  int lossless = (params >> 1) & 1;
+  uint32_t num_channels = 3 + ((params >> 2) & 1);
+  std::set<JxlProgressiveDetail> progressive_details = {kDC, kLastPasses,
+                                                        kPasses};
+  for (auto prog_detail : progressive_details) {
+    // Only few combinations are expected to support outputting
+    // intermediate flushes for complete DC and complete passes.
+    // The test can be updated if more cases are expected to support it.
+    bool expect_flush = (num_channels & 1) && !lossless;
+    size_t xsize, ysize;
+    if (single_group) {
+      // An image smaller than 256x256 ensures it contains only 1 group.
+      xsize = 99;
+      ysize = 100;
+    } else {
+      xsize = 277;
+      ysize = 280;
+    }
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    jxl::ColorEncoding color_encoding = jxl::ColorEncoding::SRGB(false);
+    jxl::CodecInOut io;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        color_encoding,
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &io.Main()));
+    jxl::TestCodestreamParams params;  // NOTE: shadows the int test parameter
+    if (lossless) {
+      params.cparams.SetLossless();
+    } else {
+      params.cparams.butteraugli_distance = 0.5f;
+    }
+    jxl::PassDefinition passes[] = {
+        {2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 1, 2}, {8, 0, 1}};
+    const int kNumPasses = 5;
+    jxl::ProgressiveMode progressive_mode{passes};
+    params.progressive_mode = &progressive_mode;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        num_channels, params);
+    // Feed the stream one byte at a time, then all at once.
+    for (size_t increment : {(size_t)1, data.size()}) {
+      printf(
+          "Testing with single_group=%d, lossless=%d, "
+          "num_channels=%d, prog_detail=%d, increment=%d\n",
+          single_group, lossless, (int)num_channels, (int)prog_detail,
+          (int)increment);
+      std::vector<std::vector<uint8_t>> passes(kNumPasses + 1);
+      for (int i = 0; i <= kNumPasses; ++i) {
+        passes[i].resize(pixels.size());
+      }
+
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(
+                    dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
+                             JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION));
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kFrames));
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSetProgressiveDetail(dec, kDCProgressive));
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kDCGroups));
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kGroups));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetProgressiveDetail(dec, prog_detail));
+
+      uint8_t* next_in = data.data();
+      size_t avail_in = 0;
+      size_t pos = 0;
+      // Runs the decoder, adding `increment` bytes whenever it wants more.
+      auto process_input = [&]() {
+        for (;;) {
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetInput(dec, next_in, avail_in));
+          JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+          size_t remaining = JxlDecoderReleaseInput(dec);
+          EXPECT_LE(remaining, avail_in);
+          next_in += avail_in - remaining;
+          avail_in = remaining;
+          if (status == JXL_DEC_NEED_MORE_INPUT && pos < data.size()) {
+            size_t chunk = std::min<size_t>(increment, data.size() - pos);
+            pos += chunk;
+            avail_in += chunk;
+            continue;
+          }
+          return status;
+        }
+      };
+
+      EXPECT_EQ(JXL_DEC_BASIC_INFO, process_input());
+      JxlBasicInfo info;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+      EXPECT_EQ(info.xsize, xsize);
+      EXPECT_EQ(info.ysize, ysize);
+
+      EXPECT_EQ(JXL_DEC_FRAME, process_input());
+
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      EXPECT_EQ(pixels.size(), buffer_size);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                     dec, &format, passes[kNumPasses].data(),
+                                     passes[kNumPasses].size()));
+      // Step to the next pass at which a progression event is expected.
+      auto next_pass = [&](int pass) {
+        if (prog_detail <= kDC) return kNumPasses;
+        if (prog_detail <= kLastPasses) {
+          return std::min(pass + 2, kNumPasses);
+        }
+        return pass + 1;
+      };
+
+      if (expect_flush) {
+        // Return a particular downsampling ratio only after the last
+        // pass for that downsampling was processed.
+        int expected_downsampling_ratios[] = {8, 8, 4, 4, 2};
+        for (int p = 0; p < kNumPasses; p = next_pass(p)) {
+          EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, process_input());
+          EXPECT_EQ(expected_downsampling_ratios[p],
+                    JxlDecoderGetIntendedDownsamplingRatio(dec));
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+          passes[p] = passes[kNumPasses];
+        }
+      }
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, process_input());
+      EXPECT_EQ(JXL_DEC_SUCCESS, process_input());
+
+      JxlDecoderDestroy(dec);
+      // Without flushes no intermediate images were captured to score.
+      if (!expect_flush) {
+        continue;
+      }
+      jxl::ButteraugliParams ba;
+      std::vector<float> distances(kNumPasses + 1);
+      for (int p = 0;; p = next_pass(p)) {
+        jxl::CodecInOut io1;
+        EXPECT_TRUE(jxl::ConvertFromExternal(
+            jxl::Span<const uint8_t>(passes[p].data(), passes[p].size()), xsize,
+            ysize, color_encoding,
+            /*bits_per_sample=*/16, format,
+            /*pool=*/nullptr, &io1.Main()));
+        distances[p] = ButteraugliDistance(io.frames, io1.frames, ba,
+                                           jxl::GetJxlCms(), nullptr, nullptr);
+        if (p == kNumPasses) break;
+      }
+      const float kMaxDistance[kNumPasses + 1] = {30.0f, 20.0f, 10.0f,
+                                                  5.0f,  3.0f,  2.0f};
+      EXPECT_LT(distances[kNumPasses], kMaxDistance[kNumPasses]);
+      for (int p = 0; p < kNumPasses;) {
+        int next_p = next_pass(p);
+        EXPECT_LT(distances[p], kMaxDistance[p]);
+        // Verify that the returned pass image is actually not the
+        // same as the next pass image, by checking that it has a bit
+        // worse butteraugli score.
+        EXPECT_LT(distances[next_p] * 1.1f, distances[p]);
+        p = next_p;
+      }
+    }
+  }
+}
+
+// Reconstructs the JPEG from `container` and expects exactly `jpeg_bytes`.
+void VerifyJPEGReconstruction(const jxl::PaddedBytes& container,
+                              const jxl::PaddedBytes& jpeg_bytes) {
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec.get(), container.data(),
+                                                container.size()));
+  EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get()));
+  std::vector<uint8_t> jpeg_out(128);  // grown on demand in the loop below
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetJPEGBuffer(dec.get(), jpeg_out.data(),
+                                                     jpeg_out.size()));
+  JxlDecoderStatus process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+  while (process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+    // Double the buffer and resume writing after the bytes already produced.
+    size_t used = jpeg_out.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+    jpeg_out.resize(jpeg_out.size() * 2);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetJPEGBuffer(dec.get(), jpeg_out.data() + used,
+                                      jpeg_out.size() - used));
+    process_result = JxlDecoderProcessInput(dec.get());
+  }
+  ASSERT_EQ(JXL_DEC_FULL_IMAGE, process_result);
+  size_t used = jpeg_out.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+  ASSERT_EQ(used, jpeg_bytes.size());
+  EXPECT_EQ(0, memcmp(jpeg_out.data(), jpeg_bytes.data(), used));
+}
+
+#if JPEGXL_ENABLE_JPEG
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructTestCodestream)) {
+  size_t xsize = 123;
+  size_t ysize = 77;
+  size_t channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0);
+  jxl::PaddedBytes jpeg_codestream;  // JPEG bytes the decoder must reproduce
+  jxl::TestCodestreamParams params;
+  params.cparams.color_transform = jxl::ColorTransform::kNone;
+  params.box_format = kCSBF_Single;
+  params.jpeg_codestream = &jpeg_codestream;  // presumably an out-param
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      channels, params);
+  VerifyJPEGReconstruction(compressed, jpeg_codestream);
+}
+#endif  // JPEGXL_ENABLE_JPEG
+
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+  jxl::CodecInOut orig_io;  // decoded JPEG, re-encoded as a JXL frame below
+  ASSERT_TRUE(
+      jxl::jpeg::DecodeImageJPG(jxl::Span<const uint8_t>(orig), &orig_io));
+  orig_io.metadata.m.xyb_encoded = false;
+  jxl::BitWriter writer;
+  ASSERT_TRUE(WriteCodestreamHeaders(&orig_io.metadata, &writer, nullptr));
+  writer.ZeroPadToByte();
+  jxl::PassesEncoderState enc_state;
+  jxl::CompressParams cparams;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  ASSERT_TRUE(jxl::EncodeFrame(cparams, jxl::FrameInfo{}, &orig_io.metadata,
+                               orig_io.Main(), &enc_state, jxl::GetJxlCms(),
+                               /*pool=*/nullptr, &writer,
+                               /*aux_out=*/nullptr));
+  // Build the container by hand: header, 'jbrd' box, then 'jxlc' box.
+  jxl::PaddedBytes jpeg_data;
+  ASSERT_TRUE(
+      EncodeJPEGData(*orig_io.Main().jpeg_data.get(), &jpeg_data, cparams));
+  jxl::PaddedBytes container;
+  container.append(jxl::kContainerHeader,
+                   jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+  jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
+                       &container);
+  container.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+  jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, true, &container);
+  jxl::PaddedBytes codestream = std::move(writer).TakeBytes();
+  container.append(codestream.data(), codestream.data() + codestream.size());
+  VerifyJPEGReconstruction(container, orig);  // must reproduce the input JPEG
+}
+
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionMetadataTest)) {
+  const std::string jpeg_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jpg";
+  const std::string jxl_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jxl";
+  const jxl::PaddedBytes jpeg = jxl::test::ReadTestData(jpeg_path);
+  const jxl::PaddedBytes jxl = jxl::test::ReadTestData(jxl_path);
+  VerifyJPEGReconstruction(jxl, jpeg);  // stored .jxl must yield the .jpg
+}
+
+TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) {
+  size_t xsize = 80, ysize = 90;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  jxl::TestCodestreamParams params;
+  params.box_format = kCSBF_Multi_Other_Terminated;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+  StreamPositions streampos;
+  AnalyzeCodestream(compressed, &streampos);
+
+  // The non-essential final box size including 8-byte header
+  size_t final_box_size = unk3_box_size + 8;
+  size_t last_box_begin = compressed.size() - final_box_size;
+  // Verify that the test is indeed setup correctly to be at the beginning of
+  // the 'unkn' box header.
+  ASSERT_EQ(compressed[last_box_begin + 3], final_box_size);
+  ASSERT_EQ(compressed[last_box_begin + 4], 'u');
+  ASSERT_EQ(compressed[last_box_begin + 5], 'n');
+  ASSERT_EQ(compressed[last_box_begin + 6], 'k');
+  ASSERT_EQ(compressed[last_box_begin + 7], '3');
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data(), last_box_begin));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+  // The decoder returns success despite not having seen the final unknown box
+  // yet. This is because calling JxlDecoderCloseInput is not mandatory for
+  // backwards compatibility, so it doesn't know more bytes follow, the current
+  // bytes ended at a perfectly valid place.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  size_t remaining = JxlDecoderReleaseInput(dec);
+  // Since the test was set up to end exactly at the boundary of the final
+  // codestream box, and the decoder returned success, all bytes are expected to
+  // be consumed until the end of the frame header.
+  EXPECT_EQ(remaining, last_box_begin - streampos.frames[0].toc_end);
+
+  // Now set the remaining non-codestream box as input.
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data() + last_box_begin,
+                               compressed.size() - last_box_begin));
+  // Even though JxlDecoderProcessInput already returned JXL_DEC_SUCCESS before,
+  // when calling it again now after setting more input, success is expected, no
+  // event occurs but the box has been successfully skipped.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+namespace {
+bool BoxTypeEquals(const std::string& type_string, JxlBoxType type) {
+  return type_string.size() == 4 && type_string[0] == type[0] &&
+         type_string[1] == type[1] && type_string[2] == type[2] &&
+         type_string[3] == type[3];
+}
+}  // namespace
+
+TEST(DecodeTest, ExtentedBoxSizeTest) {
+  const std::string jxl_path = "jxl/boxes/square-extended-size-container.jxl";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jxl_path);
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+
+  JxlBoxType type;
+  uint64_t box_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, orig.data(), orig.size()));
+  EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_TRUE(BoxTypeEquals("JXL ", type));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+  EXPECT_EQ(12, box_size);
+  EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_TRUE(BoxTypeEquals("ftyp", type));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+  EXPECT_EQ(20, box_size);
+  EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_TRUE(BoxTypeEquals("jxlc", type));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+  EXPECT_EQ(72, box_size);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, JXL_BOXES_TEST(BoxTest)) {
+  size_t xsize = 1, ysize = 1;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  jxl::TestCodestreamParams params;
+  params.box_format = kCSBF_Multi_Other_Terminated;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+
+  std::vector<std::string> expected_box_types = {
+      "JXL ", "ftyp", "jxlp", "unk1", "unk2", "jxlp", "jxlp", "jxlp", "unk3"};
+
+  // Value 0 means to not test the size: codestream is not required to be a
+  // particular exact size.
+  std::vector<size_t> expected_box_sizes = {12, 20, 0, 34, 18, 0, 0, 0, 20};
+
+  JxlBoxType type;
+  uint64_t box_size;
+  std::vector<uint8_t> contents(50);
+  size_t expected_release_size = 0;
+
+  // Cannot get these when decoding didn't start yet
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+
+  uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+  for (size_t i = 0; i < expected_box_types.size(); i++) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+    EXPECT_TRUE(BoxTypeEquals(expected_box_types[i], type));
+    if (expected_box_sizes[i]) {
+      EXPECT_EQ(expected_box_sizes[i], box_size);
+    }
+
+    if (expected_release_size > 0) {
+      EXPECT_EQ(expected_release_size, JxlDecoderReleaseBoxBuffer(dec));
+      expected_release_size = 0;
+    }
+
+    if (type[0] == 'u' && type[1] == 'n' && type[2] == 'k') {
+      JxlDecoderSetBoxBuffer(dec, contents.data(), contents.size());
+      size_t expected_box_contents_size =
+          type[3] == '1' ? unk1_box_size
+                         : (type[3] == '2' ? unk2_box_size : unk3_box_size);
+      expected_release_size = contents.size() - expected_box_contents_size;
+    }
+    size_t consumed = avail_in - JxlDecoderReleaseInput(dec);
+    next_in += consumed;
+    avail_in -= consumed;
+  }
+
+  // After the last DEC_BOX event, check that the input position is exactly at
+  // the start of the box header.
+  EXPECT_EQ(avail_in, expected_box_sizes.back());
+
+  // Even though all input is given, the decoder cannot assume there aren't
+  // more boxes if the input was not closed.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+  JxlDecoderCloseInput(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, JXL_BOXES_TEST(ExifBrobBoxTest)) {
+  size_t xsize = 1, ysize = 1;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.box_format = kCSBF_Brob_Exif;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  // Test raw brob box, not brotli-decompressing
+  for (int streaming = 0; streaming < 2; ++streaming) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+    if (!streaming) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+      JxlDecoderCloseInput(dec);
+    }
+    // for streaming input case
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = 0;
+    size_t total_in = 0;
+    size_t step_size = 64;
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+    bool seen_brob_begin = false;
+    bool seen_brob_end = false;
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        if (streaming) {
+          size_t remaining = JxlDecoderReleaseInput(dec);
+          EXPECT_LE(remaining, avail_in);
+          next_in += avail_in - remaining;
+          avail_in = remaining;
+          size_t amount = step_size;
+          if (total_in + amount > compressed.size()) {
+            amount = compressed.size() - total_in;
+          }
+          avail_in += amount;
+          total_in += amount;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetInput(dec, next_in, avail_in));
+          if (total_in == compressed.size()) JxlDecoderCloseInput(dec);
+        } else {
+          FAIL();
+          break;
+        }
+      } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) {
+        if (!box_buffer.empty()) {
+          EXPECT_EQ(false, seen_brob_end);
+          seen_brob_end = true;
+          size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+          box_num_output = box_buffer.size() - remaining;
+          EXPECT_EQ(box_num_output, box_brob_exif_size - 8);
+          EXPECT_EQ(
+              0, memcmp(box_buffer.data(), box_brob_exif + 8, box_num_output));
+          box_buffer.clear();
+        }
+        if (status == JXL_DEC_SUCCESS) break;
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+        if (BoxTypeEquals("brob", type)) {
+          EXPECT_EQ(false, seen_brob_begin);
+          seen_brob_begin = true;
+          box_buffer.resize(8);
+          JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size());
+        }
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    EXPECT_EQ(true, seen_brob_begin);
+    EXPECT_EQ(true, seen_brob_end);
+
+    JxlDecoderDestroy(dec);
+  }
+
+  // Test decompressed brob box
+  for (int streaming = 0; streaming < 2; ++streaming) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+    if (!streaming) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+      JxlDecoderCloseInput(dec);
+    }
+    // for streaming input case
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = 0;
+    size_t total_in = 0;
+    size_t step_size = 64;
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+    bool seen_exif_begin = false;
+    bool seen_exif_end = false;
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetDecompressBoxes(dec, JXL_TRUE));
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        if (streaming) {
+          size_t remaining = JxlDecoderReleaseInput(dec);
+          EXPECT_LE(remaining, avail_in);
+          next_in += avail_in - remaining;
+          avail_in = remaining;
+          size_t amount = step_size;
+          if (total_in + amount > compressed.size()) {
+            amount = compressed.size() - total_in;
+          }
+          avail_in += amount;
+          total_in += amount;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetInput(dec, next_in, avail_in));
+          if (total_in == compressed.size()) JxlDecoderCloseInput(dec);
+        } else {
+          FAIL();
+          break;
+        }
+      } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) {
+        if (!box_buffer.empty()) {
+          EXPECT_EQ(false, seen_exif_end);
+          seen_exif_end = true;
+          size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+          box_num_output = box_buffer.size() - remaining;
+          // Expect that the output has the same size and contents as the
+          // uncompressed exif data. Only check contents if the sizes match to
+          // avoid comparing uninitialized memory in the test.
+          EXPECT_EQ(box_num_output, exif_uncompressed_size);
+          if (box_num_output == exif_uncompressed_size) {
+            EXPECT_EQ(0, memcmp(box_buffer.data(), exif_uncompressed,
+                                exif_uncompressed_size));
+          }
+          box_buffer.clear();
+        }
+        if (status == JXL_DEC_SUCCESS) break;
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_TRUE));
+        if (BoxTypeEquals("Exif", type)) {
+          EXPECT_EQ(false, seen_exif_begin);
+          seen_exif_begin = true;
+          box_buffer.resize(8);
+          JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size());
+        }
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    EXPECT_EQ(true, seen_exif_begin);
+    EXPECT_EQ(true, seen_exif_end);
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, JXL_BOXES_TEST(PartialCodestreamBoxTest)) {
+  size_t xsize = 23, ysize = 81;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Lossless to verify pixels exactly after roundtrip.
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.box_format = kCSBF_Multi;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  std::vector<uint8_t> extracted_codestream;
+
+  {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_BOX));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+    JxlDecoderCloseInput(dec);
+
+    size_t num_jxlp = 0;
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        FAIL();
+        break;
+      } else if (status == JXL_DEC_BASIC_INFO) {
+        JxlBasicInfo info;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+        EXPECT_EQ(info.xsize, xsize);
+        EXPECT_EQ(info.ysize, ysize);
+      } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format_orig, pixels2.data(),
+                                              pixels2.size()));
+      } else if (status == JXL_DEC_FULL_IMAGE) {
+        continue;
+      } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) {
+        if (!box_buffer.empty()) {
+          size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+          box_num_output = box_buffer.size() - remaining;
+          EXPECT_GE(box_num_output, 4);
+          // Do not insert the first 4 bytes, which are not part of the
+          // codestream, but the partial codestream box index
+          extracted_codestream.insert(extracted_codestream.end(),
+                                      box_buffer.begin() + 4,
+                                      box_buffer.begin() + box_num_output);
+          box_buffer.clear();
+        }
+        if (status == JXL_DEC_SUCCESS) break;
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+        if (BoxTypeEquals("jxlp", type)) {
+          num_jxlp++;
+          box_buffer.resize(8);
+          JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size());
+        }
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    // The test file created with kCSBF_Multi is expected to have 4 jxlp boxes.
+    EXPECT_EQ(4, num_jxlp);
+
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format_orig));
+
+    JxlDecoderDestroy(dec);
+  }
+
+  // Now test whether the codestream extracted from the jxlp boxes can itself
+  // also be decoded and gives the same pixels
+  {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_BOX));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, extracted_codestream.data(),
+                                 extracted_codestream.size()));
+    JxlDecoderCloseInput(dec);
+
+    size_t num_boxes = 0;
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        FAIL();
+        break;
+      } else if (status == JXL_DEC_BASIC_INFO) {
+        JxlBasicInfo info;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+        EXPECT_EQ(info.xsize, xsize);
+        EXPECT_EQ(info.ysize, ysize);
+      } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format_orig, pixels2.data(),
+                                              pixels2.size()));
+      } else if (status == JXL_DEC_FULL_IMAGE) {
+        continue;
+      } else if (status == JXL_DEC_BOX) {
+        num_boxes++;
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else if (status == JXL_DEC_SUCCESS) {
+        break;
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    EXPECT_EQ(0, num_boxes);  // The data does not use the container format.
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format_orig));
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, SpotColorTest) {
+  jxl::ThreadPool* pool = nullptr;
+  jxl::CodecInOut io;
+  size_t xsize = 55, ysize = 257;
+  io.metadata.m.color_encoding = jxl::ColorEncoding::LinearSRGB();
+  jxl::Image3F main(xsize, ysize);
+  jxl::ImageF spot(xsize, ysize);
+  jxl::ZeroFillImage(&main);
+  jxl::ZeroFillImage(&spot);
+
+  for (size_t y = 0; y < ysize; y++) {
+    float* JXL_RESTRICT rowm = main.PlaneRow(1, y);
+    float* JXL_RESTRICT rows = spot.Row(y);
+    for (size_t x = 0; x < xsize; x++) {
+      rowm[x] = (x + y) * (1.f / 255.f);
+      rows[x] = ((x ^ y) & 255) * (1.f / 255.f);
+    }
+  }
+  io.SetFromImage(std::move(main), jxl::ColorEncoding::LinearSRGB());
+  jxl::ExtraChannelInfo info;
+  info.bit_depth.bits_per_sample = 8;
+  info.dim_shift = 0;
+  info.type = jxl::ExtraChannel::kSpotColor;
+  info.spot_color[0] = 0.5f;
+  info.spot_color[1] = 0.2f;
+  info.spot_color[2] = 1.f;
+  info.spot_color[3] = 0.5f;
+
+  io.metadata.m.extra_channel_info.push_back(info);
+  std::vector<jxl::ImageF> ec;
+  ec.push_back(std::move(spot));
+  io.frames[0].SetExtraChannels(std::move(ec));
+
+  jxl::CompressParams cparams;
+  cparams.speed_tier = jxl::SpeedTier::kLightning;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+
+  jxl::PaddedBytes compressed;
+  std::unique_ptr<jxl::PassesEncoderState> enc_state =
+      jxl::make_unique<jxl::PassesEncoderState>();
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, enc_state.get(), &compressed,
+                              jxl::GetJxlCms(), nullptr, pool));
+
+  for (size_t render_spot = 0; render_spot < 2; render_spot++) {
+    JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE));
+    if (!render_spot) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetRenderSpotcolors(dec, JXL_FALSE));
+    }
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo binfo;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &binfo));
+    EXPECT_EQ(1u, binfo.num_extra_channels);
+    EXPECT_EQ(xsize, binfo.xsize);
+    EXPECT_EQ(ysize, binfo.ysize);
+
+    JxlExtraChannelInfo extra_info;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetExtraChannelInfo(dec, 0, &extra_info));
+    EXPECT_EQ((unsigned int)jxl::ExtraChannel::kSpotColor, extra_info.type);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+    size_t extra_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderExtraChannelBufferSize(dec, &format, &extra_size, 0));
+
+    std::vector<uint8_t> image(buffer_size);
+    std::vector<uint8_t> extra(extra_size);
+    size_t bytes_per_pixel = format.num_channels *
+                             jxl::test::GetDataBits(format.data_type) /
+                             jxl::kBitsPerByte;
+    size_t stride = bytes_per_pixel * binfo.xsize;
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, image.data(), image.size()));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetExtraChannelBuffer(dec, &format, extra.data(),
+                                              extra.size(), 0));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+    // After the full image was output, JxlDecoderProcessInput should return
+    // success to indicate all is done.
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+    JxlDecoderDestroy(dec);
+
+    for (size_t y = 0; y < ysize; y++) {
+      uint8_t* JXL_RESTRICT rowm = image.data() + stride * y;
+      uint8_t* JXL_RESTRICT rows = extra.data() + xsize * y;
+      for (size_t x = 0; x < xsize; x++) {
+        if (!render_spot) {
+          // if spot color isn't rendered, main image should be as we made it
+          // (red and blue are all zeroes)
+
+          EXPECT_EQ(rowm[x * 3 + 0], 0);
+          EXPECT_EQ(rowm[x * 3 + 1], (x + y > 255 ? 255 : x + y));
+          EXPECT_EQ(rowm[x * 3 + 2], 0);
+        }
+        if (render_spot) {
+          // if spot color is rendered, expect red and blue to look like the
+          // spot color channel
+          EXPECT_LT(abs(rowm[x * 3 + 0] - (rows[x] * 0.25f)), 1);
+          EXPECT_LT(abs(rowm[x * 3 + 2] - (rows[x] * 0.5f)), 1);
+        }
+        EXPECT_EQ(rows[x], ((x ^ y) & 255));
+      }
+    }
+  }
+}
+
+TEST(DecodeTest, CloseInput) {
+  std::vector<uint8_t> partial_file = {0xff};
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(),
+                                      JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec.get(), partial_file.data(),
+                                                partial_file.size()));
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec.get()));
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec.get()));
+  JxlDecoderCloseInput(dec.get());
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderProcessInput(dec.get()));
+}
diff --git a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
new file mode 100644
index 0000000000..aa57b2723f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
@@ -0,0 +1,169 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/decode_to_jpeg.h"
+
+namespace jxl {
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+JxlDecoderStatus JxlToJpegDecoder::Process(const uint8_t** next_in,
+                                           size_t* avail_in) {
+  if (!inside_box_) {
+    JXL_ABORT(
+        "processing of JPEG reconstruction data outside JPEG reconstruction "
+        "box");
+  }
+  Span<const uint8_t> to_decode;
+  if (box_until_eof_) {
+    // Until EOF means consume all data.
+    to_decode = Span<const uint8_t>(*next_in, *avail_in);
+    *next_in += *avail_in;
+    *avail_in = 0;
+  } else {
+    // Defined size means consume min(available, needed).
+    size_t avail_recon_in =
+        std::min<size_t>(*avail_in, box_size_ - buffer_.size());
+    to_decode = Span<const uint8_t>(*next_in, avail_recon_in);
+    *next_in += avail_recon_in;
+    *avail_in -= avail_recon_in;
+  }
+  bool old_data_exists = !buffer_.empty();
+  if (old_data_exists) {
+    // Append incoming data to buffer if we already had data in the buffer.
+    buffer_.insert(buffer_.end(), to_decode.data(),
+                   to_decode.data() + to_decode.size());
+    to_decode = Span<const uint8_t>(buffer_.data(), buffer_.size());
+  }
+  if (!box_until_eof_ && to_decode.size() > box_size_) {
+    JXL_ABORT("JPEG reconstruction data to decode larger than expected");
+  }
+  if (box_until_eof_ || to_decode.size() == box_size_) {
+    // If undefined size, or the right size, try to decode.
+    jpeg_data_ = make_unique<jpeg::JPEGData>();
+    const auto status = jpeg::DecodeJPEGData(to_decode, jpeg_data_.get());
+    if (status.IsFatalError()) return JXL_DEC_ERROR;
+    if (status) {
+      // Successful decoding, emit event after updating state to track that we
+      // are no longer parsing JPEG reconstruction data.
+      inside_box_ = false;
+      return JXL_DEC_JPEG_RECONSTRUCTION;
+    }
+    if (box_until_eof_) {
+      // Unsuccessful decoding and undefined size, assume incomplete data. Copy
+      // the data if we haven't already.
+      if (!old_data_exists) {
+        buffer_.insert(buffer_.end(), to_decode.data(),
+                       to_decode.data() + to_decode.size());
+      }
+    } else {
+      // Unsuccessful decoding of correct amount of data, assume error.
+      return JXL_DEC_ERROR;
+    }
+  } else {
+    // Not enough data, copy the data if we haven't already.
+    if (!old_data_exists) {
+      buffer_.insert(buffer_.end(), to_decode.data(),
+                     to_decode.data() + to_decode.size());
+    }
+  }
+  return JXL_DEC_NEED_MORE_INPUT;
+}
+
+size_t JxlToJpegDecoder::NumExifMarkers(const jpeg::JPEGData& jpeg_data) {
+  size_t num = 0;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) {
+      num++;
+    }
+  }
+  return num;
+}
+
+size_t JxlToJpegDecoder::NumXmpMarkers(const jpeg::JPEGData& jpeg_data) {
+  size_t num = 0;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) {
+      num++;
+    }
+  }
+  return num;
+}
+
+JxlDecoderStatus JxlToJpegDecoder::ExifBoxContentSize(
+    const jpeg::JPEGData& jpeg_data, size_t* size) {
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) {
+      if (jpeg_data.app_data[i].size() < 3 + sizeof(jpeg::kExifTag)) {
+        // too small for app marker header
+        return JXL_DEC_ERROR;
+      }
+      // The first 4 bytes are the TIFF header from the box contents, and are
+      // not included in the JPEG
+      *size = jpeg_data.app_data[i].size() + 4 - 3 - sizeof(jpeg::kExifTag);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+
+JxlDecoderStatus JxlToJpegDecoder::XmlBoxContentSize(
+    const jpeg::JPEGData& jpeg_data, size_t* size) {
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) {
+      if (jpeg_data.app_data[i].size() < 3 + sizeof(jpeg::kXMPTag)) {
+        // too small for app marker header
+        return JXL_DEC_ERROR;
+      }
+      *size = jpeg_data.app_data[i].size() - 3 - sizeof(jpeg::kXMPTag);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+
+JxlDecoderStatus JxlToJpegDecoder::SetExif(const uint8_t* data, size_t size,
+                                           jpeg::JPEGData* jpeg_data) {
+  for (size_t i = 0; i < jpeg_data->app_data.size(); ++i) {
+    if (jpeg_data->app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) {
+      // The box must hold the 4 skipped leading bytes, else `size - 4` wraps.
+      if (size < 4 || jpeg_data->app_data[i].size() !=
+                          size + 3 + sizeof(jpeg::kExifTag) - 4)
+        return JXL_DEC_ERROR;
+      // Marker byte; the second and third byte are already filled in correctly
+      jpeg_data->app_data[i][0] = 0xE1;
+      memcpy(jpeg_data->app_data[i].data() + 3, jpeg::kExifTag,
+             sizeof(jpeg::kExifTag));
+      // The first 4 bytes are the TIFF header from the box contents, and are
+      // not included in the JPEG
+      memcpy(jpeg_data->app_data[i].data() + 3 + sizeof(jpeg::kExifTag),
+             data + 4, size - 4);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+JxlDecoderStatus JxlToJpegDecoder::SetXmp(const uint8_t* data, size_t size,
+                                          jpeg::JPEGData* jpeg_data) {
+  for (size_t i = 0; i < jpeg_data->app_data.size(); ++i) {
+    if (jpeg_data->app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) {
+      if (jpeg_data->app_data[i].size() != size + 3 + sizeof(jpeg::kXMPTag))
+        return JXL_DEC_ERROR;
+      // The first 3 + sizeof(jpeg::kXMPTag) bytes are the JPEG marker header.
+      jpeg_data->app_data[i][0] = 0xE1;
+      // The second and third byte are already filled in correctly
+      memcpy(jpeg_data->app_data[i].data() + 3, jpeg::kXMPTag,
+             sizeof(jpeg::kXMPTag));
+      memcpy(jpeg_data->app_data[i].data() + 3 + sizeof(jpeg::kXMPTag), data,
+             size);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
new file mode 100644
index 0000000000..a64ace27a2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
@@ -0,0 +1,217 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DECODE_TO_JPEG_H_
+#define LIB_JXL_DECODE_TO_JPEG_H_
+
+// JPEG XL to JPEG bytes decoder logic. The JxlToJpegDecoder class keeps track
+// of the decoder state needed to parse the JPEG reconstruction box and provide
+// the reconstructed JPEG to the output buffer.
+
+#include <jxl/decode.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+namespace jxl {
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+class JxlToJpegDecoder {
+ public:
+  // Returns whether an output buffer is set.
+  bool IsOutputSet() const { return next_out_ != nullptr; }
+
+  // Returns whether the decoder is currently inside a JPEG reconstruction box.
+  bool IsParsingBox() const { return inside_box_; }
+
+  // Sets the output buffer used when producing JPEG output.
+  JxlDecoderStatus SetOutputBuffer(uint8_t* data, size_t size) {
+    if (next_out_) return JXL_DEC_ERROR;
+    next_out_ = data;
+    avail_size_ = size;
+    return JXL_DEC_SUCCESS;
+  }
+
+  // Releases the output buffer and returns how many of its bytes are unused.
+  size_t ReleaseOutputBuffer() {
+    size_t result = avail_size_;
+    next_out_ = nullptr;
+    avail_size_ = 0;
+    return result;
+  }
+
+  void StartBox(bool box_until_eof, size_t contents_size) {
+    // A new box implies that we clear the buffer.
+    buffer_.clear();
+    inside_box_ = true;
+    if (box_until_eof) {
+      box_until_eof_ = true;
+    } else {
+      box_size_ = contents_size;
+    }
+  }
+
+  // Consumes data from next_in/avail_in to reconstruct JPEG data.
+  // Uses box_size_, inside_box_ and box_until_eof_ to calculate how much to
+  // consume. Potentially stores unparsed data in buffer_.
+  // Potentially populates jpeg_data_. Potentially updates inside_box_.
+  // Returns JXL_DEC_JPEG_RECONSTRUCTION when finished, JXL_DEC_NEED_MORE_INPUT
+  // if more input is needed, JXL_DEC_ERROR on parsing error.
+  JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in);
+
+  // Returns non-owned copy of the JPEGData, only after Process finished and
+  // the JPEGData was not yet moved to an image bundle with
+  // SetImageBundleJpegData.
+  jpeg::JPEGData* GetJpegData() { return jpeg_data_.get(); }
+
+  // Returns how many exif or xmp app markers are present in the JPEG data. A
+  // return value higher than 1 would require multiple exif boxes or multiple
+  // xmp boxes in the container format, and this is not supported by the API and
+  // considered an error. May only be called after Process returned success.
+  static size_t NumExifMarkers(const jpeg::JPEGData& jpeg_data);
+  static size_t NumXmpMarkers(const jpeg::JPEGData& jpeg_data);
+
+  // Returns box content size for metadata, using the known data from the app
+  // markers.
+  static JxlDecoderStatus ExifBoxContentSize(const jpeg::JPEGData& jpeg_data,
+                                             size_t* size);
+  static JxlDecoderStatus XmlBoxContentSize(const jpeg::JPEGData& jpeg_data,
+                                            size_t* size);
+
+  // Returns JXL_DEC_ERROR if there is no exif/XMP marker or the data size
+  // does not match, or this function is called before Process returned
+  // success, JXL_DEC_SUCCESS otherwise. As input, provide the full box contents
+  // but not the box header. In case of exif, this includes the 4-byte TIFF
+  // header, even though it won't be copied into the JPEG.
+  static JxlDecoderStatus SetExif(const uint8_t* data, size_t size,
+                                  jpeg::JPEGData* jpeg_data);
+  static JxlDecoderStatus SetXmp(const uint8_t* data, size_t size,
+                                 jpeg::JPEGData* jpeg_data);
+
+  // Sets the JpegData of the ImageBundle passed if there is anything to set.
+  // Releases the JpegData from this decoder if set.
+  Status SetImageBundleJpegData(ImageBundle* ib) {
+    if (IsOutputSet() && jpeg_data_ != nullptr) {
+      if (!jpeg::SetJPEGDataFromICC(ib->metadata()->color_encoding.ICC(),
+                                    jpeg_data_.get())) {
+        return false;
+      }
+      ib->jpeg_data.reset(jpeg_data_.release());
+    }
+    return true;
+  }
+
+  JxlDecoderStatus WriteOutput(const jpeg::JPEGData& jpeg_data) {
+    // Write into temporaries so a failed write leaves the buffer state intact.
+    uint8_t* tmp_next_out = next_out_;
+    size_t tmp_avail_size = avail_size_;
+    auto write = [&tmp_next_out, &tmp_avail_size](const uint8_t* buf,
+                                                  size_t len) {
+      size_t to_write = std::min<size_t>(tmp_avail_size, len);
+      if (to_write != 0) memcpy(tmp_next_out, buf, to_write);
+      tmp_next_out += to_write;
+      tmp_avail_size -= to_write;
+      return to_write;
+    };
+    Status write_result = jpeg::WriteJpeg(jpeg_data, write);
+    if (!write_result) {
+      if (tmp_avail_size == 0) {
+        return JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+      }
+      return JXL_DEC_ERROR;
+    }
+    next_out_ = tmp_next_out;
+    avail_size_ = tmp_avail_size;
+    return JXL_DEC_SUCCESS;
+  }
+
+ private:
+  // Content of the most recently parsed JPEG reconstruction box if any.
+  std::vector<uint8_t> buffer_;
+
+  // Decoded content of the most recently parsed JPEG reconstruction box is
+  // stored here.
+  std::unique_ptr<jpeg::JPEGData> jpeg_data_;
+
+  // True if the decoder is currently reading bytes inside a JPEG reconstruction
+  // box.
+  bool inside_box_ = false;
+
+  // True if the JPEG reconstruction box had undefined size (all remaining
+  // bytes).
+  bool box_until_eof_ = false;
+  // Size of most recently parsed JPEG reconstruction box contents.
+  size_t box_size_ = 0;
+
+  // Next bytes to write JPEG reconstruction to.
+  uint8_t* next_out_ = nullptr;
+  // Available bytes to write JPEG reconstruction to.
+  size_t avail_size_ = 0;
+};
+
+#else
+
+// Fake class that disables support for decoding JPEG XL to JPEG.
+class JxlToJpegDecoder {
+ public:
+  bool IsOutputSet() const { return false; }
+  bool IsParsingBox() const { return false; }
+
+  JxlDecoderStatus SetOutputBuffer(uint8_t* /* data */, size_t /* size */) {
+    return JXL_DEC_ERROR;
+  }
+  size_t ReleaseOutputBuffer() { return 0; }
+
+  void StartBox(bool /* box_until_eof */, size_t /* contents_size */) {}
+
+  JxlDecoderStatus Process(const uint8_t** /*next_in*/, size_t* /*avail_in*/) {
+    return JXL_DEC_ERROR;
+  }
+  jpeg::JPEGData* GetJpegData() { return nullptr; }
+
+  Status SetImageBundleJpegData(ImageBundle* /* ib */) { return true; }
+
+  static size_t NumExifMarkers(const jpeg::JPEGData& /*jpeg_data*/) {
+    return 0;
+  }
+  static size_t NumXmpMarkers(const jpeg::JPEGData& /*jpeg_data*/) { return 0; }
+  static JxlDecoderStatus ExifBoxContentSize(
+      const jpeg::JPEGData& /*jpeg_data*/, size_t* /*size*/) {
+    return JXL_DEC_ERROR;  // Same return type as the real implementation.
+  }
+  static JxlDecoderStatus XmlBoxContentSize(
+      const jpeg::JPEGData& /*jpeg_data*/, size_t* /*size*/) {
+    return JXL_DEC_ERROR;  // Same return type as the real implementation.
+  }
+  static JxlDecoderStatus SetExif(const uint8_t* /*data*/, size_t /*size*/,
+                                  jpeg::JPEGData* /*jpeg_data*/) {
+    return JXL_DEC_ERROR;
+  }
+  static JxlDecoderStatus SetXmp(const uint8_t* /*data*/, size_t /*size*/,
+                                 jpeg::JPEGData* /*jpeg_data*/) {
+    return JXL_DEC_ERROR;
+  }
+
+  JxlDecoderStatus WriteOutput(const jpeg::JPEGData& /* jpeg_data */) {
+    return JXL_DEC_SUCCESS;
+  }
+};
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DECODE_TO_JPEG_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
new file mode 100644
index 0000000000..2b4d84196f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
@@ -0,0 +1,1168 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ac_strategy.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/fast_math-inl.h"
+
+// Some of the floating point constants in this file and in other
+// files in the libjxl project have been obtained using the
+// tools/optimizer/simplex_fork.py tool. It is a variation of
+// Nelder-Mead optimization, and we generally try to minimize
+// BPP * pnorm aggregate as reported by the benchmark_xl tool,
+// but occasionally the values are optimized by using additional
+// constraints such as maintaining a certain density, or ratio of
+// popularity of integral transforms. Jyrki visually reviews all
+// such changes and often makes manual changes to maintain good
+// visual quality to changes where butteraugli was not sufficiently
+// sensitive to some kind of degradation. Unfortunately image quality
+// is still more of an art than science.
+
+// This must come before the begin/end_target, but HWY_ONCE is only true
+// after that, so use an "include guard".
+#ifndef LIB_JXL_ENC_AC_STRATEGY_
+#define LIB_JXL_ENC_AC_STRATEGY_
+// Parameters of the heuristic are marked with an OPTIMIZE comment.
+namespace jxl {
+
+// Debugging utilities.
+
+// Returns a linear sRGB color (3 bytes) for the given raw AC strategy.
+const uint8_t* TypeColor(const uint8_t& raw_strategy) {
+  JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
+  static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors");
+  static constexpr uint8_t kColors[][3] = {
+      {0xFF, 0xFF, 0x00},  // DCT8
+      {0xFF, 0x80, 0x80},  // IDENTITY (HORNUSS)
+      {0xFF, 0x80, 0x80},  // DCT2x2
+      {0xFF, 0x80, 0x80},  // DCT4x4
+      {0x80, 0xFF, 0x00},  // DCT16x16
+      {0x00, 0xC0, 0x00},  // DCT32x32
+      {0xC0, 0xFF, 0x00},  // DCT16x8
+      {0xC0, 0xFF, 0x00},  // DCT8x16
+      {0x00, 0xFF, 0x00},  // DCT32x8
+      {0x00, 0xFF, 0x00},  // DCT8x32
+      {0x00, 0xFF, 0x00},  // DCT32x16
+      {0x00, 0xFF, 0x00},  // DCT16x32
+      {0xFF, 0x80, 0x00},  // DCT4x8
+      {0xFF, 0x80, 0x00},  // DCT8x4
+      {0xFF, 0xFF, 0x80},  // AFV0
+      {0xFF, 0xFF, 0x80},  // AFV1
+      {0xFF, 0xFF, 0x80},  // AFV2
+      {0xFF, 0xFF, 0x80},  // AFV3
+      {0x00, 0xC0, 0xFF},  // DCT64x64
+      {0x00, 0xFF, 0xFF},  // DCT64x32
+      {0x00, 0xFF, 0xFF},  // DCT32x64
+      {0x00, 0x40, 0xFF},  // DCT128x128
+      {0x00, 0x80, 0xFF},  // DCT128x64
+      {0x00, 0x80, 0xFF},  // DCT64x128
+      {0x00, 0x00, 0xC0},  // DCT256x256
+      {0x00, 0x00, 0xFF},  // DCT256x128
+      {0x00, 0x00, 0xFF},  // DCT128x256
+  };
+  return kColors[raw_strategy];
+}
+
+const uint8_t* TypeMask(const uint8_t& raw_strategy) {
+  JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
+  static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks");
+  // Unlisted rows (past AFV3) are all zero. Block edges are drawn separately.
+  static constexpr uint8_t kMask[AcStrategy::kNumValidStrategies][64] = {
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // DCT8
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 1, 1, 1, 1, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // HORNUSS
+      {
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+      },                           // 2x2
+      {
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+      },                           // 4x4
+      {},                          // DCT16x16 (unused)
+      {},                          // DCT32x32 (unused)
+      {},                          // DCT16x8 (unused)
+      {},                          // DCT8x16 (unused)
+      {},                          // DCT32x8 (unused)
+      {},                          // DCT8x32 (unused)
+      {},                          // DCT32x16 (unused)
+      {},                          // DCT16x32 (unused)
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // DCT4x8
+      {
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+      },                           // DCT8x4
+      {
+          1, 1, 1, 1, 1, 0, 0, 0,  //
+          1, 1, 1, 1, 0, 0, 0, 0,  //
+          1, 1, 1, 0, 0, 0, 0, 0,  //
+          1, 1, 0, 0, 0, 0, 0, 0,  //
+          1, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // AFV0
+      {
+          0, 0, 0, 0, 1, 1, 1, 1,  //
+          0, 0, 0, 0, 0, 1, 1, 1,  //
+          0, 0, 0, 0, 0, 0, 1, 1,  //
+          0, 0, 0, 0, 0, 0, 0, 1,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // AFV1
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 1, 0, 0, 0, 0,  //
+      },                           // AFV2
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 1,  //
+          0, 0, 0, 0, 0, 0, 1, 1,  //
+          0, 0, 0, 0, 0, 1, 1, 1,  //
+      },                           // AFV3
+  };
+  return kMask[raw_strategy];
+}
+
+void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
+                    size_t ysize, const char* tag, AuxOut* aux_out) {
+  Image3F color_acs(xsize, ysize);
+  for (size_t y = 0; y < ysize; y++) {
+    float* JXL_RESTRICT rows[3] = {
+        color_acs.PlaneRow(0, y),
+        color_acs.PlaneRow(1, y),
+        color_acs.PlaneRow(2, y),
+    };
+    const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim);
+    for (size_t x = 0; x < xsize; x++) {
+      AcStrategy acs = acs_row[x / kBlockDim];
+      const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
+      for (size_t c = 0; c < 3; c++) {
+        rows[c][x] = color[c] / 255.f;
+      }
+    }
+  }
+  size_t stride = color_acs.PixelsPerRow();
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) {
+      float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim);
+      const AcStrategyRow acs_row = ac_strategy.ConstRow(by);
+      for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) {
+        AcStrategy acs = acs_row[bx];
+        if (!acs.IsFirstBlock()) continue;
+        const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
+        const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy());
+        if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) {
+          for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize;
+               iy++) {
+            for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize;
+                 ix++) {
+              if (mask[iy * kBlockDim + ix]) {
+                row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f;
+              }
+            }
+          }
+        }
+        // Draw the top and left edges of the transform, clipped to the image.
+        for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() &&
+                            bx * kBlockDim + ix < xsize;
+             ix++) {
+          row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f;
+        }
+        for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() &&
+                            by * kBlockDim + iy < ysize;
+             iy++) {
+          row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f;
+        }
+      }
+    }
+  }
+  aux_out->DumpImage(tag, color_acs);
+}
+
+}  // namespace jxl
+#endif  // LIB_JXL_ENC_AC_STRATEGY_
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Round;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+bool MultiBlockTransformCrossesHorizontalBoundary(
+    const AcStrategyImage& ac_strategy, size_t start_x, size_t y,
+    size_t end_x) {
+  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
+    return false;
+  }
+  if (y % 8 == 0) {
+    // Nothing crosses 64x64 boundaries, and the memory on the other side
+    // of the 64x64 block may still be uninitialized.
+    return false;
+  }
+  end_x = std::min(end_x, ac_strategy.xsize());
+  // The first multiblock might start before start_x, so move start_x back
+  // to the nearest IsFirstBlock() == true block, without leaving the
+  // current group of 8 blocks.
+  AcStrategyRow row = ac_strategy.ConstRow(y);
+  const size_t start_x_limit = start_x & ~7;
+  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
+    --start_x;
+  }
+  for (size_t x = start_x; x < end_x;) {
+    if (row[x].IsFirstBlock()) {
+      x += row[x].covered_blocks_x();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool MultiBlockTransformCrossesVerticalBoundary(
+    const AcStrategyImage& ac_strategy, size_t x, size_t start_y,
+    size_t end_y) {
+  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
+    return false;
+  }
+  if (x % 8 == 0) {
+    // Nothing crosses 64x64 boundaries, and the memory on the other side
+    // of the 64x64 block may still be uninitialized.
+    return false;
+  }
+  end_y = std::min(end_y, ac_strategy.ysize());
+  // The first multiblock might start before start_y, so move start_y back
+  // to the nearest IsFirstBlock() == true block, without leaving the
+  // current group of 8 blocks.
+  const size_t start_y_limit = start_y & ~7;
+  while (start_y != start_y_limit &&
+         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
+    --start_y;
+  }
+
+  for (size_t y = start_y; y < end_y;) {
+    AcStrategyRow row = ac_strategy.ConstRow(y);
+    if (row[x].IsFirstBlock()) {
+      y += row[x].covered_blocks_y();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+static const float kChromaErrorWeight[AcStrategy::kNumValidStrategies] = {
+    0.95f,  // DCT = 0,
+    1.0f,   // IDENTITY = 1,
+    0.5f,   // DCT2X2 = 2,
+    1.0f,   // DCT4X4 = 3,
+    2.0f,   // DCT16X16 = 4,
+    2.0f,   // DCT32X32 = 5,
+    1.4f,   // DCT16X8 = 6,
+    1.4f,   // DCT8X16 = 7,
+    2.0f,   // DCT32X8 = 8,
+    2.0f,   // DCT8X32 = 9,
+    2.0f,   // DCT32X16 = 10,
+    2.0f,   // DCT16X32 = 11,
+    2.0f,   // DCT4X8 = 12,
+    2.0f,   // DCT8X4 = 13,
+    1.7f,   // AFV0 = 14,
+    1.7f,   // AFV1 = 15,
+    1.7f,   // AFV2 = 16,
+    1.7f,   // AFV3 = 17,
+    2.0f,   // DCT64X64 = 18,
+    2.0f,   // DCT64X32 = 19,
+    2.0f,   // DCT32X64 = 20,
+    2.0f,   // DCT128X128 = 21,
+    2.0f,   // DCT128X64 = 22,
+    2.0f,   // DCT64X128 = 23,
+    2.0f,   // DCT256X256 = 24,
+    2.0f,   // DCT256X128 = 25,
+    2.0f,   // DCT128X256 = 26,
+};  // Indexed by AcStrategy::RawStrategy(); weights channel 0's entropy.
+
+float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
+                      const ACSConfig& config,
+                      const float* JXL_RESTRICT cmap_factors, float* block,
+                      float* scratch_space, uint32_t* quantized) {
+  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
+
+  // Apply the transform to each of the 3 channels; outputs are `size` apart.
+  for (size_t c = 0; c < 3; c++) {
+    float* JXL_RESTRICT block_c = block + size * c;
+    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
+                        config.src_stride, block_c, scratch_space);
+  }
+
+  HWY_FULL(float) df;
+
+  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
+  // avoid large blocks when there is a lot going on in red-green.
+  float cmul[3] = {kChromaErrorWeight[acs.RawStrategy()], 1.0f, 1.0f};
+  float quant_norm8 = 0;
+  float masking = 0;
+  if (num_blocks == 1) {
+    // When it is only one 8x8, we don't need aggregation of values.
+    quant_norm8 = config.Quant(x / 8, y / 8);
+    masking = 2.0f * config.Masking(x / 8, y / 8);
+  } else if (num_blocks == 2) {
+    // Taking max instead of 8th norm seems to work
+    // better for smallest blocks up to 16x8. Jyrki couldn't get
+    // improvements in trying the same for 16x16 blocks.
+    if (acs.covered_blocks_y() == 2) {
+      quant_norm8 =
+          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
+      masking = 2.0f * std::max(config.Masking(x / 8, y / 8),
+                                config.Masking(x / 8, y / 8 + 1));
+    } else {
+      quant_norm8 =
+          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
+      masking = 2.0f * std::max(config.Masking(x / 8, y / 8),
+                                config.Masking(x / 8 + 1, y / 8));
+    }
+  } else {
+    float masking_norm2 = 0;
+    float masking_max = 0;
+    // Load QF value, calculate empirical heuristic on masking field
+    // for weighting the information loss. Information loss manifests
+    // itself as ringing, and masking could hide it.
+    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+        float qval = config.Quant(x / 8 + ix, y / 8 + iy);
+        qval *= qval;
+        qval *= qval;
+        quant_norm8 += qval * qval;
+        float maskval = config.Masking(x / 8 + ix, y / 8 + iy);
+        masking_max = std::max<float>(masking_max, maskval);
+        masking_norm2 += maskval * maskval;
+      }
+    }
+    quant_norm8 /= num_blocks;
+    quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f);
+    masking_norm2 = sqrt(masking_norm2 / num_blocks);
+    // This is a highly empirical formula.
+    masking = (masking_norm2 + masking_max);
+  }
+  const auto q = Set(df, quant_norm8);
+
+  // Compute entropy.
+  float entropy = config.base_entropy;
+  auto info_loss = Zero(df);
+  auto info_loss2 = Zero(df);
+
+  for (size_t c = 0; c < 3; c++) {
+    const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c);
+    const auto cmap_factor = Set(df, cmap_factors[c]);
+
+    auto entropy_v = Zero(df);
+    auto nzeros_v = Zero(df);
+    auto cost1 = Set(df, config.cost1);
+    auto cost2 = Set(df, config.cost2);
+    auto cost_delta = Set(df, config.cost_delta);
+    for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
+      const auto in = Load(df, block + c * size + i);
+      const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
+      const auto im = Load(df, inv_matrix + i);
+      const auto val = Mul(Sub(in, in_y), Mul(im, q));
+      const auto rval = Round(val);
+      const auto diff = AbsDiff(val, rval);
+      info_loss = Add(info_loss, diff);
+      info_loss2 = MulAdd(diff, diff, info_loss2);
+      const auto q = Abs(rval);  // NOTE: shadows the outer `q` from here on.
+      const auto q_is_zero = Eq(q, Zero(df));
+      entropy_v = Add(entropy_v, IfThenElseZero(Ge(q, Set(df, 1.5f)), cost2));
+      // We used to have q * C here, but that cost model seems to
+      // be punishing large values more than necessary. Sqrt tries
+      // to avoid large values less aggressively. Having high accuracy
+      // around zero is most important at low qualities, and there
+      // we have directly specified costs for 0, 1, and 2.
+      entropy_v = MulAdd(Sqrt(q), cost_delta, entropy_v);
+      nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
+    }
+    entropy_v = MulAdd(nzeros_v, cost1, entropy_v);
+
+    entropy += cmul[c] * GetLane(SumOfLanes(df, entropy_v));
+    size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
+    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
+    // number of non-zeros of the block.
+    size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
+    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a
+    // bias.
+    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
+  }
+  float ret =
+      entropy +
+      masking *
+          ((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) +
+           (config.info_loss_multiplier2 *
+            sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2)))));
+  return ret;
+}
+
+uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
+                             const ACSConfig& config,
+                             const float* JXL_RESTRICT cmap_factors,
+                             AcStrategyImage* JXL_RESTRICT ac_strategy,
+                             float* block, float* scratch_space,
+                             uint32_t* quantized, float* entropy_out) {
+  struct TransformTry8x8 {
+    AcStrategy::Type type;
+    int encoding_speed_tier_max_limit;  // Skipped if the tier is higher.
+    float entropy_add;                  // Added after scaling the estimate.
+    float entropy_mul;                  // Scales the raw entropy estimate.
+  };
+  static const TransformTry8x8 kTransforms8x8[] = {
+      {
+          AcStrategy::Type::DCT,
+          9,
+          3.0f,
+          0.745f,
+      },
+      {
+          AcStrategy::Type::DCT4X4,
+          5,
+          4.0f,
+          0.7f,
+      },
+      {
+          AcStrategy::Type::DCT2X2,
+          5,
+          0.0f,
+          0.66f,
+      },
+      {
+          AcStrategy::Type::DCT4X8,
+          4,
+          0.0f,
+          0.700754622182473063f,
+      },
+      {
+          AcStrategy::Type::DCT8X4,
+          4,
+          0.0f,
+          0.700754622182473063f,
+      },
+      {
+          AcStrategy::Type::IDENTITY,
+          5,
+          8.0f,
+          0.81217614513585534f,
+      },
+      {
+          AcStrategy::Type::AFV0,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV1,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV2,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV3,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+  };
+  double best = 1e30;
+  uint8_t best_tx = kTransforms8x8[0].type;
+  for (auto tx : kTransforms8x8) {
+    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
+      continue;
+    }
+    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
+    float entropy = EstimateEntropy(acs, x, y, config, cmap_factors, block,
+                                    scratch_space, quantized);
+    entropy = tx.entropy_add + tx.entropy_mul * entropy;
+    if (entropy < best) {
+      best_tx = tx.type;
+      best = entropy;
+    }
+  }
+  *entropy_out = best;
+  return best_tx;
+}
+
+// bx, by: position of the 64x64 block, in units of 8x8 blocks.
+// cx, cy: position of the candidate transform's top-left 8x8 block within
+// that 64x64 block.
+void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx,
+                 size_t cy, const ACSConfig& config,
+                 const float* JXL_RESTRICT cmap_factors,
+                 AcStrategyImage* JXL_RESTRICT ac_strategy,
+                 const float entropy_mul, const uint8_t candidate_priority,
+                 uint8_t* priority, float* JXL_RESTRICT entropy_estimate,
+                 float* block, float* scratch_space, uint32_t* quantized) {
+  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
+  float entropy_current = 0;
+  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
+    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
+      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
+        // Transform would reuse already allocated blocks and
+        // lead to invalid overlaps, for example DCT64X32 vs.
+        // DCT32X64.
+        return;
+      }
+      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
+    }
+  }
+  float entropy_candidate =
+      entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config,
+                                    cmap_factors, block, scratch_space,
+                                    quantized);
+  if (entropy_candidate >= entropy_current) return;
+  // Accept the candidate.
+  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
+      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
+    }
+  }
+  ac_strategy->Set(bx + cx, by + cy, acs_raw);
+  entropy_estimate[cy * 8 + cx] = entropy_candidate;  // Top-left cell only.
+}
+
+static void SetEntropyForTransform(size_t cx, size_t cy,
+                                   const AcStrategy::Type acs_raw,
+                                   float entropy,
+                                   float* JXL_RESTRICT entropy_estimate) {
+  // Clear all covered cells; only the top-left one carries the estimate.
+  const AcStrategy strategy = AcStrategy::FromRawStrategy(acs_raw);
+  for (size_t row = 0; row < strategy.covered_blocks_y(); ++row) {
+    float* cells = entropy_estimate + (cy + row) * 8 + cx;
+    std::fill(cells, cells + strategy.covered_blocks_x(), 0.0f);
+  }
+  entropy_estimate[cy * 8 + cx] = entropy;
+}
+
+AcStrategy::Type AcsSquare(size_t blocks) {
+  // Square DCT whose side spans `blocks` 8x8 blocks: 2 -> 16x16, 4 -> 32x32.
+  // Every other value selects 64x64.
+  switch (blocks) {
+    case 2: return AcStrategy::Type::DCT16X16;
+    case 4: return AcStrategy::Type::DCT32X32;
+    default: return AcStrategy::Type::DCT64X64;
+  }
+}
+
+AcStrategy::Type AcsVerticalSplit(size_t blocks) {
+  // Half-square transform for a `blocks`-sized square: 2 -> DCT16X8,
+  // 4 -> DCT32X16. Every other value selects DCT64X32.
+  switch (blocks) {
+    case 2: return AcStrategy::Type::DCT16X8;
+    case 4: return AcStrategy::Type::DCT32X16;
+    default: return AcStrategy::Type::DCT64X32;
+  }
+}
+
+AcStrategy::Type AcsHorizontalSplit(size_t blocks) {
+  // Transform used for the top and the bottom half of a square that is
+  // `blocks` blocks tall; values other than 2 or 4 select DCT32X64.
+  switch (blocks) {
+    case 2: return AcStrategy::Type::DCT8X16;
+    case 4: return AcStrategy::Type::DCT16X32;
+    default: return AcStrategy::Type::DCT32X64;
+  }
+}
+
+// Tries to replace the transforms currently covering a blocks x blocks
+// square (a block is 8x8 pixels) with either one square transform or with
+// the two halves of a single middle division (left/right or top/bottom),
+// comparing both kinds of division against each other fairly.
+// `cx`/`cy` index the 8-wide `entropy_estimate` grid; `bx`/`by` are added
+// to them to address `ac_strategy` and the pixels passed to EstimateEntropy.
+void FindBestFirstLevelDivisionForSquare(
+    size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx,
+    size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors,
+    AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK,
+    const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate,
+    float* block, float* scratch_space, uint32_t* quantized) {
+  // We denote J for the larger dimension here, and K for the smaller.
+  // For example, for 32x32 block splitting, J would be 32, K 16.
+  const size_t blocks_half = blocks / 2;
+  const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks);
+  const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks);
+  const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks);
+  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
+  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
+  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
+  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
+  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
+  // Let's check if we can consider a JXJ block here at all.
+  // This is not necessary in the basic use of hierarchically merging
+  // blocks in the simplest possible way, but is needed when we try other
+  // 'floating' options of merging, possibly after a simple hierarchical
+  // merge has been explored.
+  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
+                                                   by + cy, bx + cx + blocks) ||
+      MultiBlockTransformCrossesHorizontalBoundary(
+          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
+      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
+                                                 by + cy + blocks) ||
+      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
+                                                 by + cy, by + cy + blocks)) {
+    return;  // not suitable for JxJ analysis, some transforms leak out.
+  }
+  // For floating transforms there may be
+  // already blocks selected that make either or both JXK and
+  // KXJ not feasible for this location.
+  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
+      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
+  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
+      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
+  // Current entropies summed per quadrant: entropy[row half][column half].
+  float entropy[2][2] = {};
+  for (size_t dy = 0; dy < blocks; ++dy) {
+    for (size_t dx = 0; dx < blocks; ++dx) {
+      entropy[dy / blocks_half][dx / blocks_half] +=
+          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
+    }
+  }
+  float entropy_JXK_left = std::numeric_limits<float>::max();
+  float entropy_JXK_right = std::numeric_limits<float>::max();
+  float entropy_KXJ_top = std::numeric_limits<float>::max();
+  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
+  float entropy_JXJ = std::numeric_limits<float>::max();
+  if (allow_JXK) {
+    if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) {
+      entropy_JXK_left =
+          entropy_mul_JXK *
+          EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+                          cmap_factors, block, scratch_space, quantized);
+    }
+    if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) {
+      entropy_JXK_right =
+          entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8,
+                                            (by + cy + 0) * 8, config,
+                                            cmap_factors, block, scratch_space,
+                                            quantized);
+    }
+  }
+  if (allow_KXJ) {
+    if (row0[bx + cx].RawStrategy() != acs_rawKXJ) {
+      entropy_KXJ_top =
+          entropy_mul_JXK *
+          EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+                          cmap_factors, block, scratch_space, quantized);
+    }
+    if (row1[bx + cx].RawStrategy() != acs_rawKXJ) {
+      entropy_KXJ_bottom =
+          entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8,
+                                            (by + cy + blocks_half) * 8, config,
+                                            cmap_factors, block, scratch_space,
+                                            quantized);
+    }
+  }
+  if (allow_square_transform) {
+    // We control the exploration of the square transform separately so that
+    // we can turn it off at high decoding speeds for 32x32, but still allow
+    // exploring 16x32 and 32x16.
+    entropy_JXJ = entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8,
+                                                    (by + cy + 0) * 8, config,
+                                                    cmap_factors, block,
+                                                    scratch_space, quantized);
+  }
+
+  // Test if this block should have JXK or KXJ transforms,
+  // because it can have only one or the other.
+  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
+                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
+  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
+                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
+  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
+    ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ);
+    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
+  } else if (costJxN < costNxJ) {
+    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
+      ac_strategy->Set(bx + cx, by + cy, acs_rawJXK);
+      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
+                             entropy_estimate);
+    }
+    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
+      ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK);
+      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
+                             entropy_JXK_right, entropy_estimate);
+    }
+  } else {
+    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
+      ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ);
+      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
+                             entropy_estimate);
+    }
+    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
+      ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ);
+      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
+                             entropy_KXJ_bottom, entropy_estimate);
+    }
+  }
+}
+
+void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
+                    const ACSConfig& config, const Rect& rect) {
+  // Main philosophy here:
+  // 1. First find best 8x8 transform for each area.
+  // 2. Merge them into larger transforms where possible, but
+  // starting from the smallest transforms (16x8 and 8x16).
+  // Additional complication: 16x8 and 8x16 are considered
+  // simultaneously and fairly against each other.
+  // We are looking at 64x64 squares since the YtoX and YtoB
+  // maps happen to be at that resolution, and having
+  // integral transforms cross these boundaries leads to
+  // additional complications.
+  const CompressParams& cparams = enc_state->cparams;
+  const float butteraugli_target = cparams.butteraugli_distance;
+  AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy;
+  size_t bx = rect.x0();
+  size_t by = rect.y0();
+  JXL_ASSERT(rect.xsize() <= 8);
+  JXL_ASSERT(rect.ysize() <= 8);
+  if (cparams.speed_tier > SpeedTier::kHare) return;  // before allocating
+  // TODO(veluca): reuse allocations
+  auto mem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  auto qmem = hwy::AllocateAligned<uint32_t>(AcStrategy::kMaxCoeffArea);
+  uint32_t* JXL_RESTRICT quantized = qmem.get();
+  float* JXL_RESTRICT block = mem.get();
+  float* JXL_RESTRICT scratch_space = mem.get() + 3 * AcStrategy::kMaxCoeffArea;
+  size_t tile_x = bx / kColorTileDimInBlocks;
+  size_t tile_y = by / kColorTileDimInBlocks;
+  const float cmap_factors[3] = {
+      enc_state->shared.cmap.YtoXRatio(
+          enc_state->shared.cmap.ytox_map.ConstRow(tile_y)[tile_x]),
+      0.0f,
+      enc_state->shared.cmap.YtoBRatio(
+          enc_state->shared.cmap.ytob_map.ConstRow(tile_y)[tile_x]),
+  };
+  // First compute the best 8x8 transform for each square. Later, we do not
+  // experiment with different combinations, but only use the best of the 8x8s
+  // when DCT8X8 is specified in the tree search.
+  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
+  float entropy_estimate[64] = {};
+  // Favor all 8x8 transforms (against 16x8 and larger transforms) at
+  // low butteraugli_target distances.
+  static const float k8x8mul1 = -0.55;
+  static const float k8x8mul2 = 1.0;
+  static const float k8x8base = 1.4;
+  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
+  for (size_t iy = 0; iy < rect.ysize(); iy++) {
+    for (size_t ix = 0; ix < rect.xsize(); ix++) {
+      float entropy = 0.0;
+      const uint8_t best_of_8x8s = FindBest8x8Transform(
+          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
+          config, cmap_factors, ac_strategy, block, scratch_space, quantized,
+          &entropy);
+      ac_strategy->Set(bx + ix, by + iy,
+                       static_cast<AcStrategy::Type>(best_of_8x8s));
+      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
+    }
+  }
+  // Merge when a larger transform is better than the previously
+  // searched best combination of 8x8 transforms.
+  struct MergeTry {
+    AcStrategy::Type type;
+    uint8_t priority;
+    uint8_t decoding_speed_tier_max_limit;
+    uint8_t encoding_speed_tier_max_limit;  // not read in this function
+    float entropy_mul;
+  };
+  static const float k8X16mul1 = -0.55;
+  static const float k8X16mul2 = 0.865;
+  static const float k8X16base = 1.6;
+  const float entropy_mul16X8 =
+      k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base);
+  //  const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f;
+
+  static const float k16X16mul1 = -0.35;
+  static const float k16X16mul2 = 0.798;
+  static const float k16X16base = 2.0;
+  const float entropy_mul16X16 =
+      k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base);
+  //  const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f;
+
+  static const float k32X16mul1 = -0.1;
+  static const float k32X16mul2 = 0.854;
+  static const float k32X16base = 2.5;
+  const float entropy_mul16X32 =
+      k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base);
+
+  const float entropy_mul32X32 = 0.93;
+  const float entropy_mul64X64 = 1.52f;
+  // TODO(jyrki): Consider this feedback in further changes:
+  // Also effectively when the multipliers for smaller blocks are
+  // below 1, this raises the bar for the bigger blocks even higher
+  // in that sense these constants are not independent (e.g. changing
+  // the constant for DCT16x32 by -5% (making it more likely) also
+  // means that DCT32x32 becomes harder to do when starting from
+  // two DCT16x32s). It might be better to make them more independent,
+  // e.g. by not applying the multiplier when storing the new entropy
+  // estimates in TryMergeToACSCandidate().
+  const MergeTry kTransformsForMerge[] = {  // bound deduced: no filler entries
+      {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8},
+      {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8},
+      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
+      // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16},
+      {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32},
+      {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32},
+      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
+      // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5,
+      // 0.9822994906548809f},
+      {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.29f},
+      {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.29f},
+      // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f},
+  };
+  /*
+  These sizes not yet included in merge heuristic:
+  set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f);
+  set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f);
+  set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f);
+  set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f);
+  set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f);
+  */
+
+  // Priority is a tricky kludge to avoid collisions so that transforms
+  // don't overlap.
+  uint8_t priority[64] = {};
+  bool enable_32x32 = cparams.decoding_speed_tier < 4;
+  for (const auto& tx : kTransformsForMerge) {
+    if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
+      continue;
+    }
+    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
+
+    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
+         cy += acs.covered_blocks_y()) {
+      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
+           cx += acs.covered_blocks_x()) {
+        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
+          if (cparams.decoding_speed_tier < 4 &&
+              tx.type == AcStrategy::Type::DCT32X64) {
+            // We handle both DCT32X64 and DCT64X32 at the same time.
+            if ((cy | cx) % 8 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                  tx.entropy_mul, entropy_mul64X64, entropy_estimate, block,
+                  scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT32X16) {
+            // DCT32X16 was already evaluated together with DCT16X32 by the
+            // blocks == 4 square search below (DCT16X32 comes first in
+            // kTransformsForMerge), so there is nothing left to do for
+            // this position.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
+            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
+          // already covered by the 32x32 square search
+          continue;
+        }
+
+        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
+          if (tx.type == AcStrategy::Type::DCT16X32) {
+            // We handle both DCT16X32 and DCT32X16 at the same time.
+            if ((cy | cx) % 4 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
+                  ac_strategy, tx.entropy_mul, entropy_mul32X32,
+                  entropy_estimate, block, scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT32X16) {
+            // We handled both DCT16X32 and DCT32X16 at the same time,
+            // and that is above. The last column and last row,
+            // when the last column or last row is odd numbered,
+            // are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
+            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
+          // Never taken: same condition as the check before the 32x32 search.
+          continue;
+        }
+        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
+          if (tx.type == AcStrategy::Type::DCT8X16) {
+            // We handle both DCT8X16 and DCT16X8 at the same time.
+            if ((cy | cx) % 2 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                  tx.entropy_mul, entropy_mul16X16, entropy_estimate, block,
+                  scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT16X8) {
+            // We handled both DCT8X16 and DCT16X8 at the same time,
+            // and that is above. The last column and last row,
+            // when the last column or last row is odd numbered,
+            // are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) ||
+            (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) {
+          // already covered by FindBestFirstLevelDivisionForSquare
+          continue;
+        }
+        // All other merge sizes are handled here.
+        // Some of the DCT16X8s and DCT8X16s will still leak through here
+        // when there is an odd number of 8x8 blocks, then the last row
+        // and column will get their DCT16X8s and DCT8X16s through the
+        // normal integral transform merging process.
+        TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                    tx.entropy_mul, tx.priority, &priority[0], entropy_estimate,
+                    block, scratch_space, quantized);
+      }
+    }
+  }
+  if (cparams.speed_tier >= SpeedTier::kHare) {
+    return;
+  }
+  // Here we still try to do some non-aligned matching, find a few more
+  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
+  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
+    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
+      if ((cy | cx) % 2 != 0) {
+        FindBestFirstLevelDivisionForSquare(
+            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
+            scratch_space, quantized);
+      }
+    }
+  }
+  // Non-aligned matching for 32X32, 16X32 and 32X16.
+  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
+  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
+    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
+      if ((cy | cx) % 4 == 0) {
+        continue;  // Already tried with loop above (DCT16X32 case).
+      }
+      FindBestFirstLevelDivisionForSquare(
+          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
+          scratch_space, quantized);
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ProcessRectACS);
+
+void AcStrategyHeuristics::Init(const Image3F& src,
+                                PassesEncoderState* enc_state) {
+  this->enc_state = enc_state;
+  config.dequant = &enc_state->shared.matrices;
+  const CompressParams& cparams = enc_state->cparams;
+  const float butteraugli_target = cparams.butteraugli_distance;
+
+  if (cparams.speed_tier >= SpeedTier::kCheetah) {
+    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1));  // DCT8 only
+  } else {
+    uint32_t acs_mask = 0;
+    // All transforms up to 64x64.
+    for (size_t i = 0; i < AcStrategy::DCT128X128; i++) {
+      acs_mask |= (uint32_t{1} << i);
+    }
+    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask));
+  }
+
+  // Image row pointers and strides.
+  config.quant_field_row = enc_state->initial_quant_field.Row(0);
+  config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();
+  auto& mask = enc_state->initial_quant_masking;
+  // Without a masking image, reset the fields rather than leave them unset.
+  const bool has_mask = mask.xsize() > 0 && mask.ysize() > 0;
+  config.masking_field_row = has_mask ? mask.Row(0) : nullptr;
+  config.masking_field_stride = has_mask ? mask.PixelsPerRow() : 0;
+
+  config.src_rows[0] = src.ConstPlaneRow(0, 0);
+  config.src_rows[1] = src.ConstPlaneRow(1, 0);
+  config.src_rows[2] = src.ConstPlaneRow(2, 0);
+  config.src_stride = src.PixelsPerRow();
+
+  // Entropy estimate is composed of two factors:
+  //  - estimate of the number of bits that will be used by the block
+  //  - information loss due to quantization
+  // The following constant controls the relative weights of these components.
+  config.info_loss_multiplier = 138.0f;
+  config.info_loss_multiplier2 = 50.46839691767866;
+  // TODO(jyrki): explore base_entropy setting more.
+  // A small value (0?) works better at high distance, while a larger value
+  // may be more effective at low distance/high bpp.
+  config.base_entropy = 0.0;
+  config.zeros_mul = 7.565053364251793f;
+  // Lots of +1 and -1 coefficients at high quality, it is
+  // beneficial to favor them. At low qualities zeros matter more
+  // and +1 / -1 coefficients are already quite harmful.
+  float slope = std::min<float>(1.0f, butteraugli_target * (1.0f / 3));
+  config.cost1 = 1 + slope * 8.8703248061477744f;
+  config.cost2 = 4.4628149885273363f;
+  config.cost_delta = 5.3359184934516337f;
+  JXL_ASSERT(enc_state->shared.ac_strategy.xsize() ==
+             enc_state->shared.frame_dim.xsize_blocks);
+  JXL_ASSERT(enc_state->shared.ac_strategy.ysize() ==
+             enc_state->shared.frame_dim.ysize_blocks);
+}
+
+void AcStrategyHeuristics::ProcessRect(const Rect& rect) {
+  PROFILER_FUNC;
+  // At speed tier kCheetah and faster the strategy search is skipped and
+  // the whole rect is filled with the plain 8x8 DCT.
+  if (enc_state->cparams.speed_tier >= SpeedTier::kCheetah) {
+    enc_state->shared.ac_strategy.FillDCT8(rect);
+    return;
+  }
+  // Otherwise run the dynamically dispatched per-rect search.
+  HWY_DYNAMIC_DISPATCH(ProcessRectACS)(enc_state, config, rect);
+}
+
+void AcStrategyHeuristics::Finalize(AuxOut* aux_out) {
+  using Type = AcStrategy::Type;
+  const auto& strategies = enc_state->shared.ac_strategy;
+  // Shorthand for strategies.CountBlocks(type).
+  const auto count = [&strategies](Type type) {
+    return strategies.CountBlocks(type);
+  };
+
+  // Accounting: only filled in when the caller passed an AuxOut.
+  if (aux_out != nullptr) {
+    aux_out->num_small_blocks = count(Type::IDENTITY) +
+                                count(Type::DCT2X2) +
+                                count(Type::DCT4X4);
+    // Rectangular sizes: both orientations are summed into one counter.
+    aux_out->num_dct4x8_blocks =
+        count(Type::DCT4X8) + count(Type::DCT8X4);
+    // AFV0..AFV3 are reported as a single total.
+    aux_out->num_afv_blocks = count(Type::AFV0) + count(Type::AFV1) +
+                              count(Type::AFV2) + count(Type::AFV3);
+    aux_out->num_dct8_blocks = count(Type::DCT);
+    aux_out->num_dct8x16_blocks =
+        count(Type::DCT8X16) + count(Type::DCT16X8);
+    aux_out->num_dct8x32_blocks =
+        count(Type::DCT8X32) + count(Type::DCT32X8);
+    aux_out->num_dct16_blocks = count(Type::DCT16X16);
+    aux_out->num_dct16x32_blocks =
+        count(Type::DCT16X32) + count(Type::DCT32X16);
+    aux_out->num_dct32_blocks = count(Type::DCT32X32);
+    aux_out->num_dct32x64_blocks =
+        count(Type::DCT32X64) + count(Type::DCT64X32);
+    aux_out->num_dct64_blocks = count(Type::DCT64X64);
+  }
+
+  // Debug output.
+  if (WantDebugOutput(aux_out)) {
+    DumpAcStrategy(strategies, enc_state->shared.frame_dim.xsize,
+                   enc_state->shared.frame_dim.ysize, "ac_strategy",
+                   aux_out);
+  }
+}
+
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
new file mode 100644
index 0000000000..1ce3442ccf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_AC_STRATEGY_H_
+#define LIB_JXL_ENC_AC_STRATEGY_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quant_weights.h"
+
+// `FindBestAcStrategy` uses heuristics to choose which AC strategy should be
+// used in each block, as well as the initial quantization field.
+
+namespace jxl {
+
+struct AuxOut;
+
+// AC strategy selection: utility struct.
+
+struct ACSConfig {
+  const DequantMatrices* JXL_RESTRICT dequant;
+  float info_loss_multiplier;
+  float info_loss_multiplier2;
+  float* JXL_RESTRICT quant_field_row;  // base row; indexed by Quant()
+  size_t quant_field_stride;  // in floats, not bytes
+  float* JXL_RESTRICT masking_field_row;  // base row; indexed by Masking()
+  size_t masking_field_stride;  // in floats, not bytes
+  const float* JXL_RESTRICT src_rows[3];  // one base row pointer per plane
+  size_t src_stride;  // in floats, not bytes
+  // Cost for 1 (-1), 2 (-2) explicitly, cost for others computed with cost1 +
+  // cost2 + sqrt(q) * cost_delta.
+  float cost1;
+  float cost2;
+  float cost_delta;
+  float base_entropy;
+  float zeros_mul;
+  const float& Pixel(size_t c, size_t x, size_t y) const {  // c: plane index
+    return src_rows[c][y * src_stride + x];
+  }
+  float Masking(size_t bx, size_t by) const {  // value is expected to be > 0
+    JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0);
+    return masking_field_row[by * masking_field_stride + bx];
+  }
+  float Quant(size_t bx, size_t by) const {  // value is expected to be > 0
+    JXL_DASSERT(quant_field_row[by * quant_field_stride + bx] > 0);
+    return quant_field_row[by * quant_field_stride + bx];
+  }
+};
+
+struct AcStrategyHeuristics {
+  void Init(const Image3F& src, PassesEncoderState* enc_state);
+  void ProcessRect(const Rect& rect);  // reads enc_state/config set by Init
+  void Finalize(AuxOut* aux_out);  // aux_out is null-checked before stats
+  ACSConfig config;  // filled in by Init
+  PassesEncoderState* enc_state;  // stored by Init; never freed by this struct
+};
+
+// Debug.
+void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
+                    size_t ysize, const char* tag, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_AC_STRATEGY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
new file mode 100644
index 0000000000..f54204b059
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
@@ -0,0 +1,1145 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_adaptive_quantization.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_group.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_group.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// The following functions modulate an exponent (out_val) and return the updated
+// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
+
+// Hack for mask estimation. Eventually replace this code with butteraugli's
+// masking.
+float ComputeMaskForAcStrategyUse(const float out_val) {
+  // Biased reciprocal: 1 / (out_val + 0.001).
+  constexpr float kNumerator = 1.0f;
+  return kNumerator / (out_val + 0.001f);
+}
+
+// Maps `out_val` to the initial exponent via a sum of three reciprocals.
+template <class D, class V>
+V ComputeMask(const D d, const V out_val) {
+  const auto base = Set(d, -0.76471879237038032f);
+  const auto mul4 = Set(d, 4.4585596705216615f);
+  const auto mul2 = Set(d, 17.282053892620215f);
+  const auto offset2 = Set(d, 302.36961315317848f);
+  const auto mul3 = Set(d, 7.0561261998705858f);
+  const auto offset3 = Set(d, 2.3179635626140773f);
+  const auto offset4 = Mul(Set(d, 0.25f), offset3);
+  const auto in_scale = Set(d, 0.80061762862741759f);
+  const auto one = Set(d, 1.0f);
+  // Scale the input and keep it at or above 1e-3.
+  const auto t = Max(Mul(out_val, in_scale), Set(d, 1e-3f));
+  const auto inv2 = Div(one, Add(t, offset2));
+  const auto inv3 = Div(one, MulAdd(t, t, offset3));
+  const auto inv4 = Div(one, MulAdd(t, t, offset4));
+  // TODO(jyrki):
+  // A log or two here could make sense. In butteraugli we have effectively
+  // log(log(x + C)) for this kind of use, as a single log is used in
+  // saturating visual masking and here the modulation values are exponential,
+  // another log would counter that.
+  return Add(base, MulAdd(mul4, inv4, MulAdd(mul2, inv2, Mul(mul3, inv3))));
+}
+
+// mul and mul2 represent a scaling difference between jxl and butteraugli.
+static const float kSGmul = 226.77216153508914f;
+static const float kSGmul2 = 1.0f / 73.377132366608819f;
+static const float kLog2 = 0.693147181f;  // ln(2), despite the name
+// Includes correction factor for std::log -> log2.
+static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
+static const float kSGVOffset = 7.7825991679894591f;
+
+template <bool invert, typename D, typename V>
+V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
+  // jxl's opsin space is the cubic root of photon counts (v * v * v relates
+  // to the number of photons); butteraugli's psychovisual space is
+  // SimpleGamma(v * v * v). The quotient built below lets quantization move
+  // from the former to the latter. Evaluates to num / den when `invert` is
+  // set and to den / num otherwise; negative inputs are treated as zero.
+  const float kEps = 1e-2;
+  const auto clamped = ZeroIfNegative(v);
+  const auto squared = Mul(clamped, clamped);
+  const auto num_scale = Set(d, kSGRetMul * 3 * kSGmul);
+  const auto den_scale = Set(d, kLog2 * kSGmul);
+  const auto den_offset = Set(d, kSGVOffset * kLog2 + kEps);
+
+  // num = num_scale * v^2 + eps;  den = (den_scale * v) * v^2 + den_offset.
+  const auto num = MulAdd(num_scale, squared, Set(d, kEps));
+  const auto den = MulAdd(Mul(den_scale, clamped), squared, den_offset);
+  if (invert) return Div(num, den);
+  return Div(den, num);
+}
+
+template <bool invert = false>
+static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
+  // Scalar convenience overload: runs the vector version on a single lane.
+  const HWY_CAPPED(float, 1) d1;
+  return GetLane(
+      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(d1, Load(d1, &v)));
+}
+
+// TODO(veluca): this function computes an approximation of the derivative of
+// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
+// exact derivatives. For reference, SimpleGamma was:
+/*
+template <typename D, typename V>
+V SimpleGamma(const D d, V v) {
+  // A simple HDR compatible gamma function.
+  const auto mul = Set(d, kSGmul);
+  const auto kRetMul = Set(d, kSGRetMul);
+  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
+  const auto kVOffset = Set(d, kSGVOffset);
+
+  v *= mul;
+
+  // This should happen rarely, but may lead to a NaN, which is rather
+  // undesirable. Since negative photons don't exist we solve the NaNs by
+  // clamping here.
+  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
+  v = ZeroIfNegative(v);
+  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
+}
+*/
+
+template <class D, class V>
+V GammaModulation(const D d, const size_t x, const size_t y,
+                  const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) {
+  // Averages the inverted gamma-derivative ratio of the signals Y + bias - X
+  // and Y + bias + X over the 8x8 block at (x, y), and adds a multiple of the
+  // log2 of that average to the exponent `out_val`.
+  const float kBias = 0.16f;
+  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[0]);
+  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[1]);
+  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[2]);
+  const auto bias_v = Set(d, kBias);
+  const auto one_half = Set(d, 0.5f);
+  auto ratio_sum = Zero(d);
+  for (size_t row = y; row < y + 8; ++row) {
+    const float* const JXL_RESTRICT px = xyb_x.Row(row) + x;
+    const float* const JXL_RESTRICT py = xyb_y.Row(row) + x;
+    for (size_t col = 0; col < 8; col += Lanes(d)) {
+      const auto y_biased = Add(Load(d, py + col), bias_v);
+      const auto x_val = Load(d, px + col);
+      const auto r = Sub(y_biased, x_val);
+      const auto g = Add(y_biased, x_val);
+      const auto ratio_r =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r);
+      const auto ratio_g =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g);
+      ratio_sum = Add(ratio_sum, Mul(one_half, Add(ratio_r, ratio_g)));
+    }
+  }
+  const auto mean_ratio = Mul(SumOfLanes(d, ratio_sum), Set(d, 1.0f / 64));
+  // Ideally -1.0; an optimal correction likely adds entropy, so use less.
+  // ln(2) is folded in because we want std::log but only have FastLog2f.
+  const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f);
+  return MulAdd(kGam, FastLog2f(d, mean_ratio), out_val);
+}
+
+// Adjusts the exponent `out_val` by how red or blue the 8x8 block at (x, y)
+// is; the adjustment scales with (1 - 0.15 * butteraugli_target).
+template <class D, class V>
+V ColorModulation(const D d, const size_t x, const size_t y,
+                  const ImageF& xyb_x, const ImageF& xyb_y, const ImageF& xyb_b,
+                  const double butteraugli_target, V out_val) {
+  static const float kStrengthMul = 4.2456542701250122f;
+  static const float kRedRampStart = 0.18748564245760829f;
+  static const float kRedRampLength = 0.16701783842516479f;
+  static const float kBlueRampLength = 0.16117602661852037f;
+  static const float kBlueRampStart = 0.47897504338287333f;
+  const float strength = kStrengthMul * (1.0f - 0.15f * butteraugli_target);
+  if (strength < 0) {
+    return out_val;
+  }
+  // x values are smaller than y and b values; red_strength compensates.
+  const float red_strength = strength * 6.0f;
+  const float blue_strength = strength;
+  {
+    // Reduce some bits from areas not blue or red.
+    const float offset = strength * -0.007;  // 9174542291185913f;
+    out_val = Add(out_val, Set(d, offset));
+  }
+  // Calculate how much of the 8x8 block is covered with blue or red.
+  auto blue_coverage = Zero(d);
+  auto red_coverage = Zero(d);
+  auto bias_y = Set(d, 0.2f);
+  auto bias_y_add = Set(d, 0.1f);
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy);
+    const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy);
+    const float* const JXL_RESTRICT row_in_b = xyb_b.Row(y + dy);
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto pixel_y = Load(d, row_in_y + x + dx);
+      // Estimate redness-greenness relative to the intensity.
+      const auto pixel_xpy = Div(Abs(Load(d, row_in_x + x + dx)),
+                                 Max(Add(bias_y_add, pixel_y), bias_y));
+      const auto pixel_x =
+          Max(Set(d, 0.0f), Sub(pixel_xpy, Set(d, kRedRampStart)));
+      const auto pixel_b =
+          Max(Set(d, 0.0f), Sub(Load(d, row_in_b + x + dx),
+                                Add(pixel_y, Set(d, kBlueRampStart))));
+      const auto blue_slope = Min(pixel_b, Set(d, kBlueRampLength));
+      const auto red_slope = Min(pixel_x, Set(d, kRedRampLength));
+      red_coverage = Add(red_coverage, red_slope);
+      blue_coverage = Add(blue_coverage, blue_slope);
+    }
+  }
+
+  // Saturate once the red or blue coverage exceeds a level: if a certain
+  // fraction of the block is red or blue, treat it as fully red or blue.
+  static const float ratio = 28.0f;  // out of 64 pixels.
+
+  auto overall_red_coverage = SumOfLanes(d, red_coverage);
+  overall_red_coverage =
+      Min(overall_red_coverage, Set(d, ratio * kRedRampLength));
+  overall_red_coverage =
+      Mul(overall_red_coverage, Set(d, red_strength / ratio));
+
+  auto overall_blue_coverage = SumOfLanes(d, blue_coverage);
+  overall_blue_coverage =
+      Min(overall_blue_coverage, Set(d, ratio * kBlueRampLength));
+  overall_blue_coverage =
+      Mul(overall_blue_coverage, Set(d, blue_strength / ratio));
+
+  return Add(overall_red_coverage, Add(overall_blue_coverage, out_val));
+}
+
+// Change precision in 8x8 blocks that have high frequency content, measured
+// as a clamped sum of absolute differences to the right and lower neighbours.
+template <class D, class V>
+V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
+               const V out_val) {
+  // Zero out the invalid differences for the rightmost value per row.
+  const Rebind<uint32_t, D> du;
+  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
+                                                        ~0u, ~0u, ~0u, 0};
+
+  auto sum = Zero(d);  // sum of absolute differences with right and below
+
+  static const float valmin = 0.52489909479039587f;  // per-difference cap
+  auto valminv = Set(d, valmin);
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x;
+    const float* JXL_RESTRICT row_in_next =
+        dy == 7 ? row_in : xyb.Row(y + dy + 1) + x;  // last row: itself
+
+    // SCALAR builds have no guaranteed row padding, so pixels outside the row
+    // must not be accessed. In SIMD modes rows are padded, so reading one
+    // garbage value past the row is safe; masking with kMaskRight then removes
+    // its influence.
+#if HWY_TARGET != HWY_SCALAR
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+#else
+    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
+#endif
+      const auto p = Load(d, row_in + dx);
+      const auto pr = LoadU(d, row_in + dx + 1);
+      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
+      sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr))));
+
+      const auto pd = Load(d, row_in_next + dx);
+      sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
+    }
+#if HWY_TARGET == HWY_SCALAR
+    const auto p = Load(d, row_in + 7);
+    const auto pd = Load(d, row_in_next + 7);
+    sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
+#endif
+  }
+  // more negative value gives more bpp
+  static const float kOffset = -2.6545897672771526;
+  static const float kMul = -0.049868161744916512;
+
+  sum = SumOfLanes(d, sum);
+  float scalar_sum = GetLane(sum);
+  static const float maxsum = 7.9076877647025947f;
+  static const float minsum = 0.53640540945659809f;
+  scalar_sum = std::min(maxsum, scalar_sum);
+  scalar_sum = std::max(minsum, scalar_sum);
+  scalar_sum += kOffset;
+  scalar_sum *= kMul;
+  return Add(Set(d, scalar_sum), out_val);
+}
+
+// Converts each entry of `out` inside `rect` (block units) into its final
+// quantization-field value: modulate the exponent, then exponentiate.
+void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
+                         const ImageF& xyb_y, const ImageF& xyb_b,
+                         const float scale, const Rect& rect, ImageF* out) {
+  // All three planes are read at identical pixel coordinates below.
+  JXL_ASSERT(SameSize(xyb_x, xyb_y) && SameSize(xyb_x, xyb_b));
+  JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize());
+  JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize());
+
+  const float base_level = 0.48f * scale;
+  const float kDampenRampStart = 2.0f;
+  const float kDampenRampEnd = 14.0f;
+  float dampen = 1.0f;  // ramps down to 0 as the target goes from 2 to 14
+  if (butteraugli_target >= kDampenRampStart) {
+    dampen = 1.0f - ((butteraugli_target - kDampenRampStart) /
+                     (kDampenRampEnd - kDampenRampStart));
+    if (dampen < 0) dampen = 0;
+  }
+  const float mul = scale * dampen;
+  const float add = (1.0f - dampen) * base_level;
+  for (size_t iy = rect.y0(); iy < rect.y0() + rect.ysize(); iy++) {
+    const size_t y = iy * 8;
+    float* const JXL_RESTRICT row_out = out->Row(iy);
+    const HWY_CAPPED(float, kBlockDim) df;
+    for (size_t ix = rect.x0(); ix < rect.x0() + rect.xsize(); ix++) {
+      const size_t x = ix * 8;
+      auto out_val = Set(df, row_out[ix]);
+      out_val = ComputeMask(df, out_val);
+      out_val = HfModulation(df, x, y, xyb_y, out_val);
+      out_val = ColorModulation(df, x, y, xyb_x, xyb_y, xyb_b,
+                                butteraugli_target, out_val);
+      out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val);
+      // Everything so far modulated the exponent; now make it multiplicative.
+      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
+    }
+  }
+}
+
+template <typename D, typename V>
+V MaskingSqrt(const D d, V v) {
+  // 0.25 * sqrt(v * sqrt(kScale * 1e8) + kBias), lane-wise.
+  static const float kBias = 27.97044946785558f;
+  static const float kScale = 211.53333281566171f;
+  const auto radicand = MulAdd(v, Sqrt(Set(d, kScale * 1e8)), Set(d, kBias));
+  return Mul(Set(d, 0.25f), Sqrt(radicand));
+}
+
+float MaskingSqrt(const float v) {
+  // Single-lane wrapper around the vector overload.
+  const HWY_CAPPED(float, 1) d1;
+  return GetLane(MaskingSqrt(d1, Load(d1, &v)));
+}
+
+void StoreMin4(const float v, float& min0, float& min1, float& min2,
+               float& min3) {
+  // Inserts `v` into the quadruple min0..min3 (expected in ascending order),
+  // discarding the previous min3. Does nothing unless v < min3.
+  if (!(v < min3)) return;
+
+  // First position whose current value exceeds `v`.
+  const int slot = v < min0 ? 0 : (v < min1 ? 1 : (v < min2 ? 2 : 3));
+
+  // Shift the entries at and after `slot` up by one, last one first.
+  if (slot <= 2) min3 = min2;
+  if (slot <= 1) min2 = min1;
+  if (slot == 0) min1 = min0;
+
+  // Drop `v` into the freed position.
+  if (slot == 0) min0 = v;
+  if (slot == 1) min1 = v;
+  if (slot == 2) min2 = v;
+  if (slot == 3) min3 = v;
+}
+
+// Fuzzy erosion: every sample of `from_rect` is mapped to a weighted sum of
+// the four smallest values in its 3x3 neighbourhood (clamped at the borders
+// of `from`), so smooth areas close by pull the result down. The four results
+// of each 2x2 group are added into one sample of `to_rect` (2x downsampling).
+void FuzzyErosion(const Rect& from_rect, const ImageF& from,
+                  const Rect& to_rect, ImageF* to) {
+  constexpr int kStep = 1;
+  static_assert(kStep == 1, "Step must be 1");
+  JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize());
+  JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize());
+
+  // Weights for the smallest, second, third and fourth smallest value.
+  static const float kWeights[4] = {0.125f, 0.075f, 0.06f, 0.05f};
+  const size_t width = from.xsize();
+  const size_t height = from.ysize();
+  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
+    const size_t y = from_rect.y0() + fy;
+    // Rows above and below; outside `from` they fall back to row y itself.
+    const float* row_mid = from.Row(y);
+    const float* row_up = from.Row(y >= kStep ? y - kStep : y);
+    const float* row_down = from.Row(y + kStep < height ? y + kStep : y);
+    float* row_out = to_rect.Row(to, fy / 2);
+    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
+      const size_t x = from_rect.x0() + fx;
+      // Left and right columns, clamped the same way as the rows.
+      const size_t xl = x >= kStep ? x - kStep : x;
+      const size_t xr = x + kStep < width ? x + kStep : x;
+      // Start from four of the nine samples and order them ascending.
+      float lo0 = row_mid[x];
+      float lo1 = row_mid[xl];
+      float lo2 = row_mid[xr];
+      float lo3 = row_up[xl];
+      if (lo0 > lo1) std::swap(lo0, lo1);
+      if (lo0 > lo2) std::swap(lo0, lo2);
+      if (lo0 > lo3) std::swap(lo0, lo3);
+      if (lo1 > lo2) std::swap(lo1, lo2);
+      if (lo1 > lo3) std::swap(lo1, lo3);
+      if (lo2 > lo3) std::swap(lo2, lo3);
+      // Merge in the remaining five samples of the 3x3 neighbourhood.
+      StoreMin4(row_up[x], lo0, lo1, lo2, lo3);
+      StoreMin4(row_up[xr], lo0, lo1, lo2, lo3);
+      StoreMin4(row_down[xl], lo0, lo1, lo2, lo3);
+      StoreMin4(row_down[x], lo0, lo1, lo2, lo3);
+      StoreMin4(row_down[xr], lo0, lo1, lo2, lo3);
+      const float v = kWeights[0] * lo0 + kWeights[1] * lo1 +
+                      kWeights[2] * lo2 + kWeights[3] * lo3;
+      // The top-left sample of a 2x2 group initialises the output sample;
+      // the other three accumulate into it.
+      const bool starts_group = ((fx | fy) & 1) == 0;
+      row_out[fx / 2] = starts_group ? v : row_out[fx / 2] + v;
+    }
+  }
+}
+
+// Computes the adaptive quantization map tile by tile; see ComputeTile().
+struct AdaptiveQuantizationImpl {
+  void Init(const Image3F& xyb) {
+    JXL_DASSERT(xyb.xsize() % kBlockDim == 0);
+    JXL_DASSERT(xyb.ysize() % kBlockDim == 0);
+    const size_t xsize = xyb.xsize();
+    const size_t ysize = xyb.ysize();
+    aq_map = ImageF(xsize / kBlockDim, ysize / kBlockDim);
+  }
+  void PrepareBuffers(size_t num_threads) {
+    diff_buffer = ImageF(kEncTileDim + 8, num_threads);  // one row per thread
+    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
+      pre_erosion.emplace_back(kEncTileDimInBlocks * 2 + 2,
+                               kEncTileDimInBlocks * 2 + 2);
+    }
+  }
+
+  void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
+                   const Rect& rect, const int thread, ImageF* mask) {
+    PROFILER_ZONE("aq DiffPrecompute");
+    const size_t xsize = xyb.xsize();
+    const size_t ysize = xyb.ysize();
+
+    // The XYB gamma is 3.0 so that decoding needs only two multiplications;
+    // butteraugli's gamma matches that of the human eye, around 2.6. We
+    // approximate the difference by adding one cubic root into the adaptive
+    // quantization, giving a total gamma of 2.6666 for quantization uses.
+    const float match_gamma_offset = 0.019;
+
+    const HWY_FULL(float) df;
+
+    size_t y_start = rect.y0() * 8;
+    size_t y_end = y_start + rect.ysize() * 8;
+
+    size_t x0 = rect.x0() * 8;
+    size_t x1 = x0 + rect.xsize() * 8;
+    if (x0 != 0) x0 -= 4;
+    if (x1 != xyb.xsize()) x1 += 4;
+    if (y_start != 0) y_start -= 4;
+    if (y_end != xyb.ysize()) y_end += 4;
+    pre_erosion[thread].ShrinkTo((x1 - x0) / 4, (y_end - y_start) / 4);
+
+    static const float limit = 0.2f;
+    // Computes image (padded to multiple of 8x8) of local pixel differences.
+    // Subsample both directions by 4.
+    for (size_t y = y_start; y < y_end; ++y) {
+      size_t y2 = y + 1 < ysize ? y + 1 : y;
+      size_t y1 = y > 0 ? y - 1 : y;
+
+      const float* row_in = xyb.PlaneRow(1, y);
+      const float* row_in1 = xyb.PlaneRow(1, y1);
+      const float* row_in2 = xyb.PlaneRow(1, y2);
+      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
+
+      auto scalar_pixel = [&](size_t x) {
+        const size_t x2 = x + 1 < xsize ? x + 1 : x;
+        const size_t x1 = x > 0 ? x - 1 : x;
+        const float base =
+            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
+        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
+            row_in[x] + match_gamma_offset);
+        float diff = gammac * (row_in[x] - base);
+        diff *= diff;
+        if (diff >= limit) {
+          diff = limit;
+        }
+        diff = MaskingSqrt(diff);
+        if ((y % 4) != 0) {
+          row_out[x - x0] += diff;
+        } else {
+          row_out[x - x0] = diff;
+        }
+      };
+
+      size_t x = x0;
+      // First pixel of the row (the SIMD loop below loads x - 1).
+      if (x0 == 0) {
+        scalar_pixel(x0);
+        ++x;
+      }
+      // SIMD over full vectors; the tail is handled by the scalar loop below.
+      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
+      const auto quarter = Set(df, 0.25f);
+      for (; x + 1 + Lanes(df) < x1; x += Lanes(df)) {
+        const auto in = LoadU(df, row_in + x);
+        const auto in_r = LoadU(df, row_in + x + 1);
+        const auto in_l = LoadU(df, row_in + x - 1);
+        const auto in_t = LoadU(df, row_in2 + x);
+        const auto in_b = LoadU(df, row_in1 + x);
+        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
+        auto gammacv =
+            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
+                df, Add(in, match_gamma_offset_v));
+        auto diff = Mul(gammacv, Sub(in, base));
+        diff = Mul(diff, diff);
+        diff = Min(diff, Set(df, limit));
+        diff = MaskingSqrt(df, diff);
+        if ((y & 3) != 0) {
+          diff = Add(diff, LoadU(df, row_out + x - x0));
+        }
+        StoreU(diff, df, row_out + x - x0);
+      }
+      // Scalar tail.
+      for (; x < x1; ++x) {
+        scalar_pixel(x);
+      }
+      if (y % 4 == 3) {  // four rows summed in row_out: emit one output row
+        float* row_dout = pre_erosion[thread].Row((y - y_start) / 4);
+        for (size_t x = 0; x < (x1 - x0) / 4; x++) {
+          row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
+                         row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
+                        0.25f;
+        }
+      }
+    }
+    Rect from_rect(x0 % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
+                   rect.xsize() * 2, rect.ysize() * 2);
+    FuzzyErosion(from_rect, pre_erosion[thread], rect, &aq_map);
+    for (size_t y = 0; y < rect.ysize(); ++y) {
+      const float* aq_map_row = rect.ConstRow(aq_map, y);
+      float* mask_row = rect.Row(mask, y);
+      for (size_t x = 0; x < rect.xsize(); ++x) {
+        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
+      }
+    }
+    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
+                        xyb.Plane(2), scale, rect, &aq_map);
+  }
+  std::vector<ImageF> pre_erosion;  // scratch images, indexed by thread
+  ImageF aq_map;  // one value per kBlockDim x kBlockDim block (see Init)
+  ImageF diff_buffer;  // scratch rows of pixel differences, one per thread
+};
+
+ImageF AdaptiveQuantizationMap(const float butteraugli_target,
+                               const Image3F& xyb,
+                               const FrameDimensions& frame_dim, float scale,
+                               ThreadPool* pool, ImageF* mask) {
+  PROFILER_ZONE("aq AdaptiveQuantMap");
+
+  // Builds the AQ map (returned) and `*mask` by running ComputeTile on every
+  // kEncTileDimInBlocks-sized tile of the frame, one pool task per tile.
+  AdaptiveQuantizationImpl impl;
+  impl.Init(xyb);
+  *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  const size_t tiles_x = DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks);
+  const size_t tiles_y = DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks);
+
+  const auto allocate_scratch = [&](const size_t num_threads) {
+    impl.PrepareBuffers(num_threads);
+    return true;
+  };
+  const auto process_tile = [&](const uint32_t tid, const size_t thread) {
+    // Tiles are numbered in raster order.
+    const size_t bx0 = (tid % tiles_x) * kEncTileDimInBlocks;
+    const size_t by0 = (tid / tiles_x) * kEncTileDimInBlocks;
+    // Tiles on the right/bottom edge may be truncated by the frame.
+    const size_t bx1 =
+        std::min(bx0 + kEncTileDimInBlocks, frame_dim.xsize_blocks);
+    const size_t by1 =
+        std::min(by0 + kEncTileDimInBlocks, frame_dim.ysize_blocks);
+    const Rect tile(bx0, by0, bx1 - bx0, by1 - by0);
+    impl.ComputeTile(butteraugli_target, scale, xyb, tile, thread, mask);
+  };
+  JXL_CHECK(RunOnPool(pool, 0, tiles_x * tiles_y, allocate_scratch,
+                      process_tile, "AQ DiffPrecompute"));
+
+  return std::move(impl).aq_map;
+}
+
+}  // namespace
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(AdaptiveQuantizationMap);
+
+namespace {
+// If true, prints the quantization maps at each iteration.
+bool FLAGS_dump_quant_state = false;
+
+void DumpHeatmap(const AuxOut* aux_out, const std::string& label,
+                 const ImageF& image, float good_threshold,
+                 float bad_threshold) {
+  char dump_name[200];  // label followed by the zero-padded iteration count
+  snprintf(dump_name, sizeof(dump_name), "%s%05d", label.c_str(),
+           aux_out->num_butteraugli_iters);
+  Image3F colors = CreateHeatMapImage(image, good_threshold, bad_threshold);
+  aux_out->DumpImage(dump_name, colors);
+}
+
+void DumpHeatmaps(const AuxOut* aux_out, float ba_target,
+                  const ImageF& quant_field, const ImageF& tile_heatmap,
+                  const ImageF& bt_diffmap) {
+  if (!WantDebugOutput(aux_out)) return;
+  // Dump the reciprocal of the quant field next to the two distance maps.
+  ImageF reciprocal(quant_field.xsize(), quant_field.ysize());
+  for (size_t y = 0; y < quant_field.ysize(); ++y) {
+    const float* JXL_RESTRICT src = quant_field.ConstRow(y);
+    float* JXL_RESTRICT dst = reciprocal.Row(y);
+    // Entries of the quant field are never zero.
+    for (size_t x = 0; x < quant_field.xsize(); ++x) dst[x] = 1.0f / src[x];
+  }
+  DumpHeatmap(aux_out, "quant_heatmap", reciprocal, 4.0f * ba_target,
+              6.0f * ba_target);
+  DumpHeatmap(aux_out, "tile_heatmap", tile_heatmap, ba_target,
+              1.5f * ba_target);
+  // matches heat maps produced by the command line tool.
+  DumpHeatmap(aux_out, "bt_diffmap", bt_diffmap, ButteraugliFuzzyInverse(1.5),
+              ButteraugliFuzzyInverse(0.5));
+}
+
+ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin,
+                   const AcStrategyImage& ac_strategy) {
+  PROFILER_FUNC;
+  // For every transform (AC strategy) block, condenses the distances of the
+  // pixels it covers, plus `margin` pixels around it, into a scaled 16-norm
+  // and writes that value to each `tile_size`-sized tile the block spans.
+  const int tiles_x = (distmap.xsize() + tile_size - 1) / tile_size;
+  const int tiles_y = (distmap.ysize() + tile_size - 1) / tile_size;
+  ImageF result(tiles_x, tiles_y);
+  const size_t stride = result.PixelsPerRow();
+  // Weights of pixels on one edge / in a corner of the summed region.
+  constexpr float kBorderMul = 0.98f;
+  constexpr float kCornerMul = 0.7f;
+  // 16th norm is less than the max norm, we reduce the difference
+  // with this normalization factor.
+  constexpr float kTileNorm = 1.2f;
+
+  for (int ty = 0; ty < tiles_y; ++ty) {
+    AcStrategyRow strategies = ac_strategy.ConstRow(ty);
+    float* JXL_RESTRICT out_row = result.Row(ty);
+    for (int tx = 0; tx < tiles_x; ++tx) {
+      AcStrategy acs = strategies[tx];
+      // Other tiles are filled in when their transform's first block is seen.
+      if (!acs.IsFirstBlock()) continue;
+      const int span_x = acs.covered_blocks_x() * tile_size;
+      const int span_y = acs.covered_blocks_y() * tile_size;
+      const int y_begin = std::max<int>(0, tile_size * ty - margin);
+      const int y_end =
+          std::min<int>(distmap.ysize(), tile_size * ty + span_y + margin);
+      const int x_begin = std::max<int>(0, tile_size * tx - margin);
+      const int x_end =
+          std::min<int>(distmap.xsize(), tile_size * tx + span_x + margin);
+      float sum16 = 0.0;
+      double weight = 0;
+      for (int y = y_begin; y < y_end; ++y) {
+        const bool y_edge = margin != 0 && (y == y_begin || y == y_end - 1);
+        const float* const JXL_RESTRICT row = distmap.Row(y);
+        for (int x = x_begin; x < x_end; ++x) {
+          const bool x_edge = margin != 0 && (x == x_begin || x == x_end - 1);
+          const float w = x_edge ? (y_edge ? kCornerMul : kBorderMul)
+                                 : (y_edge ? kBorderMul : 1.0f);
+          // Raise the distance to the 16th power by squaring four times.
+          float v = row[x];
+          v *= v;
+          v *= v;
+          v *= v;
+          v *= v;
+          sum16 += w * v;
+          weight += w;
+        }
+      }
+      if (weight == 0) weight = 1;
+      const float tile_dist =
+          kTileNorm * std::pow(sum16 / weight, 1.0f / 16.0f);
+      out_row[tx] = tile_dist;
+      // Replicate the value over every tile covered by this transform.
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          out_row[tx + stride * iy + ix] = tile_dist;
+        }
+      }
+    }
+  }
+  return result;
+}
+
+static const float kDcQuantPow = 0.83;
+static const float kDcQuant = 1.095924047623553f;
+static const float kAcQuant = 0.80751132443618624f;
+
+void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
+                          PassesEncoderState* enc_state,
+                          const JxlCmsInterface& cms, ThreadPool* pool,
+                          AuxOut* aux_out) {
+  const CompressParams& cparams = enc_state->cparams;
+  if (cparams.resampling > 1 &&
+      cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) {
+    // For downsampled opsin image, the butteraugli based adaptive quantization
+    // loop would only make the size bigger without improving the distance much,
+    // so in this case we enable it only for very high butteraugli targets.
+    return;
+  }
+  Quantizer& quantizer = enc_state->shared.quantizer;
+  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
+  ImageF& quant_field = enc_state->initial_quant_field;
+
+  // TODO(veluca): this should really be rather handled on the
+  // ButteraugliComparator side.
+  struct TemporaryShrink {
+    TemporaryShrink(ImageBundle& bundle, size_t xsize, size_t ysize)
+        : bundle(bundle),
+          orig_xsize(bundle.xsize()),
+          orig_ysize(bundle.ysize()) {
+      bundle.ShrinkTo(xsize, ysize);
+    }
+    TemporaryShrink(const TemporaryShrink&) = delete;
+    TemporaryShrink(TemporaryShrink&&) = delete;
+
+    ~TemporaryShrink() { bundle.ShrinkTo(orig_xsize, orig_ysize); }
+
+    ImageBundle& bundle;
+    size_t orig_xsize;
+    size_t orig_ysize;
+  } t(const_cast<ImageBundle&>(linear),
+      enc_state->shared.frame_header.nonserialized_metadata->xsize(),
+      enc_state->shared.frame_header.nonserialized_metadata->ysize());
+
+  const float butteraugli_target = cparams.butteraugli_distance;
+  const float original_butteraugli = cparams.original_butteraugli_distance;
+  ButteraugliParams params = cparams.ba_params;
+  params.intensity_target = linear.metadata()->IntensityTarget();
+  // Hack the default intensity target value to be 80.0, the intensity
+  // target of sRGB images and a more reasonable viewing default than
+  // JPEG XL file format's default.
+  if (fabs(params.intensity_target - 255.0f) < 1e-3) {
+    params.intensity_target = 80.0f;
+  }
+  JxlButteraugliComparator comparator(params, cms);
+  JXL_CHECK(comparator.SetReferenceImage(linear));
+  bool lower_is_better =
+      (comparator.GoodQualityScore() < comparator.BadQualityScore());
+  const float initial_quant_dc = InitialQuantDC(butteraugli_target);
+  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
+                   &quant_field);
+  ImageF tile_distmap;
+  ImageF initial_quant_field = CopyImage(quant_field);
+
+  float initial_qf_min, initial_qf_max;
+  ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max);
+  float initial_qf_ratio = initial_qf_max / initial_qf_min;
+  float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio);
+  float asymmetry = 2;
+  if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low;
+  float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low);
+  float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry);
+
+  JXL_ASSERT(qf_higher / qf_lower < 253);
+
+  constexpr int kOriginalComparisonRound = 1;
+  int iters = cparams.max_butteraugli_iters;
+  if (iters > 7) {
+    iters = 7;
+  }
+  if (cparams.speed_tier != SpeedTier::kTortoise) {
+    iters = 2;
+  }
+  for (int i = 0; i < iters + 1; ++i) {
+    if (FLAGS_dump_quant_state) {
+      printf("\nQuantization field:\n");
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          printf(" %.5f", quant_field.Row(y)[x]);
+        }
+        printf("\n");
+      }
+    }
+    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+    ImageBundle dec_linear = RoundtripImage(opsin, enc_state, cms, pool);
+    PROFILER_ZONE("enc Butteraugli");
+    float score;
+    ImageF diffmap;
+    JXL_CHECK(comparator.CompareWith(dec_linear, &diffmap, &score));
+    if (!lower_is_better) {
+      score = -score;
+      diffmap = ScaleImage(-1.0f, diffmap);
+    }
+    tile_distmap = TileDistMap(diffmap, 8 * cparams.resampling, 0,
+                               enc_state->shared.ac_strategy);
+    if (WantDebugOutput(aux_out)) {
+      aux_out->DumpImage(("dec" + ToString(i)).c_str(), *dec_linear.color());
+      DumpHeatmaps(aux_out, butteraugli_target, quant_field, tile_distmap,
+                   diffmap);
+    }
+    if (aux_out != nullptr) ++aux_out->num_butteraugli_iters;
+    if (cparams.log_search_state) {
+      float minval, maxval;
+      ImageMinMax(quant_field, &minval, &maxval);
+      printf("\nButteraugli iter: %d/%d\n", i, cparams.max_butteraugli_iters);
+      printf("Butteraugli distance: %f  (target = %f)\n", score,
+             original_butteraugli);
+      printf("quant range: %f ... %f  DC quant: %f\n", minval, maxval,
+             initial_quant_dc);
+      if (FLAGS_dump_quant_state) {
+        quantizer.DumpQuantizationMap(raw_quant_field);
+      }
+    }
+
+    if (i == iters) break;
+
+    double kPow[8] = {
+        0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+    };
+    double kPowMod[8] = {
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+    };
+    if (i == kOriginalComparisonRound) {
+      // Don't allow optimization to make the quant field a lot worse than
+      // what the initial guess was. This allows the AC field to have enough
+      // precision to reduce the oscillations due to the dc reconstruction.
+      double kInitMul = 0.6;
+      const double kOneMinusInitMul = 1.0 - kInitMul;
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        float* const JXL_RESTRICT row_q = quant_field.Row(y);
+        const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y);
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x];
+          if (row_q[x] < clamp) {
+            row_q[x] = clamp;
+            if (row_q[x] > qf_higher) row_q[x] = qf_higher;
+            if (row_q[x] < qf_lower) row_q[x] = qf_lower;
+          }
+        }
+      }
+    }
+
+    double cur_pow = 0.0;
+    if (i < 7) {
+      cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i];
+      if (cur_pow < 0) {
+        cur_pow = 0;
+      }
+    }
+    if (cur_pow == 0.0) {
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
+        float* const JXL_RESTRICT row_q = quant_field.Row(y);
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          const float diff = row_dist[x] / original_butteraugli;
+          if (diff > 1.0f) {
+            float old = row_q[x];
+            row_q[x] *= diff;
+            int qf_old = old * quantizer.InvGlobalScale() + 0.5;
+            int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5;
+            if (qf_old == qf_new) {
+              row_q[x] = old + quantizer.Scale();
+            }
+          }
+          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
+          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
+        }
+      }
+    } else {
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
+        float* const JXL_RESTRICT row_q = quant_field.Row(y);
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          const float diff = row_dist[x] / original_butteraugli;
+          if (diff <= 1.0f) {
+            row_q[x] *= std::pow(diff, cur_pow);
+          } else {
+            float old = row_q[x];
+            row_q[x] *= diff;
+            int qf_old = old * quantizer.InvGlobalScale() + 0.5;
+            int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5;
+            if (qf_old == qf_new) {
+              row_q[x] = old + quantizer.Scale();
+            }
+          }
+          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
+          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
+        }
+      }
+    }
+  }
+  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+}
+
+void FindBestQuantizationMaxError(const Image3F& opsin,
+                                  PassesEncoderState* enc_state,
+                                  const JxlCmsInterface& cms, ThreadPool* pool,
+                                  AuxOut* aux_out) {
+  // Max-error quant search. TODO(szabadka): support non-opsin color spaces.
+  const CompressParams& cparams = enc_state->cparams;
+  Quantizer& quantizer = enc_state->shared.quantizer;
+  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
+  ImageF& quant_field = enc_state->initial_quant_field;
+
+  // TODO(veluca): better choice of this value.
+  const float initial_quant_dc =
+      16 * std::sqrt(0.1f / cparams.butteraugli_distance);
+  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
+                   &quant_field);
+
+  const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0],
+                                1.0f / enc_state->cparams.max_error[1],
+                                1.0f / enc_state->cparams.max_error[2]};  // per channel
+  // Each pass: quantize, roundtrip, then rescale the qf of every varblock.
+  for (int i = 0; i < cparams.max_butteraugli_iters + 1; ++i) {
+    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+    if (aux_out) {
+      aux_out->DumpXybImage(("ops" + ToString(i)).c_str(), opsin);
+    }
+    ImageBundle decoded = RoundtripImage(opsin, enc_state, cms, pool);
+    if (aux_out) {
+      aux_out->DumpXybImage(("dec" + ToString(i)).c_str(), *decoded.color());
+    }
+
+    for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) {
+      AcStrategyRow ac_strategy_row =
+          enc_state->shared.ac_strategy.ConstRow(by);
+      for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) {
+        AcStrategy acs = ac_strategy_row[bx];
+        if (!acs.IsFirstBlock()) continue;  // presumably: one visit per varblock
+        float max_error = 0;  // largest normalized |opsin - decoded| in the varblock
+        for (size_t c = 0; c < 3; c++) {
+          for (size_t y = by * kBlockDim;
+               y < (by + acs.covered_blocks_y()) * kBlockDim; y++) {
+            if (y >= decoded.ysize()) continue;  // row beyond the decoded image
+            const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y);
+            const float* JXL_RESTRICT dec_row =
+                decoded.color()->ConstPlaneRow(c, y);
+            for (size_t x = bx * kBlockDim;
+                 x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) {
+              if (x >= decoded.xsize()) continue;  // column beyond the image
+              max_error = std::max(
+                  std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error);
+            }
+          }
+        }
+        // `max_error` is normalized here: 1.0 equals the configured limit.
+        // Keep it between 0.5 and 1.0. Above 1.0 the qf is multiplied by the
+        // error, i.e. increased to compensate. Below 0.5 it is multiplied by
+        // twice the error, i.e. decreased. In between (qf_mul == 1) the qf is
+        // left unchanged.
+        const float qf_mul = (max_error < 0.5f)   ? max_error * 2.0f
+                             : (max_error > 1.0f) ? max_error
+                                                  : 1.0f;
+        for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) {
+          float* JXL_RESTRICT quant_field_row = quant_field.Row(qy);
+          for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) {
+            quant_field_row[qx] *= qf_mul;
+          }
+        }
+      }
+    }
+  }
+  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+}
+
+}  // namespace
+
+void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
+                      ImageF* quant_field) {
+  // For every transform, overwrite all quant field entries it covers with the
+  // largest entry found among them (a no-op for single-block transforms).
+  const size_t row_stride = quant_field->PixelsPerRow();
+  for (size_t by = 0; by < rect.ysize(); ++by) {
+    AcStrategyRow strategies = ac_strategy.ConstRow(rect, by);
+    float* JXL_RESTRICT row = rect.Row(quant_field, by);
+    for (size_t bx = 0; bx < rect.xsize(); ++bx) {
+      AcStrategy acs = strategies[bx];
+      if (!acs.IsFirstBlock()) continue;
+      JXL_ASSERT(bx + acs.covered_blocks_x() <= quant_field->xsize());
+      JXL_ASSERT(by + acs.covered_blocks_y() <= quant_field->ysize());
+      float largest = row[bx];
+      for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) {
+        for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) {
+          largest = std::max(row[bx + dx + dy * row_stride], largest);
+        }
+      }
+      // Second pass: broadcast the maximum over the same footprint.
+      for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) {
+        float* JXL_RESTRICT span = row + bx + dy * row_stride;
+        std::fill_n(span, acs.covered_blocks_x(), largest);
+      }
+    }
+  }
+}
+
+float InitialQuantDC(float butteraugli_target) {
+  const float kDcMul = 0.3;  // Butteraugli target where non-linearity kicks in.
+  const float butteraugli_target_dc = std::max<float>(  // clamp to [0.5t, t]
+      0.5f * butteraugli_target,                        // lower bound
+      std::min<float>(butteraugli_target,               // upper bound
+                      kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target,
+                                        kDcQuantPow)));
+  // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc.
+  // The maximum DC value might not be in the kXybRange because of inverse
+  // gaborish, so we add some slack to the maximum theoretical quant obtained
+  // this way (64): the value returned below is capped at 50 instead.
+  return std::min(kDcQuant / butteraugli_target_dc, 50.f);
+}
+
+ImageF InitialQuantField(const float butteraugli_target, const Image3F& opsin,
+                         const FrameDimensions& frame_dim, ThreadPool* pool,
+                         float rescale, ImageF* mask) {
+  PROFILER_FUNC;
+  const float unscaled_ac = kAcQuant / butteraugli_target;  // before `rescale`
+  return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(
+      butteraugli_target, opsin, frame_dim, unscaled_ac * rescale, pool, mask);
+}
+
+void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin,
+                       PassesEncoderState* enc_state,
+                       const JxlCmsInterface& cms, ThreadPool* pool,
+                       AuxOut* aux_out, double rescale) {
+  if (enc_state->cparams.max_error_mode) {
+    PROFILER_ZONE("enc find best maxerr");
+    FindBestQuantizationMaxError(opsin, enc_state, cms, pool, aux_out);
+    return;
+  }
+  if (enc_state->cparams.speed_tier <= SpeedTier::kKitten) {
+    PROFILER_ZONE("enc find best2");  // Normal encoding to a butteraugli score.
+    FindBestQuantization(*linear, opsin, enc_state, cms, pool, aux_out);
+  }
+}
+
+ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
+                           const JxlCmsInterface& cms, ThreadPool* pool) {
+  PROFILER_ZONE("enc roundtrip");
+  std::unique_ptr<PassesDecoderState> dec_state =
+      jxl::make_unique<PassesDecoderState>();
+  JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata(
+      *enc_state->shared.metadata));
+  dec_state->shared = &enc_state->shared;  // decoder points at encoder's state
+  JXL_ASSERT(opsin.ysize() % kBlockDim == 0);
+
+  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
+  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
+  const size_t num_groups = xsize_groups * ysize_groups;
+  // Remembered so that the list can be cut back to this size at the end.
+  size_t num_special_frames = enc_state->special_frames.size();
+
+  std::unique_ptr<ModularFrameEncoder> modular_frame_encoder =
+      jxl::make_unique<ModularFrameEncoder>(enc_state->shared.frame_header,
+                                            enc_state->cparams);
+  JXL_CHECK(InitializePassesEncoder(opsin, cms, pool, enc_state,
+                                    modular_frame_encoder.get(), nullptr));
+  JXL_CHECK(dec_state->Init());
+  JXL_CHECK(dec_state->InitForAC(pool));
+
+  ImageBundle decoded(&enc_state->shared.metadata->m);
+  decoded.origin = enc_state->shared.frame_header.frame_origin;
+  decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()),
+                       dec_state->output_encoding_info.color_encoding);
+
+  PassesDecoderState::PipelineOptions options;
+  options.use_slow_render_pipeline = false;
+  options.coalescing = true;
+  options.render_spotcolors = false;
+
+  // Same as dec_state->shared->frame_header.nonserialized_metadata->m
+  const ImageMetadata& metadata = *decoded.metadata();
+
+  JXL_CHECK(dec_state->PreparePipeline(&decoded, options));
+
+  hwy::AlignedUniquePtr<GroupDecCache[]> group_dec_caches;  // one per thread
+  const auto allocate_storage = [&](const size_t num_threads) -> Status {
+    JXL_RETURN_IF_ERROR(
+        dec_state->render_pipeline->PrepareForThreads(num_threads,
+                                                      /*use_group_ids=*/false));
+    group_dec_caches = hwy::MakeUniqueAlignedArray<GroupDecCache>(num_threads);
+    return true;
+  };
+  const auto process_group = [&](const uint32_t group_index,
+                                 const size_t thread) {
+    if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) {
+      ComputeSigma(dec_state->shared->BlockGroupRect(group_index),
+                   dec_state.get());
+    }
+    RenderPipelineInput input =
+        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
+    JXL_CHECK(DecodeGroupForRoundtrip(
+        enc_state->coeffs, group_index, dec_state.get(),
+        &group_dec_caches[thread], thread, input, &decoded, nullptr));
+    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
+      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);  // after 3 color
+      FillPlane(0.0f, ri.first, ri.second);  // extra channels are zero-filled
+    }
+    input.Done();
+  };
+  JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group,
+                      "AQ loop"));
+
+  // Ensure we don't create any new special frames.
+  enc_state->special_frames.resize(num_special_frames);
+
+  return decoded;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
new file mode 100644
index 0000000000..a63c574492
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
+#define LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/splines.h"
+
+// Heuristics to find a good quantizer for a given image. InitialQuantField
+// produces a quantization field (i.e. relative quantization amounts for each
+// block) out of an opsin-space image. `InitialQuantField` uses heuristics,
+// `FindBestQuantizer` (in non-fast mode) will run multiple encoding-decoding
+// steps and try to improve the given quant field.
+
+namespace jxl {
+
+struct AuxOut;
+
+// Computes the decoded image for a given set of compression parameters. Mainly
+// used in the FindBestQuantization loops and in some tests.
+// TODO(veluca): this doesn't seem the best possible file for this function.
+ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
+                           const JxlCmsInterface& cms, ThreadPool* pool);
+
+// Returns an image subsampled by kBlockDim in each direction. If the value
+// at pixel (x,y) in the returned image is greater than 1.0, it means that
+// more fine-grained quantization should be used in the corresponding block
+// of the input image, while a value less than 1.0 indicates that less
+// fine-grained quantization should be enough. Returns a mask, too, which
+// can later be used to make better decisions about ac strategy.
+ImageF InitialQuantField(float butteraugli_target, const Image3F& opsin,
+                         const FrameDimensions& frame_dim, ThreadPool* pool,
+                         float rescale, ImageF* initial_quant_mask);
+
+float InitialQuantDC(float butteraugli_target);
+
+void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
+                      ImageF* quant_field);
+
+// Improves the quant field held in `enc_state` through encode/decode
+// roundtrips and updates the quantizer stored there; returns nothing. Runs
+// only in max-error mode or at speed tier Kitten or slower. `linear` is only
+// dereferenced by the latter (butteraugli) search; `rescale` is unused.
+void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin,
+                       PassesEncoderState* enc_state,
+                       const JxlCmsInterface& cms, ThreadPool* pool,
+                       AuxOut* aux_out, double rescale = 1.0);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans.cc b/third_party/jpeg-xl/lib/jxl/enc_ans.cc
new file mode 100644
index 0000000000..4249426bc9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ans.cc
@@ -0,0 +1,1688 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ans.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cluster.h"
+#include "lib/jxl/enc_context_map.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_huffman.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+namespace {
+
+bool ans_fuzzer_friendly_ = false;
+
+static const int kMaxNumSymbolsForSmallCode = 4;  // size of `symbols` arrays
+
+void ANSBuildInfoTable(const ANSHistBin* counts, const AliasTable::Entry* table,
+                       size_t alphabet_size, size_t log_alpha_size,
+                       ANSEncSymbolInfo* info) {
+  size_t log_entry_size = ANS_LOG_TAB_SIZE - log_alpha_size;
+  size_t entry_size_minus_1 = (1 << log_entry_size) - 1;
+  // Runs once even for an empty alphabet: symbol 0 then owns the full table.
+  for (size_t s = 0; s < std::max<size_t>(1, alphabet_size); ++s) {
+    const ANSHistBin freq = s == alphabet_size ? ANS_TAB_SIZE : counts[s];
+    info[s].freq_ = static_cast<uint16_t>(freq);
+#ifdef USE_MULT_BY_RECIPROCAL
+    if (freq != 0) {
+      info[s].ifreq_ =  // ceil(2^RECIPROCAL_PRECISION / freq_)
+          ((1ull << RECIPROCAL_PRECISION) + info[s].freq_ - 1) / info[s].freq_;
+    } else {
+      info[s].ifreq_ = 1;  // shouldn't matter (symbol shouldn't occur), but...
+    }
+#endif
+    info[s].reverse_map_.resize(freq);  // one slot per table entry of symbol s
+  }
+  for (int i = 0; i < ANS_TAB_SIZE; i++) {  // invert the alias table lookup
+    AliasTable::Symbol s =
+        AliasTable::Lookup(table, i, log_entry_size, entry_size_minus_1);
+    info[s.value].reverse_map_[s.offset] = i;  // (symbol, offset) -> index i
+  }
+}
+
+float EstimateDataBits(const ANSHistBin* histogram, const ANSHistBin* counts,
+                       size_t len) {
+  float bits = 0.0f;
+  int num_samples = 0;
+  int table_sum = 0;
+  for (size_t s = 0; s < len; ++s) {
+    num_samples += histogram[s];
+    table_sum += counts[s];
+    if (histogram[s] <= 0) continue;
+    JXL_ASSERT(counts[s] > 0);
+    // Each occurrence costs -log2(counts[s] / table_sum) bits, floored at 0.
+    const float symbol_cost =
+        std::max(0.0f, ANS_LOG_TAB_SIZE - FastLog2f(counts[s]));
+    bits += histogram[s] * symbol_cost;
+  }
+  (void)table_sum;  // Only read by the assert below.
+  // A non-empty histogram must come with counts that sum to the table size.
+  if (num_samples > 0) {
+    JXL_ASSERT(table_sum == ANS_TAB_SIZE);
+  }
+  return bits;
+}
+
+float EstimateDataBitsFlat(const ANSHistBin* histogram, size_t len) {
+  // With a flat code every symbol costs log2(len) bits, floored at zero.
+  const float bits_per_symbol = std::max(FastLog2f(len), 0.0f);
+  // The float seed keeps the summation in float, entry by entry, in order.
+  const float num_samples =
+      std::accumulate(histogram, histogram + len, 0.0f);
+  return num_samples * bits_per_symbol;
+}
+
+// Static Huffman code for encoding logcounts, indexed by the logcount value.
+// The last entry (index ANS_LOG_TAB_SIZE + 1) is the RLE marker symbol.
+static const uint8_t kLogCountBitLengths[ANS_LOG_TAB_SIZE + 2] = {
+    5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 6, 7, 7,
+};
+static const uint8_t kLogCountSymbols[ANS_LOG_TAB_SIZE + 2] = {
+    17, 11, 15, 3, 9, 7, 4, 2, 5, 6, 0, 33, 1, 65,
+};
+
+// Returns the gap between the nearest representable counts below and above
+// "count": 1 << (number of low bits dropped at this shift), or 1 if none.
+static int SmallestIncrement(uint32_t count, uint32_t shift) {
+  const int top_bit = count == 0 ? -1 : FloorLog2Nonzero(count);
+  const int dropped = top_bit - GetPopulationCountPrecision(top_bit, shift);
+  return dropped >= 0 ? (1 << dropped) : 1;
+}
+
+template <bool minimize_error_of_sum>
+bool RebalanceHistogram(const float* targets, int max_symbol, int table_size,
+                        uint32_t shift, int* omit_pos, ANSHistBin* counts) {
+  int sum = 0;                  // integer counts assigned so far
+  float sum_nonrounded = 0.0;   // matching sum of the (float) targets
+  int remainder_pos = 0;  // if all of them are handled in first loop
+  int remainder_log = -1;
+  for (int n = 0; n < max_symbol; ++n) {
+    if (targets[n] > 0 && targets[n] < 1.0f) {
+      counts[n] = 1;  // targets in (0, 1) always get a count of 1
+      sum_nonrounded += targets[n];
+      sum += counts[n];
+    }
+  }
+  const float discount_ratio =  // scales the >= 1 targets in the loop below
+      (table_size - sum) / (table_size - sum_nonrounded);
+  JXL_ASSERT(discount_ratio > 0);
+  JXL_ASSERT(discount_ratio <= 1.0f);
+  // Invariant for minimize_error_of_sum == true:
+  // abs(sum - sum_nonrounded)
+  //   <= SmallestIncrement(max(targets[])) + max_symbol
+  for (int n = 0; n < max_symbol; ++n) {
+    if (targets[n] >= 1.0f) {
+      sum_nonrounded += targets[n];
+      counts[n] =
+          static_cast<ANSHistBin>(targets[n] * discount_ratio);  // truncate
+      if (counts[n] == 0) counts[n] = 1;
+      if (counts[n] == table_size) counts[n] = table_size - 1;
+      // Round the count to the closest nonzero multiple of SmallestIncrement
+      // (when minimize_error_of_sum is false) or one of two closest so as to
+      // keep the sum as close as possible to sum_nonrounded.
+      int inc = SmallestIncrement(counts[n], shift);
+      counts[n] -= counts[n] & (inc - 1);  // clear the low bits below `inc`
+      // TODO(robryk): Should we rescale targets[n]?
+      const float target =
+          minimize_error_of_sum ? (sum_nonrounded - sum) : targets[n];
+      if (counts[n] == 0 ||
+          (target > counts[n] + inc / 2 && counts[n] + inc < table_size)) {
+        counts[n] += inc;
+      }
+      sum += counts[n];
+      const int count_log = FloorLog2Nonzero(static_cast<uint32_t>(counts[n]));
+      if (count_log > remainder_log) {  // first symbol with the largest log2
+        remainder_pos = n;
+        remainder_log = count_log;
+      }
+    }
+  }
+  JXL_ASSERT(remainder_pos != -1);  // vacuous: starts at 0, only set to n >= 0
+  // NOTE: This is the only place where counts could go negative. We could
+  // detect that, return false and make ANSHistBin uint32_t.
+  counts[remainder_pos] -= sum - table_size;  // makes the counts sum to table_size
+  *omit_pos = remainder_pos;
+  return counts[remainder_pos] > 0;  // false if the correction used it all up
+}
+
+Status NormalizeCounts(ANSHistBin* counts, int* omit_pos, const int length,
+                       const int precision_bits, uint32_t shift,
+                       int* num_symbols, int* symbols) {
+  const int32_t table_size = 1 << precision_bits;  // target sum / table size
+  uint64_t total = 0;
+  int max_symbol = 0;
+  int symbol_count = 0;
+  for (int n = 0; n < length; ++n) {
+    total += counts[n];
+    if (counts[n] > 0) {
+      if (symbol_count < kMaxNumSymbolsForSmallCode) {  // record first few only
+        symbols[symbol_count] = n;
+      }
+      ++symbol_count;
+      max_symbol = n + 1;  // one past the last nonzero count
+    }
+  }
+  *num_symbols = symbol_count;
+  if (symbol_count == 0) {
+    return true;  // empty histogram: counts are left untouched
+  }
+  if (symbol_count == 1) {
+    counts[symbols[0]] = table_size;  // the single symbol takes the whole table
+    return true;
+  }
+  if (symbol_count > table_size)
+    return JXL_FAILURE("Too many entries in an ANS histogram");
+
+  const float norm = 1.f * table_size / total;  // scales counts to table_size
+  std::vector<float> targets(max_symbol);
+  for (size_t n = 0; n < targets.size(); ++n) {
+    targets[n] = norm * counts[n];
+  }
+  if (!RebalanceHistogram<false>(&targets[0], max_symbol, table_size, shift,
+                                 omit_pos, counts)) {
+    // Use an alternative rebalancing mechanism if the one above failed
+    // to create a histogram that is positive wherever the original one was.
+    if (!RebalanceHistogram<true>(&targets[0], max_symbol, table_size, shift,
+                                  omit_pos, counts)) {
+      return JXL_FAILURE("Logic error: couldn't rebalance a histogram");
+    }
+  }
+  return true;
+}
+
+struct SizeWriter {
+  size_t size = 0;  // Running total of the bit counts passed to Write().
+  void Write(size_t n_bits, size_t /*value*/) { size += n_bits; }
+};
+
+template <typename Writer>
+void StoreVarLenUint8(size_t n, Writer* writer) {
+  JXL_DASSERT(n <= 255);
+  if (n == 0) {
+    writer->Write(1, 0);  // A single 0 bit stands for the value zero.
+    return;
+  }
+  writer->Write(1, 1);  // Nonzero marker.
+  const size_t top_bit = FloorLog2Nonzero(n);
+  writer->Write(3, top_bit);  // Position of the highest set bit, in 3 bits.
+  writer->Write(top_bit, n - (1ULL << top_bit));  // Bits below the top bit.
+}
+
+template <typename Writer>
+void StoreVarLenUint16(size_t n, Writer* writer) {
+  JXL_DASSERT(n <= 65535);
+  if (n == 0) {
+    writer->Write(1, 0);  // A single 0 bit stands for the value zero.
+    return;
+  }
+  writer->Write(1, 1);  // Nonzero marker.
+  const size_t top_bit = FloorLog2Nonzero(n);
+  writer->Write(4, top_bit);  // Position of the highest set bit, in 4 bits.
+  writer->Write(top_bit, n - (1ULL << top_bit));  // Bits below the top bit.
+}
+
+template <typename Writer>
+bool EncodeCounts(const ANSHistBin* counts, const int alphabet_size,
+                  const int omit_pos, const int num_symbols, uint32_t shift,
+                  const int* symbols, Writer* writer) {
+  bool ok = true;  // set to false below if length - 3 does not fit in 8 bits
+  if (num_symbols <= 2) {
+    // Small tree marker to encode 1-2 symbols.
+    writer->Write(1, 1);
+    if (num_symbols == 0) {
+      writer->Write(1, 0);
+      StoreVarLenUint8(0, writer);
+    } else {
+      writer->Write(1, num_symbols - 1);
+      for (int i = 0; i < num_symbols; ++i) {
+        StoreVarLenUint8(symbols[i], writer);
+      }
+    }
+    if (num_symbols == 2) {
+      writer->Write(ANS_LOG_TAB_SIZE, counts[symbols[0]]);  // first count only
+    }
+  } else {
+    // Mark non-small tree.
+    writer->Write(1, 0);
+    // Mark non-flat histogram.
+    writer->Write(1, 0);
+
+    // Precompute sequences for RLE encoding. Contains the number of identical
+    // values starting at a given index. Only contains the value at the first
+    // element of the series.
+    std::vector<uint32_t> same(alphabet_size, 0);
+    int last = 0;
+    for (int i = 1; i < alphabet_size; i++) {
+      // Store the sequence length once different symbol reached, or we're at
+      // the end, or the length is longer than we can encode, or we are at
+      // the omit_pos. We don't support including the omit_pos in an RLE
+      // sequence because this value may use a different amount of log2 bits
+      // than standard, it is too complex to handle in the decoder.
+      if (counts[i] != counts[last] || i + 1 == alphabet_size ||
+          (i - last) >= 255 || i == omit_pos || i == omit_pos + 1) {
+        same[last] = (i - last);
+        last = i + 1;
+      }
+    }
+
+    int length = 0;  // one past the last index that has to be written
+    std::vector<int> logcounts(alphabet_size);
+    int omit_log = 0;
+    for (int i = 0; i < alphabet_size; ++i) {
+      JXL_ASSERT(counts[i] <= ANS_TAB_SIZE);
+      JXL_ASSERT(counts[i] >= 0);
+      if (i == omit_pos) {
+        length = i + 1;
+      } else if (counts[i] > 0) {
+        logcounts[i] = FloorLog2Nonzero(static_cast<uint32_t>(counts[i])) + 1;
+        length = i + 1;
+        if (i < omit_pos) {
+          omit_log = std::max(omit_log, logcounts[i] + 1);
+        } else {
+          omit_log = std::max(omit_log, logcounts[i]);
+        }
+      }
+    }
+    logcounts[omit_pos] = omit_log;  // value computed in the loop above
+
+    // Elias gamma-like code for shift. Only difference is that if the number
+    // of bits to be encoded is equal to FloorLog2(ANS_LOG_TAB_SIZE+1), we skip
+    // the terminating 0 in unary coding.
+    int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1);
+    int log = FloorLog2Nonzero(shift + 1);
+    writer->Write(log, (1 << log) - 1);
+    if (log != upper_bound_log) writer->Write(1, 0);
+    writer->Write(log, ((1 << log) - 1) & (shift + 1));
+
+    // Since num_symbols >= 3, we know that length >= 3, therefore we encode
+    // length - 3.
+    if (length - 3 > 255) {
+      // Pretend that everything is OK, but complain about correctness later.
+      StoreVarLenUint8(255, writer);
+      ok = false;
+    } else {
+      StoreVarLenUint8(length - 3, writer);
+    }
+
+    // The logcount values are encoded with a static Huffman code.
+    static const size_t kMinReps = 4;
+    size_t rep = ANS_LOG_TAB_SIZE + 1;  // table index of the RLE symbol
+    for (int i = 0; i < length; ++i) {
+      if (i > 0 && same[i - 1] > kMinReps) {
+        // Encode the RLE symbol and skip the repeated ones.
+        writer->Write(kLogCountBitLengths[rep], kLogCountSymbols[rep]);
+        StoreVarLenUint8(same[i - 1] - kMinReps - 1, writer);
+        i += same[i - 1] - 2;
+        continue;
+      }
+      writer->Write(kLogCountBitLengths[logcounts[i]],
+                    kLogCountSymbols[logcounts[i]]);
+    }
+    for (int i = 0; i < length; ++i) {  // second pass: the extra count bits
+      if (i > 0 && same[i - 1] > kMinReps) {
+        // Skip symbols encoded by RLE.
+        i += same[i - 1] - 2;
+        continue;
+      }
+      if (logcounts[i] > 1 && i != omit_pos) {
+        int bitcount = GetPopulationCountPrecision(logcounts[i] - 1, shift);
+        int drop_bits = logcounts[i] - 1 - bitcount;
+        JXL_CHECK((counts[i] & ((1 << drop_bits) - 1)) == 0);
+        writer->Write(bitcount, (counts[i] >> drop_bits) - (1 << bitcount));
+      }
+    }
+  }
+  return ok;
+}
+
+void EncodeFlatHistogram(const int alphabet_size, BitWriter* writer) {
+  // Mark non-small tree.
+  writer->Write(1, 0);
+  // Mark uniform histogram.
+  writer->Write(1, 1);
+  JXL_ASSERT(alphabet_size > 0);  // keeps alphabet_size - 1 below non-negative
+  // Encode the alphabet size, stored as alphabet_size - 1.
+  StoreVarLenUint8(alphabet_size - 1, writer);
+}
+
+float ComputeHistoAndDataCost(const ANSHistBin* histogram, size_t alphabet_size,
+                              uint32_t method) {
+  if (method == 0) {  // Flat code: constant overhead plus flat data bits.
+    return ANS_LOG_TAB_SIZE + 2 +
+           EstimateDataBitsFlat(histogram, alphabet_size);
+  }
+  // Every other method is a non-flat code using shift == method - 1.
+  const uint32_t shift = method - 1;
+  std::vector<ANSHistBin> normalized(histogram, histogram + alphabet_size);
+  int omitted = 0;
+  int used_symbols;
+  int symbols[kMaxNumSymbolsForSmallCode] = {};
+  JXL_CHECK(NormalizeCounts(normalized.data(), &omitted, alphabet_size,
+                            ANS_LOG_TAB_SIZE, shift, &used_symbols, symbols));
+  SizeWriter bit_counter;
+  // Dry run that only tallies header bits, so its status can be ignored.
+  (void)EncodeCounts(normalized.data(), alphabet_size, omitted, used_symbols,
+                     shift, symbols, &bit_counter);
+  return bit_counter.size +
+         EstimateDataBits(histogram, normalized.data(), alphabet_size);
+}
+
+uint32_t ComputeBestMethod(
+    const ANSHistBin* histogram, size_t alphabet_size, float* cost,
+    HistogramParams::ANSHistogramStrategy ans_histogram_strategy) {
+  using Strategy = HistogramParams::ANSHistogramStrategy;
+  // The flat code (method 0) is the baseline; method s + 1 stands for
+  // "normalize with shift s".
+  size_t best_method = 0;
+  float best_cost = ComputeHistoAndDataCost(histogram, alphabet_size, 0);
+  // Adopts shift `shift` only if it is strictly cheaper than the best so far.
+  const auto consider = [&](size_t shift) {
+    const size_t candidate = shift + 1;
+    const float candidate_cost =
+        ComputeHistoAndDataCost(histogram, alphabet_size, candidate);
+    if (!(candidate_cost < best_cost)) return;
+    best_method = candidate;
+    best_cost = candidate_cost;
+  };
+  // The strategy only decides which shifts are evaluated.
+  switch (ans_histogram_strategy) {
+    case Strategy::kPrecise:  // every shift
+      for (uint32_t s = 0; s <= ANS_LOG_TAB_SIZE; ++s) consider(s);
+      break;
+    case Strategy::kApproximate:  // every second shift
+      for (uint32_t s = 0; s <= ANS_LOG_TAB_SIZE; s += 2) consider(s);
+      break;
+    case Strategy::kFast:  // lowest, middle and highest shift only
+      consider(0);
+      consider(ANS_LOG_TAB_SIZE / 2);
+      consider(ANS_LOG_TAB_SIZE);
+      break;
+  }
+  // The winning cost goes out through `cost`, the method is the return value.
+  *cost = best_cost;
+  return best_method;
+}
+
+}  // namespace
+
+// Fills `info` from `histogram` and, if `writer` is non-null, stores the code.
+// Returns an estimate of the cost of the histogram and the corresponding data.
+size_t BuildAndStoreANSEncodingData(
+    HistogramParams::ANSHistogramStrategy ans_histogram_strategy,
+    const ANSHistBin* histogram, size_t alphabet_size, size_t log_alpha_size,
+    bool use_prefix_code, ANSEncSymbolInfo* info, BitWriter* writer) {
+  if (use_prefix_code) {
+    if (alphabet_size <= 1) return 0;  // nothing stored, `info` untouched
+    std::vector<uint32_t> histo(alphabet_size);
+    for (size_t i = 0; i < alphabet_size; i++) {
+      histo[i] = histogram[i];
+      JXL_CHECK(histogram[i] >= 0);
+    }
+    size_t cost = 0;
+    {
+      std::vector<uint8_t> depths(alphabet_size);
+      std::vector<uint16_t> bits(alphabet_size);
+      if (writer == nullptr) {  // cost-only run: write into a scratch writer
+        BitWriter tmp_writer;
+        BitWriter::Allotment allotment(
+            &tmp_writer, 8 * alphabet_size + 8);  // safe upper bound
+        BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(),
+                                 bits.data(), &tmp_writer);
+        allotment.ReclaimAndCharge(&tmp_writer, 0, /*aux_out=*/nullptr);
+        cost = tmp_writer.BitsWritten();
+      } else {
+        size_t start = writer->BitsWritten();
+        BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(),
+                                 bits.data(), writer);
+        cost = writer->BitsWritten() - start;  // bits added by the tree
+      }
+      for (size_t i = 0; i < alphabet_size; i++) {
+        info[i].bits = depths[i] == 0 ? 0 : bits[i];  // depth 0 => bits 0
+        info[i].depth = depths[i];
+      }
+    }
+    // Estimate data cost: every occurrence costs its symbol's depth.
+    for (size_t i = 0; i < alphabet_size; i++) {
+      cost += histogram[i] * info[i].depth;
+    }
+    return cost;
+  }
+  JXL_ASSERT(alphabet_size <= ANS_TAB_SIZE);
+  // Ensure we ignore trailing zeros in the histogram.
+  if (alphabet_size != 0) {
+    size_t largest_symbol = 0;
+    for (size_t i = 0; i < alphabet_size; i++) {
+      if (histogram[i] != 0) largest_symbol = i;
+    }
+    alphabet_size = largest_symbol + 1;
+  }
+  float cost;  // set by ComputeBestMethod; converted to size_t on return
+  uint32_t method = ComputeBestMethod(histogram, alphabet_size, &cost,
+                                      ans_histogram_strategy);
+  JXL_ASSERT(cost >= 0);
+  int num_symbols;
+  int symbols[kMaxNumSymbolsForSmallCode] = {};
+  std::vector<ANSHistBin> counts(histogram, histogram + alphabet_size);
+  if (!counts.empty()) {
+    size_t sum = 0;
+    for (size_t i = 0; i < counts.size(); i++) {
+      sum += counts[i];
+    }
+    if (sum == 0) {  // all-zero histogram: symbol 0 gets ANS_TAB_SIZE
+      counts[0] = ANS_TAB_SIZE;
+    }
+  }
+  if (method == 0) {  // flat code: `counts` is replaced by a flat histogram
+    counts = CreateFlatHistogram(alphabet_size, ANS_TAB_SIZE);
+    AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE];
+    InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a);
+    ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info);
+    if (writer != nullptr) {
+      EncodeFlatHistogram(alphabet_size, writer);
+    }
+    return cost;
+  }
+  int omit_pos = 0;
+  uint32_t shift = method - 1;  // nonzero methods encode shift + 1
+  JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size,
+                            ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols));
+  AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE];
+  InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a);
+  ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info);
+  if (writer != nullptr) {
+    bool ok = EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols,
+                           shift, symbols, writer);
+    (void)ok;
+    JXL_DASSERT(ok);
+  }
+  return cost;
+}
+
+float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size) {
+  constexpr auto kStrategy = HistogramParams::ANSHistogramStrategy::kFast;
+  float fast_cost;  // written by ComputeBestMethod through the out-pointer
+  (void)ComputeBestMethod(data, alphabet_size, &fast_cost, kStrategy);
+  return fast_cost;  // the selected method itself is deliberately dropped
+}
+
+template <typename Writer>
+void EncodeUintConfig(const HybridUintConfig uint_config, Writer* writer,
+                      size_t log_alpha_size) {
+  const auto split = uint_config.split_exponent;
+  const auto msb = uint_config.msb_in_token;
+  // The split exponent uses CeilLog2Nonzero(log_alpha_size + 1) bits.
+  writer->Write(CeilLog2Nonzero(log_alpha_size + 1), split);
+  // When it equals log_alpha_size, msb/lsb don't matter and are not written.
+  if (split == log_alpha_size) return;
+  const size_t msb_bits = CeilLog2Nonzero(split + 1);
+  writer->Write(msb_bits, msb);
+  const size_t lsb_bits = CeilLog2Nonzero(split - msb + 1);
+  writer->Write(lsb_bits, uint_config.lsb_in_token);
+}
+template <typename Writer>
+void EncodeUintConfigs(const std::vector<HybridUintConfig>& uint_config,
+                       Writer* writer, size_t log_alpha_size) {
+  // Configs are written back to back, in order. TODO(veluca): RLE?
+  for (const HybridUintConfig& config : uint_config) {
+    EncodeUintConfig(config, writer, log_alpha_size);
+  }
+}
+template void EncodeUintConfigs(const std::vector<HybridUintConfig>&,
+                                BitWriter*, size_t);
+
+namespace {
+
+void ChooseUintConfigs(const HistogramParams& params,
+                       const std::vector<std::vector<Token>>& tokens,
+                       const std::vector<uint8_t>& context_map,
+                       std::vector<Histogram>* clustered_histograms,
+                       EntropyEncodingData* codes, size_t* log_alpha_size) {
+  codes->uint_config.resize(clustered_histograms->size());
+  // Fixed methods return early: histograms and *log_alpha_size stay as is.
+  if (params.uint_method == HistogramParams::HybridUintMethod::kNone) return;
+  if (params.uint_method == HistogramParams::HybridUintMethod::k000) {
+    codes->uint_config.clear();
+    codes->uint_config.resize(clustered_histograms->size(),
+                              HybridUintConfig(0, 0, 0));
+    return;
+  }
+  if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) {
+    codes->uint_config.clear();
+    codes->uint_config.resize(clustered_histograms->size(),
+                              HybridUintConfig(2, 0, 1));
+    return;
+  }
+
+  // Brute-force method that tries a few options.
+  std::vector<HybridUintConfig> configs;
+  if (params.uint_method == HistogramParams::HybridUintMethod::kBest) {
+    configs = {
+        HybridUintConfig(4, 2, 0),  // default
+        HybridUintConfig(4, 1, 0),  // less precise
+        HybridUintConfig(4, 2, 1),  // add sign
+        HybridUintConfig(4, 2, 2),  // add sign+parity
+        HybridUintConfig(4, 1, 2),  // add parity but less msb
+        // Same as above, but more direct coding.
+        HybridUintConfig(5, 2, 0), HybridUintConfig(5, 1, 0),
+        HybridUintConfig(5, 2, 1), HybridUintConfig(5, 2, 2),
+        HybridUintConfig(5, 1, 2),
+        // Same as above, but less direct coding.
+        HybridUintConfig(3, 2, 0), HybridUintConfig(3, 1, 0),
+        HybridUintConfig(3, 2, 1), HybridUintConfig(3, 1, 2),
+        // For near-lossless.
+        HybridUintConfig(4, 1, 3), HybridUintConfig(5, 1, 4),
+        HybridUintConfig(5, 2, 3), HybridUintConfig(6, 1, 5),
+        HybridUintConfig(6, 2, 4), HybridUintConfig(6, 0, 0),
+        // Other
+        HybridUintConfig(0, 0, 0),   // varlenuint
+        HybridUintConfig(2, 0, 1),   // works well for ctx map
+        HybridUintConfig(7, 0, 0),   // direct coding
+        HybridUintConfig(8, 0, 0),   // direct coding
+        HybridUintConfig(9, 0, 0),   // direct coding
+        HybridUintConfig(10, 0, 0),  // direct coding
+        HybridUintConfig(11, 0, 0),  // direct coding
+        HybridUintConfig(12, 0, 0),  // direct coding
+    };
+  } else if (params.uint_method == HistogramParams::HybridUintMethod::kFast) {
+    configs = {
+        HybridUintConfig(4, 2, 0),  // default
+        HybridUintConfig(4, 1, 2),  // add parity but less msb
+        HybridUintConfig(0, 0, 0),  // smallest histograms
+        HybridUintConfig(2, 0, 1),  // works well for ctx map
+    };
+  }
+  // Best cost found so far per histogram, plus per-config scratch state.
+  std::vector<float> costs(clustered_histograms->size(),
+                           std::numeric_limits<float>::max());
+  std::vector<uint32_t> extra_bits(clustered_histograms->size());
+  std::vector<uint8_t> is_valid(clustered_histograms->size());
+  size_t max_alpha =
+      codes->use_prefix_code ? PREFIX_MAX_ALPHABET_SIZE : ANS_MAX_ALPHABET_SIZE;
+  for (HybridUintConfig cfg : configs) {
+    std::fill(is_valid.begin(), is_valid.end(), true);
+    std::fill(extra_bits.begin(), extra_bits.end(), 0);
+
+    for (size_t i = 0; i < clustered_histograms->size(); i++) {
+      (*clustered_histograms)[i].Clear();
+    }
+    for (size_t i = 0; i < tokens.size(); ++i) {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token token = tokens[i][j];
+        // TODO(veluca): do not ignore lz77 commands.
+        if (token.is_lz77_length) continue;
+        size_t histo = context_map[token.context];
+        uint32_t tok, nbits, bits;
+        cfg.Encode(token.value, &tok, &nbits, &bits);
+        if (tok >= max_alpha ||
+            (codes->lz77.enabled && tok >= codes->lz77.min_symbol)) {
+          is_valid[histo] = false;  // cfg cannot be used for this histogram
+          continue;
+        }
+        extra_bits[histo] += nbits;
+        (*clustered_histograms)[histo].Add(tok);
+      }
+    }
+    // Keep cfg wherever it is valid and strictly cheaper than the best so far.
+    for (size_t i = 0; i < clustered_histograms->size(); i++) {
+      if (!is_valid[i]) continue;
+      float cost = (*clustered_histograms)[i].PopulationCost() + extra_bits[i];
+      // add signaling cost of the hybriduintconfig itself
+      cost += CeilLog2Nonzero(cfg.split_exponent + 1);
+      cost += CeilLog2Nonzero(cfg.split_exponent - cfg.msb_in_token + 1);
+      if (cost < costs[i]) {
+        codes->uint_config[i] = cfg;
+        costs[i] = cost;
+      }
+    }
+  }
+
+  // Rebuild histograms with the chosen configs; lz77 lengths are included now.
+  for (size_t i = 0; i < clustered_histograms->size(); i++) {
+    (*clustered_histograms)[i].Clear();
+  }
+  *log_alpha_size = 4;  // raised below until every token is < 1 << value
+  for (size_t i = 0; i < tokens.size(); ++i) {
+    for (size_t j = 0; j < tokens[i].size(); ++j) {
+      const Token token = tokens[i][j];
+      uint32_t tok, nbits, bits;
+      size_t histo = context_map[token.context];
+      (token.is_lz77_length ? codes->lz77.length_uint_config
+                            : codes->uint_config[histo])
+          .Encode(token.value, &tok, &nbits, &bits);
+      tok += token.is_lz77_length ? codes->lz77.min_symbol : 0;
+      (*clustered_histograms)[histo].Add(tok);
+      while (tok >= (1u << *log_alpha_size)) (*log_alpha_size)++;
+    }
+  }
+#if JXL_ENABLE_ASSERT
+  size_t max_log_alpha_size = codes->use_prefix_code ? PREFIX_MAX_BITS : 8;
+  JXL_ASSERT(*log_alpha_size <= max_log_alpha_size);
+#endif
+}
+
+class HistogramBuilder {  // per-context histograms -> entropy codes
+ public:
+  explicit HistogramBuilder(const size_t num_contexts)
+      : histograms_(num_contexts) {}
+  // Adds `symbol` to the histogram with index `histo_idx`.
+  void VisitSymbol(int symbol, size_t histo_idx) {
+    JXL_DASSERT(histo_idx < histograms_.size());
+    histograms_[histo_idx].Add(symbol);
+  }
+  // Clusters histograms, picks uint configs, stores codes; returns the cost.
+  // NOTE: `layer` is only for clustered_entropy; caller does ReclaimAndCharge.
+  size_t BuildAndStoreEntropyCodes(
+      const HistogramParams& params,
+      const std::vector<std::vector<Token>>& tokens, EntropyEncodingData* codes,
+      std::vector<uint8_t>* context_map, bool use_prefix_code,
+      BitWriter* writer, size_t layer, AuxOut* aux_out) const {
+    size_t cost = 0;
+    codes->encoding_info.clear();
+    std::vector<Histogram> clustered_histograms(histograms_);
+    context_map->resize(histograms_.size());
+    if (histograms_.size() > 1) {  // a context map is only written here
+      if (!ans_fuzzer_friendly_) {
+        std::vector<uint32_t> histogram_symbols;
+        ClusterHistograms(params, histograms_, kClustersLimit,
+                          &clustered_histograms, &histogram_symbols);
+        for (size_t c = 0; c < histograms_.size(); ++c) {
+          (*context_map)[c] = static_cast<uint8_t>(histogram_symbols[c]);
+        }
+      } else {  // fuzzer-friendly: one shared cluster, each symbol added once
+        fill(context_map->begin(), context_map->end(), 0);
+        size_t max_symbol = 0;
+        for (const Histogram& h : histograms_) {
+          max_symbol = std::max(h.data_.size(), max_symbol);
+        }
+        size_t num_symbols = 1 << CeilLog2Nonzero(max_symbol + 1);
+        clustered_histograms.resize(1);
+        clustered_histograms[0].Clear();
+        for (size_t i = 0; i < num_symbols; i++) {
+          clustered_histograms[0].Add(i);
+        }
+      }
+      if (writer != nullptr) {
+        EncodeContextMap(*context_map, clustered_histograms.size(), writer,
+                         layer, aux_out);
+      }
+    }
+    if (aux_out != nullptr) {
+      for (size_t i = 0; i < clustered_histograms.size(); ++i) {
+        aux_out->layers[layer].clustered_entropy +=
+            clustered_histograms[i].ShannonEntropy();
+      }
+    }
+    codes->use_prefix_code = use_prefix_code;
+    size_t log_alpha_size = codes->lz77.enabled ? 8 : 7;  // Sane default.
+    if (ans_fuzzer_friendly_) {
+      codes->uint_config.clear();
+      codes->uint_config.resize(1, HybridUintConfig(7, 0, 0));
+    } else {
+      ChooseUintConfigs(params, tokens, *context_map, &clustered_histograms,
+                        codes, &log_alpha_size);
+    }
+    if (log_alpha_size < 5) log_alpha_size = 5;  // stored below as value - 5
+    SizeWriter size_writer;  // Used if writer == nullptr to estimate costs.
+    cost += 1;  // the use_prefix_code bit
+    if (writer) writer->Write(1, use_prefix_code);
+
+    if (use_prefix_code) {
+      log_alpha_size = PREFIX_MAX_BITS;
+    } else {
+      cost += 2;  // the 2-bit log_alpha_size field, ANS only
+    }
+    if (writer == nullptr) {
+      EncodeUintConfigs(codes->uint_config, &size_writer, log_alpha_size);
+    } else {
+      if (!use_prefix_code) writer->Write(2, log_alpha_size - 5);
+      EncodeUintConfigs(codes->uint_config, writer, log_alpha_size);
+    }
+    if (use_prefix_code) {
+      for (size_t c = 0; c < clustered_histograms.size(); ++c) {
+        size_t num_symbol = 1;  // index of last nonzero bin + 1, at least 1
+        for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) {
+          if (clustered_histograms[c].data_[i]) num_symbol = i + 1;
+        }
+        if (writer) {
+          StoreVarLenUint16(num_symbol - 1, writer);
+        } else {
+          StoreVarLenUint16(num_symbol - 1, &size_writer);
+        }
+      }
+    }
+    cost += size_writer.size;  // only written to when writer == nullptr
+    for (size_t c = 0; c < clustered_histograms.size(); ++c) {
+      size_t num_symbol = 1;
+      for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) {
+        if (clustered_histograms[c].data_[i]) num_symbol = i + 1;
+      }
+      codes->encoding_info.emplace_back();
+      codes->encoding_info.back().resize(std::max<size_t>(1, num_symbol));
+
+      BitWriter::Allotment allotment(writer, 256 + num_symbol * 24);
+      cost += BuildAndStoreANSEncodingData(
+          params.ans_histogram_strategy, clustered_histograms[c].data_.data(),
+          num_symbol, log_alpha_size, use_prefix_code,
+          codes->encoding_info.back().data(), writer);
+      allotment.FinishedHistogram(writer);
+      allotment.ReclaimAndCharge(writer, layer, aux_out);
+    }
+    return cost;
+  }
+
+  const Histogram& Histo(size_t i) const { return histograms_[i]; }
+
+ private:
+  std::vector<Histogram> histograms_;
+};
+
+class SymbolCostEstimator {  // per-context, per-symbol bit cost estimates
+ public:
+  SymbolCostEstimator(size_t num_contexts, bool force_huffman,
+                      const std::vector<std::vector<Token>>& tokens,
+                      const LZ77Params& lz77) {
+    HistogramBuilder builder(num_contexts);
+    // Build histograms for estimating lz77 savings.
+    HybridUintConfig uint_config;  // default config, used for non-length tokens
+    for (size_t i = 0; i < tokens.size(); ++i) {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token token = tokens[i][j];
+        uint32_t tok, nbits, bits;
+        (token.is_lz77_length ? lz77.length_uint_config : uint_config)
+            .Encode(token.value, &tok, &nbits, &bits);
+        tok += token.is_lz77_length ? lz77.min_symbol : 0;
+        builder.VisitSymbol(tok, token.context);
+      }
+    }
+    max_alphabet_size_ = 0;
+    for (size_t i = 0; i < num_contexts; i++) {
+      max_alphabet_size_ =
+          std::max(max_alphabet_size_, builder.Histo(i).data_.size());
+    }
+    bits_.resize(num_contexts * max_alphabet_size_);  // [context][symbol]
+    // TODO(veluca): SIMD?
+    add_symbol_cost_.resize(num_contexts);
+    for (size_t i = 0; i < num_contexts; i++) {
+      float inv_total = 1.0f / (builder.Histo(i).total_count_ + 1e-8f);
+      float total_cost = 0;
+      for (size_t j = 0; j < builder.Histo(i).data_.size(); j++) {
+        size_t cnt = builder.Histo(i).data_[j];
+        float cost = 0;  // stays 0 if this symbol holds the entire count
+        if (cnt != 0 && cnt != builder.Histo(i).total_count_) {
+          cost = -FastLog2f(cnt * inv_total);
+          if (force_huffman) cost = std::ceil(cost);  // whole bits only
+        } else if (cnt == 0) {
+          cost = ANS_LOG_TAB_SIZE;  // Highest possible cost.
+        }
+        bits_[i * max_alphabet_size_ + j] = cost;
+        total_cost += cost * builder.Histo(i).data_[j];
+      }
+      // Penalty for adding a lz77 symbol to this context (only used for static
+      // cost model). Higher penalty for contexts that have a very low
+      // per-symbol entropy.
+      add_symbol_cost_[i] = std::max(0.0f, 6.0f - total_cost * inv_total);
+    }
+  }
+  float Bits(size_t ctx, size_t sym) const {
+    return bits_[ctx * max_alphabet_size_ + sym];  // sym is not range-checked
+  }
+  float LenCost(size_t ctx, size_t len, const LZ77Params& lz77) const {
+    uint32_t nbits, bits, tok;
+    lz77.length_uint_config.Encode(len, &tok, &nbits, &bits);
+    tok += lz77.min_symbol;  // same offset as applied in the constructor
+    return nbits + Bits(ctx, tok);
+  }
+  float DistCost(size_t len, const LZ77Params& lz77) const {
+    uint32_t nbits, bits, tok;
+    HybridUintConfig().Encode(len, &tok, &nbits, &bits);
+    return nbits + Bits(lz77.nonserialized_distance_context, tok);
+  }
+  float AddSymbolCost(size_t idx) const { return add_symbol_cost_[idx]; }
+
+ private:
+  size_t max_alphabet_size_;  // row stride of bits_
+  std::vector<float> bits_;
+  std::vector<float> add_symbol_cost_;
+};
+
+void ApplyLZ77_RLE(const HistogramParams& params, size_t num_contexts,
+                   const std::vector<std::vector<Token>>& tokens,
+                   LZ77Params& lz77,
+                   std::vector<std::vector<Token>>& tokens_lz77) {
+  // Run-length flavour of LZ77. TODO(veluca): tune heuristics here.
+  SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77);
+  float bit_decrease = 0;  // summed estimated savings of the emitted runs
+  size_t total_symbols = 0;
+  tokens_lz77.resize(tokens.size());
+  std::vector<float> sym_cost;
+  HybridUintConfig uint_config;
+  for (size_t stream = 0; stream < tokens.size(); stream++) {
+    size_t distance_multiplier =
+        params.image_widths.size() > stream ? params.image_widths[stream] : 0;
+    const auto& in = tokens[stream];
+    auto& out = tokens_lz77[stream];
+    total_symbols += in.size();
+    // Cumulative sum of bit costs.
+    sym_cost.resize(in.size() + 1);
+    for (size_t i = 0; i < in.size(); i++) {
+      uint32_t tok, nbits, unused_bits;
+      uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits);
+      sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i];
+    }
+    out.reserve(in.size());
+    for (size_t i = 0; i < in.size(); i++) {
+      size_t num_to_copy = 0;
+      size_t distance_symbol = 0;  // 1 for RLE.
+      if (distance_multiplier != 0) {
+        distance_symbol = 1;  // Special distance 1 if enabled.
+        JXL_DASSERT(kSpecialDistances[1][0] == 1);
+        JXL_DASSERT(kSpecialDistances[1][1] == 0);
+      }
+      if (i > 0) {  // count upcoming values equal to in[i - 1].value
+        for (; i + num_to_copy < in.size(); num_to_copy++) {
+          if (in[i + num_to_copy].value != in[i - 1].value) {
+            break;
+          }
+        }
+      }
+      if (num_to_copy == 0) {  // no run: pass the token through
+        out.push_back(in[i]);
+        continue;
+      }
+      float cost = sym_cost[i + num_to_copy] - sym_cost[i];
+      // May wrap (unsigned), but is only used if num_to_copy >= min_length.
+      size_t lz77_len = num_to_copy - lz77.min_length;
+      float lz77_cost = num_to_copy >= lz77.min_length
+                            ? CeilLog2Nonzero(lz77_len + 1) + 1
+                            : 0;
+      if (num_to_copy < lz77.min_length || cost <= lz77_cost) {
+        for (size_t j = 0; j < num_to_copy; j++) {
+          out.push_back(in[i + j]);
+        }
+        i += num_to_copy - 1;  // the loop's i++ then steps past the run
+        continue;
+      }
+      // Output the LZ77 length
+      out.emplace_back(in[i].context, lz77_len);
+      out.back().is_lz77_length = true;
+      i += num_to_copy - 1;
+      bit_decrease += cost - lz77_cost;
+      // Output the LZ77 copy distance.
+      out.emplace_back(lz77.nonserialized_distance_context, distance_symbol);
+    }
+  }
+  // Enable lz77 only if the estimated saving clears a size-based threshold.
+  if (bit_decrease > total_symbols * 0.2 + 16) {
+    lz77.enabled = true;
+  }
+}
+
+// Hash chain over a window of token values, used for LZ77 matching.
+struct HashChain {
+  size_t size_;  // number of values in data_
+  std::vector<uint32_t> data_;  // token values, copied in the constructor
+
+  unsigned hash_num_values_ = 32768;
+  unsigned hash_mask_ = hash_num_values_ - 1;
+  unsigned hash_shift_ = 5;
+
+  std::vector<int> head;  // hash value -> latest window position, -1 if none
+  std::vector<uint32_t> chain;  // window position -> earlier one, same hash
+  std::vector<int> val;  // window position -> hash stored there, -1 if none
+
+  // Speed up repetitions of zero
+  std::vector<int> headz;  // zero count -> latest window position, -1 if none
+  std::vector<uint32_t> chainz;  // window position -> earlier one, same count
+  std::vector<uint32_t> zeros;  // window position -> zero count stored there
+  uint32_t numzeros = 0;  // zero count of the last Update()d position
+
+  size_t window_size_;
+  size_t window_mask_;  // window_size - 1
+  size_t min_length_;
+  size_t max_length_;
+
+  // Map of special distance codes.
+  std::unordered_map<int, int> special_dist_table_;
+  size_t num_special_distances_ = 0;
+
+  uint32_t maxchainlength = 256;  // window_size_ to allow all
+  // Copies the token values and marks every chain entry as uninitialized.
+  HashChain(const Token* data, size_t size, size_t window_size,
+            size_t min_length, size_t max_length, size_t distance_multiplier)
+      : size_(size),
+        window_size_(window_size),
+        window_mask_(window_size - 1),
+        min_length_(min_length),
+        max_length_(max_length) {
+    data_.resize(size);
+    for (size_t i = 0; i < size; i++) {
+      data_[i] = data[i].value;
+    }
+
+    head.resize(hash_num_values_, -1);
+    val.resize(window_size_, -1);
+    chain.resize(window_size_);
+    for (uint32_t i = 0; i < window_size_; ++i) {
+      chain[i] = i;  // same value as index indicates uninitialized
+    }
+
+    zeros.resize(window_size_);
+    headz.resize(window_size_ + 1, -1);
+    chainz.resize(window_size_);
+    for (uint32_t i = 0; i < window_size_; ++i) {
+      chainz[i] = i;
+    }
+    // Translate distance to special distance code.
+    if (distance_multiplier) {
+      // Count down, so if due to small distance multiplier multiple distances
+      // map to the same code, the smallest code will be used in the end.
+      for (int i = kNumSpecialDistances - 1; i >= 0; --i) {
+        int xi = kSpecialDistances[i][0];
+        int yi = kSpecialDistances[i][1];
+        int distance = yi * distance_multiplier + xi;
+        // Ensure that we map distance 1 to the lowest symbols.
+        if (distance < 1) distance = 1;
+        special_dist_table_[distance] = i;
+      }
+      num_special_distances_ = kNumSpecialDistances;
+    }
+  }
+  // Hashes the three values at `pos`, masked with hash_mask_; 0 near the end.
+  uint32_t GetHash(size_t pos) const {
+    uint32_t result = 0;
+    if (pos + 2 < size_) {
+      // TODO(lode): take the MSB's of the uint32_t values into account as well,
+      // given that the hash code itself is less than 32 bits.
+      result ^= (uint32_t)(data_[pos + 0] << 0u);
+      result ^= (uint32_t)(data_[pos + 1] << hash_shift_);
+      result ^= (uint32_t)(data_[pos + 2] << (hash_shift_ * 2));
+    } else {
+      // No need to compute hash of last 2 bytes, the length 2 is too short.
+      return 0;
+    }
+    return result & hash_mask_;
+  }
+  // Zero count at `pos`; derived from `prevzeros` without a scan if nonzero.
+  uint32_t CountZeros(size_t pos, uint32_t prevzeros) const {
+    size_t end = pos + window_size_;
+    if (end > size_) end = size_;
+    if (prevzeros > 0) {
+      if (prevzeros >= window_mask_ && data_[end - 1] == 0 &&
+          end == pos + window_size_) {
+        return prevzeros;
+      } else {
+        return prevzeros - 1;
+      }
+    }
+    uint32_t num = 0;
+    while (pos + num < end && data_[pos + num] == 0) num++;
+    return num;
+  }
+  // Records position `pos` in the hash chain and in the zero-count chain.
+  void Update(size_t pos) {
+    uint32_t hashval = GetHash(pos);
+    uint32_t wpos = pos & window_mask_;
+
+    val[wpos] = (int)hashval;
+    if (head[hashval] != -1) chain[wpos] = head[hashval];
+    head[hashval] = wpos;
+
+    if (pos > 0 && data_[pos] != data_[pos - 1]) numzeros = 0;
+    numzeros = CountZeros(pos, numzeros);
+
+    zeros[wpos] = numzeros;
+    if (headz[numzeros] != -1) chainz[wpos] = headz[numzeros];
+    headz[numzeros] = wpos;
+  }
+  // Records `len` consecutive positions starting at `pos`.
+  void Update(size_t pos, size_t len) {
+    for (size_t i = 0; i < len; i++) {
+      Update(pos + i);
+    }
+  }
+  // Calls found_match(len, dist_symbol) per candidate; `max_dist` is unused.
+  template <typename CB>
+  void FindMatches(size_t pos, int max_dist, const CB& found_match) const {
+    uint32_t wpos = pos & window_mask_;
+    uint32_t hashval = GetHash(pos);
+    uint32_t hashpos = chain[wpos];
+
+    int prev_dist = 0;
+    int end = std::min<int>(pos + max_length_, size_);
+    uint32_t chainlength = 0;
+    uint32_t best_len = 0;
+    for (;;) {
+      int dist = (hashpos <= wpos) ? (wpos - hashpos)
+                                   : (wpos - hashpos + window_mask_ + 1);
+      if (dist < prev_dist) break;
+      prev_dist = dist;
+      uint32_t len = 0;
+      if (dist > 0) {
+        int i = pos;
+        int j = pos - dist;
+        if (numzeros > 3) {
+          int r = std::min<int>(numzeros - 1, zeros[hashpos]);
+          if (i + r >= end) r = end - i - 1;
+          i += r;
+          j += r;
+        }
+        while (i < end && data_[i] == data_[j]) {
+          i++;
+          j++;
+        }
+        len = i - pos;
+        // This can trigger even if the new length is slightly smaller than the
+        // best length, because it is possible for a slightly cheaper distance
+        // symbol to occur.
+        if (len >= min_length_ && len + 2 >= best_len) {
+          auto it = special_dist_table_.find(dist);
+          int dist_symbol = (it == special_dist_table_.end())
+                                ? (num_special_distances_ + dist - 1)
+                                : it->second;
+          found_match(len, dist_symbol);
+          if (len > best_len) best_len = len;
+        }
+      }
+
+      chainlength++;
+      if (chainlength >= maxchainlength) break;
+
+      if (numzeros >= 3 && len > numzeros) {
+        if (hashpos == chainz[hashpos]) break;
+        hashpos = chainz[hashpos];
+        if (zeros[hashpos] != numzeros) break;
+      } else {
+        if (hashpos == chain[hashpos]) break;
+        hashpos = chain[hashpos];
+        if (val[hashpos] != (int)hashval) break;  // outdated hash value
+      }
+    }
+  }
+  void FindMatch(size_t pos, int max_dist, size_t* result_dist_symbol,
+                 size_t* result_len) const {
+    *result_dist_symbol = 0;
+    *result_len = 1;  // outputs keep these defaults if nothing beats them
+    FindMatches(pos, max_dist, [&](size_t len, size_t dist_symbol) {
+      if (len > *result_len ||  // longer wins; ties take the smaller symbol
+          (len == *result_len && *result_dist_symbol > dist_symbol)) {
+        *result_len = len;
+        *result_dist_symbol = dist_symbol;
+      }
+    });
+  }
+};
+
+float LenCost(size_t len) {
+  // Indexed by the HybridUintConfig(1, 0, 0) token, clamped to the last entry.
+  constexpr float kCostTable[] = {
+      2.797667318563126,  3.213177690381199,  2.5706009246743737,
+      2.408392498667534,  2.829649191872326,  3.3923087753324577,
+      4.029267451554331,  4.415576699706408,  4.509357574741465,
+      9.21481543803004,   10.020590190114898, 11.858671627804766,
+      12.45853300490526,  11.713105831990857, 12.561996324849314,
+      13.775477692278367, 13.174027068768641,
+  };
+  constexpr size_t kLastIndex = sizeof(kCostTable) / sizeof(kCostTable[0]) - 1;
+  uint32_t token, extra_bits, unused_bits;
+  HybridUintConfig(1, 0, 0).Encode(len, &token, &extra_bits, &unused_bits);
+  return kCostTable[std::min<size_t>(token, kLastIndex)] + extra_bits;
+}
+
+// TODO(veluca): this does not take into account usage or non-usage of distance
+// multipliers.
+float DistCost(size_t dist) {
+  // Indexed by the HybridUintConfig(7, 0, 0) token, clamped to the last entry.
+  constexpr float kCostTable[] = {
+      6.368282626312716,  5.680793277090298,  8.347404197105247,
+      7.641619201599141,  6.914328374119438,  7.959808291537444,
+      8.70023120759855,   8.71378518934703,   9.379132523982769,
+      9.110472749092708,  9.159029569270908,  9.430936766731973,
+      7.278284055315169,  7.8278514904267755, 10.026641158289236,
+      9.976049229827066,  9.64351607048908,   9.563403863480442,
+      10.171474111762747, 10.45950155077234,  9.994813912104219,
+      10.322524683741156, 8.465808729388186,  8.756254166066853,
+      10.160930174662234, 10.247329273413435, 10.04090403724809,
+      10.129398517544082, 9.342311691539546,  9.07608009102374,
+      10.104799540677513, 10.378079384990906, 10.165828974075072,
+      10.337595322341553, 7.940557464567944,  10.575665823319431,
+      11.023344321751955, 10.736144698831827, 11.118277044595054,
+      7.468468230648442,  10.738305230932939, 10.906980780216568,
+      10.163468216353817, 10.17805759656433,  11.167283670483565,
+      11.147050200274544, 10.517921919244333, 10.651764778156886,
+      10.17074446448919,  11.217636876224745, 11.261630721139484,
+      11.403140815247259, 10.892472096873417, 11.1859607804481,
+      8.017346947551262,  7.895143720278828,  11.036577113822025,
+      11.170562110315794, 10.326988722591086, 10.40872184751056,
+      11.213498225466386, 11.30580635516863,  10.672272515665442,
+      10.768069466228063, 11.145257364153565, 11.64668307145549,
+      10.593156194627339, 11.207499484844943, 10.767517766396908,
+      10.826629811407042, 10.737764794499988, 10.6200448518045,
+      10.191315385198092, 8.468384171390085,  11.731295299170432,
+      11.824619886654398, 10.41518844301179,  10.16310536548649,
+      10.539423685097576, 10.495136599328031, 10.469112847728267,
+      11.72057686174922,  10.910326337834674, 11.378921834673758,
+      11.847759036098536, 11.92071647623854,  10.810628276345282,
+      11.008601085273893, 11.910326337834674, 11.949212023423133,
+      11.298614839104337, 11.611603659010392, 10.472930394619985,
+      11.835564720850282, 11.523267392285337, 12.01055816679611,
+      8.413029688994023,  11.895784139536406, 11.984679534970505,
+      11.220654278717394, 11.716311684833672, 10.61036646226114,
+      10.89849965960364,  10.203762898863669, 10.997560826267238,
+      11.484217379438984, 11.792836176993665, 12.24310468755171,
+      11.464858097919262, 12.212747017409377, 11.425595666074955,
+      11.572048533398757, 12.742093965163013, 11.381874288645637,
+      12.191870445817015, 11.683156920035426, 11.152442115262197,
+      11.90303691580457,  11.653292787169159, 11.938615382266098,
+      16.970641701570223, 16.853602280380002, 17.26240782594733,
+      16.644655390108507, 17.14310889757499,  16.910935455445955,
+      17.505678976959697, 17.213498225466388, 2.4162310293553024,
+      3.494587244462329,  3.5258600986408344, 3.4959806589517095,
+      3.098390886949687,  3.343454654302911,  3.588847442290287,
+      4.14614790111827,   5.152948641990529,  7.433696808092598,
+      9.716311684833672,
+  };
+  constexpr size_t kLastIndex = sizeof(kCostTable) / sizeof(kCostTable[0]) - 1;
+  uint32_t token, extra_bits, unused_bits;
+  HybridUintConfig(7, 0, 0).Encode(dist, &token, &extra_bits, &unused_bits);
+  return kCostTable[std::min<size_t>(token, kLastIndex)] + extra_bits;
+}
+
+void ApplyLZ77_LZ77(const HistogramParams& params, size_t num_contexts,
+                    const std::vector<std::vector<Token>>& tokens,
+                    LZ77Params& lz77,
+                    std::vector<std::vector<Token>>& tokens_lz77) {
+  // Greedy LZ77 with one-step lazy matching. TODO(veluca): tune heuristics.
+  SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77);
+  float bit_decrease = 0;
+  size_t total_symbols = 0;
+  tokens_lz77.resize(tokens.size());
+  HybridUintConfig uint_config;
+  std::vector<float> sym_cost;
+  for (size_t stream = 0; stream < tokens.size(); stream++) {
+    size_t distance_multiplier =
+        params.image_widths.size() > stream ? params.image_widths[stream] : 0;
+    const auto& in = tokens[stream];
+    auto& out = tokens_lz77[stream];
+    total_symbols += in.size();
+    // sym_cost[k]: estimated bits to code in[0, k) as literals (prefix sum).
+    sym_cost.resize(in.size() + 1);
+    for (size_t i = 0; i < in.size(); i++) {
+      uint32_t tok, nbits, unused_bits;
+      uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits);
+      sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i];
+    }
+
+    out.reserve(in.size());
+    size_t max_distance = in.size();
+    size_t min_length = lz77.min_length;
+    JXL_ASSERT(min_length >= 3);
+    size_t max_length = in.size();
+
+    // Double the window until it reaches max_distance or kWindowSize.
+    size_t window_size = 1;
+    while (window_size < max_distance && window_size < kWindowSize) {
+      window_size <<= 1;
+    }
+
+    HashChain chain(in.data(), in.size(), window_size, min_length, max_length,
+                    distance_multiplier);
+    size_t len, dist_symbol;
+
+    const size_t max_lazy_match_len = 256;  // 0 to disable lazy matching
+
+    // Whether chain.Update() already ran for the next symbol (lazy matching).
+    bool already_updated = false;
+    for (size_t i = 0; i < in.size(); i++) {
+      out.push_back(in[i]);
+      if (!already_updated) chain.Update(i);
+      already_updated = false;
+      chain.FindMatch(i, max_distance, &dist_symbol, &len);
+      if (len >= min_length) {
+        if (len < max_lazy_match_len && i + 1 < in.size()) {
+          // Lazy matching: also look for a match starting at the next symbol.
+          chain.Update(i + 1);
+          already_updated = true;
+          size_t len2, dist_symbol2;
+          chain.FindMatch(i + 1, max_distance, &dist_symbol2, &len2);
+          if (len2 > len) {
+            // The later match is longer: keep the current symbol as a literal
+            // and start the match at the next symbol instead.
+            ++i;
+            already_updated = false;
+            len = len2;
+            dist_symbol = dist_symbol2;
+            out.push_back(in[i]);
+          }
+        }
+
+        float cost = sym_cost[i + len] - sym_cost[i];
+        size_t lz77_len = len - lz77.min_length;
+        float lz77_cost = LenCost(lz77_len) + DistCost(dist_symbol) +
+                          sce.AddSymbolCost(out.back().context);
+
+        if (lz77_cost <= cost) {
+          out.back().value = len - min_length;
+          out.back().is_lz77_length = true;
+          out.emplace_back(lz77.nonserialized_distance_context, dist_symbol);
+          bit_decrease += cost - lz77_cost;
+        } else {
+          // LZ77 match ignored, and symbol already pushed. Push all other
+          // symbols and skip.
+          for (size_t j = 1; j < len; j++) {
+            out.push_back(in[i + j]);
+          }
+        }
+
+        if (already_updated) {
+          chain.Update(i + 2, len - 2);
+          already_updated = false;
+        } else {
+          chain.Update(i + 1, len - 1);
+        }
+        i += len - 1;
+      } else {
+        // Literal, already pushed
+      }
+    }
+  }
+
+  if (bit_decrease > total_symbols * 0.2 + 16) {
+    lz77.enabled = true;
+  }
+}
+
+void ApplyLZ77_Optimal(const HistogramParams& params, size_t num_contexts,
+                       const std::vector<std::vector<Token>>& tokens,
+                       LZ77Params& lz77,
+                       std::vector<std::vector<Token>>& tokens_lz77) {
+  std::vector<std::vector<Token>> tokens_for_cost_estimate;
+  ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_for_cost_estimate);
+  // If greedy-LZ77 does not give better compression than no-lz77, no reason to
+  // run the optimal matching.
+  if (!lz77.enabled) return;
+  SymbolCostEstimator sce(num_contexts + 1, params.force_huffman,
+                          tokens_for_cost_estimate, lz77);
+  tokens_lz77.resize(tokens.size());
+  HybridUintConfig uint_config;
+  std::vector<float> sym_cost;
+  std::vector<uint32_t> dist_symbols;
+  for (size_t stream = 0; stream < tokens.size(); stream++) {
+    size_t distance_multiplier =
+        params.image_widths.size() > stream ? params.image_widths[stream] : 0;
+    const auto& in = tokens[stream];
+    auto& out = tokens_lz77[stream];
+    // sym_cost[k]: estimated bits to code in[0, k) as literals (prefix sum).
+    sym_cost.resize(in.size() + 1);
+    for (size_t i = 0; i < in.size(); i++) {
+      uint32_t tok, nbits, unused_bits;
+      uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits);
+      sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i];
+    }
+
+    out.reserve(in.size());
+    size_t max_distance = in.size();
+    size_t min_length = lz77.min_length;
+    JXL_ASSERT(min_length >= 3);
+    size_t max_length = in.size();
+
+    // Double the window until it reaches max_distance or kWindowSize.
+    size_t window_size = 1;
+    while (window_size < max_distance && window_size < kWindowSize) {
+      window_size <<= 1;
+    }
+
+    HashChain chain(in.data(), in.size(), window_size, min_length, max_length,
+                    distance_multiplier);
+
+    struct MatchInfo {
+      uint32_t len;
+      uint32_t dist_symbol;
+      uint32_t ctx;
+      float total_cost = std::numeric_limits<float>::max();
+    };
+    // prefix_costs[n]: best way found so far to encode the first n symbols.
+    std::vector<MatchInfo> prefix_costs(in.size() + 1);
+    prefix_costs[0].total_cost = 0;
+
+    size_t rle_length = 0;
+    size_t skip_lz77 = 0;
+    for (size_t i = 0; i < in.size(); i++) {
+      chain.Update(i);
+      float lit_cost =
+          prefix_costs[i].total_cost + sym_cost[i + 1] - sym_cost[i];
+      if (prefix_costs[i + 1].total_cost > lit_cost) {
+        prefix_costs[i + 1].dist_symbol = 0;
+        prefix_costs[i + 1].len = 1;
+        prefix_costs[i + 1].ctx = in[i].context;
+        prefix_costs[i + 1].total_cost = lit_cost;
+      }
+      if (skip_lz77 > 0) {
+        skip_lz77--;
+        continue;
+      }
+      dist_symbols.clear();
+      chain.FindMatches(i, max_distance,
+                        [&dist_symbols](size_t len, size_t dist_symbol) {
+                          if (dist_symbols.size() <= len) {
+                            dist_symbols.resize(len + 1, dist_symbol);
+                          }
+                          if (dist_symbol < dist_symbols[len]) {
+                            dist_symbols[len] = dist_symbol;
+                          }
+                        });
+      if (dist_symbols.size() <= min_length) continue;
+      {  // Make dist_symbols[j] the smallest symbol over all lengths >= j.
+        size_t best_cost = dist_symbols.back();
+        for (size_t j = dist_symbols.size() - 1; j >= min_length; j--) {
+          if (dist_symbols[j] < best_cost) {
+            best_cost = dist_symbols[j];
+          }
+          dist_symbols[j] = best_cost;
+        }
+      }
+      for (size_t j = min_length; j < dist_symbols.size(); j++) {
+        // Cost model that uses results from lazy LZ77.
+        float lz77_cost = sce.LenCost(in[i].context, j - min_length, lz77) +
+                          sce.DistCost(dist_symbols[j], lz77);
+        float cost = prefix_costs[i].total_cost + lz77_cost;
+        if (prefix_costs[i + j].total_cost > cost) {
+          prefix_costs[i + j].len = j;
+          prefix_costs[i + j].dist_symbol = dist_symbols[j] + 1;
+          prefix_costs[i + j].ctx = in[i].context;
+          prefix_costs[i + j].total_cost = cost;
+        }
+      }
+      // We are in a RLE sequence: skip all the symbols except the first 8 and
+      // the last 8. This avoids quadratic costs for sequences with long runs of
+      // the same symbol.
+      if ((dist_symbols.back() == 0 && distance_multiplier == 0) ||
+          (dist_symbols.back() == 1 && distance_multiplier != 0)) {
+        rle_length++;
+      } else {
+        rle_length = 0;
+      }
+      if (rle_length >= 8 && dist_symbols.size() > 9) {
+        skip_lz77 = dist_symbols.size() - 10;
+        rle_length = 0;
+      }
+    }
+    size_t pos = in.size();  // Backtrack the chosen steps from the end.
+    while (pos > 0) {
+      bool is_lz77_length = prefix_costs[pos].dist_symbol != 0;
+      if (is_lz77_length) {
+        size_t dist_symbol = prefix_costs[pos].dist_symbol - 1;
+        out.emplace_back(lz77.nonserialized_distance_context, dist_symbol);
+      }
+      size_t val = is_lz77_length ? prefix_costs[pos].len - min_length
+                                  : in[pos - 1].value;
+      out.emplace_back(prefix_costs[pos].ctx, val);
+      out.back().is_lz77_length = is_lz77_length;
+      pos -= prefix_costs[pos].len;
+    }
+    std::reverse(out.begin(), out.end());
+  }
+}
+
+void ApplyLZ77(const HistogramParams& params, size_t num_contexts,
+               const std::vector<std::vector<Token>>& tokens, LZ77Params& lz77,
+               std::vector<std::vector<Token>>& tokens_lz77) {
+  // Runs the LZ77 variant selected by params.lz77_method. LZ77 starts out
+  // disabled; only the selected variant may switch it on.
+  using Method = HistogramParams::LZ77Method;
+  const Method method = params.lz77_method;
+  lz77.enabled = false;
+  lz77.min_symbol =
+      params.force_huffman ? std::min(PREFIX_MAX_ALPHABET_SIZE - 32, 512) : 224;
+  if (method == Method::kNone) return;
+  if (method == Method::kRLE) {
+    ApplyLZ77_RLE(params, num_contexts, tokens, lz77, tokens_lz77);
+  } else if (method == Method::kLZ77) {
+    ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_lz77);
+  } else if (method == Method::kOptimal) {
+    ApplyLZ77_Optimal(params, num_contexts, tokens, lz77, tokens_lz77);
+  } else {
+    JXL_ABORT("Not implemented");
+  }
+}
+}  // namespace
+
+size_t BuildAndEncodeHistograms(const HistogramParams& params,
+                                size_t num_contexts,
+                                std::vector<std::vector<Token>>& tokens,
+                                EntropyEncodingData* codes,
+                                std::vector<uint8_t>* context_map,
+                                BitWriter* writer, size_t layer,
+                                AuxOut* aux_out) {
+  // Runs the optional LZ77 pass, gathers per-context histograms of the
+  // resulting tokens and stores the entropy codes. With a null `writer` the
+  // LZ77 parameters are only measured, not written. Returns the accumulated
+  // bit count.
+  size_t total_bits = 0;
+  // LZ77 distance tokens use the context right after the regular contexts.
+  codes->lz77.nonserialized_distance_context = num_contexts;
+  std::vector<std::vector<Token>> tokens_lz77;
+  ApplyLZ77(params, num_contexts, tokens, codes->lz77, tokens_lz77);
+  if (ans_fuzzer_friendly_) {
+    codes->lz77.length_uint_config = HybridUintConfig(10, 0, 0);
+    codes->lz77.min_symbol = 2048;
+  }
+
+  const size_t max_contexts = std::min(num_contexts, kClustersLimit);
+  BitWriter::Allotment allotment(writer,
+                                 128 + num_contexts * 40 + max_contexts * 96);
+  // LZ77 parameters: written for real, or just sized without a writer.
+  if (writer == nullptr) {
+    size_t ebits, bits;
+    JXL_CHECK(Bundle::CanEncode(codes->lz77, &ebits, &bits));
+    total_bits += bits;
+  } else {
+    JXL_CHECK(Bundle::Write(codes->lz77, writer, layer, aux_out));
+  }
+
+  if (codes->lz77.enabled) {
+    // Same split for the hybrid-uint config of the LZ77 lengths.
+    if (writer == nullptr) {
+      SizeWriter size_writer;
+      EncodeUintConfig(codes->lz77.length_uint_config, &size_writer,
+                       /*log_alpha_size=*/8);
+      total_bits += size_writer.size;
+    } else {
+      const size_t bits_before = writer->BitsWritten();
+      EncodeUintConfig(codes->lz77.length_uint_config, writer,
+                       /*log_alpha_size=*/8);
+      total_bits += writer->BitsWritten() - bits_before;
+    }
+    // From here on work on the LZ77 tokens, with one more context.
+    num_contexts += 1;
+    tokens = std::move(tokens_lz77);
+  }
+
+  // Default config for clustering, unless overridden below.
+  HybridUintConfig uint_config;
+  if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) {
+    uint_config = HybridUintConfig(2, 0, 1);
+  } else if (params.uint_method == HistogramParams::HybridUintMethod::k000) {
+    uint_config = HybridUintConfig(0, 0, 0);
+  }
+  if (ans_fuzzer_friendly_) {
+    uint_config = HybridUintConfig(10, 0, 0);
+  }
+
+  // Build histograms: every token is visited once with its symbol and context.
+  HistogramBuilder builder(num_contexts);
+  const bool lz77_enabled = codes->lz77.enabled;
+  // Without LZ77 and with a single context, all symbols go to histogram 0.
+  const bool force_context_zero = !lz77_enabled && num_contexts == 1;
+  size_t total_tokens = 0;
+  for (const std::vector<Token>& stream : tokens) {
+    for (const Token& token : stream) {
+      total_tokens++;
+      // The is_lz77_length flag is only consulted when LZ77 is enabled.
+      const bool is_length = lz77_enabled && token.is_lz77_length;
+      uint32_t tok, nbits, bits;
+      (is_length ? codes->lz77.length_uint_config : uint_config)
+          .Encode(token.value, &tok, &nbits, &bits);
+      if (is_length) tok += codes->lz77.min_symbol;
+      if (force_context_zero) {
+        builder.VisitSymbol(tok, /*token.context=*/0);
+      } else {
+        builder.VisitSymbol(tok, token.context);
+      }
+    }
+  }
+
+  // Prefix codes are selected when forced, for fewer than 100 tokens, for the
+  // fastest clustering, and in fuzzer-friendly mode.
+  bool use_prefix_code =
+      params.force_huffman || total_tokens < 100 ||
+      params.clustering == HistogramParams::ClusteringType::kFastest ||
+      ans_fuzzer_friendly_;
+  if (!use_prefix_code) {
+    // Also use them when every histogram has (almost) zero entropy.
+    size_t num_nontrivial = 0;
+    for (size_t ctx = 0; ctx < num_contexts; ctx++) {
+      if (builder.Histo(ctx).ShannonEntropy() >= 1e-5) num_nontrivial++;
+    }
+    use_prefix_code = (num_nontrivial == 0);
+  }
+
+  // Encode histograms.
+  total_bits += builder.BuildAndStoreEntropyCodes(params, tokens, codes,
+                                                  context_map, use_prefix_code,
+                                                  writer, layer, aux_out);
+  allotment.FinishedHistogram(writer);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+
+  // Record the number of clustered histograms when statistics are collected.
+  if (aux_out != nullptr) {
+    aux_out->layers[layer].num_clustered_histograms +=
+        codes->encoding_info.size();
+  }
+  return total_bits;
+}
+
+size_t WriteTokens(const std::vector<Token>& tokens,
+                   const EntropyEncodingData& codes,
+                   const std::vector<uint8_t>& context_map, BitWriter* writer) {
+  // Returns the number of raw (non-entropy-coded) bits emitted for `tokens`.
+  size_t num_extra_bits = 0;
+  if (codes.use_prefix_code) {
+    for (size_t i = 0; i < tokens.size(); i++) {
+      uint32_t tok, nbits, bits;
+      const Token& token = tokens[i];
+      size_t histo = context_map[token.context];
+      (token.is_lz77_length ? codes.lz77.length_uint_config
+                            : codes.uint_config[histo])
+          .Encode(token.value, &tok, &nbits, &bits);
+      tok += token.is_lz77_length ? codes.lz77.min_symbol : 0;
+      // Emit the prefix code (`depth` bits) and the `nbits` raw bits with one
+      // Write() call. `bits` is widened to 64 bits before shifting: depth +
+      // nbits can exceed 32, and a 32-bit shift would drop the high raw bits.
+      const ANSEncSymbolInfo& info = codes.encoding_info[histo][tok];
+      uint64_t data = info.bits | (static_cast<uint64_t>(bits) << info.depth);
+      writer->Write(info.depth + nbits, data);
+      num_extra_bits += nbits;
+    }
+    return num_extra_bits;
+  }
+  std::vector<uint64_t> out;
+  std::vector<uint8_t> out_nbits;
+  out.reserve(tokens.size());
+  out_nbits.reserve(tokens.size());
+  uint64_t allbits = 0;
+  size_t numallbits = 0;
+  // Writes in *reversed* order.
+  auto addbits = [&](size_t bits, size_t nbits) {
+    if (JXL_UNLIKELY(nbits)) {
+      JXL_DASSERT(bits >> nbits == 0);
+      if (JXL_UNLIKELY(numallbits + nbits > BitWriter::kMaxBitsPerCall)) {
+        out.push_back(allbits);
+        out_nbits.push_back(numallbits);
+        numallbits = allbits = 0;
+      }
+      allbits <<= nbits;
+      allbits |= bits;
+      numallbits += nbits;
+    }
+  };
+  const int end = tokens.size();
+  ANSCoder ans;
+  if (codes.lz77.enabled || context_map.size() > 1) {
+    for (int i = end - 1; i >= 0; --i) {
+      const Token token = tokens[i];
+      const uint8_t histo = context_map[token.context];
+      uint32_t tok, nbits, bits;
+      (token.is_lz77_length ? codes.lz77.length_uint_config
+                            : codes.uint_config[histo])
+          .Encode(tokens[i].value, &tok, &nbits, &bits);
+      tok += token.is_lz77_length ? codes.lz77.min_symbol : 0;
+      const ANSEncSymbolInfo& info = codes.encoding_info[histo][tok];
+      // Extra bits first as this is reversed.
+      addbits(bits, nbits);
+      num_extra_bits += nbits;
+      uint8_t ans_nbits = 0;
+      uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits);
+      addbits(ans_bits, ans_nbits);
+    }
+  } else {
+    for (int i = end - 1; i >= 0; --i) {
+      uint32_t tok, nbits, bits;
+      codes.uint_config[0].Encode(tokens[i].value, &tok, &nbits, &bits);
+      const ANSEncSymbolInfo& info = codes.encoding_info[0][tok];
+      // Extra bits first as this is reversed.
+      addbits(bits, nbits);
+      num_extra_bits += nbits;
+      uint8_t ans_nbits = 0;
+      uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits);
+      addbits(ans_bits, ans_nbits);
+    }
+  }
+  const uint32_t state = ans.GetState();
+  writer->Write(32, state);
+  writer->Write(numallbits, allbits);
+  for (int i = out.size(); i > 0; --i) {
+    writer->Write(out_nbits[i - 1], out[i - 1]);
+  }
+  return num_extra_bits;
+}
+
+void WriteTokens(const std::vector<Token>& tokens,
+                 const EntropyEncodingData& codes,
+                 const std::vector<uint8_t>& context_map, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out) {
+  // Reserve an allotment, write the tokens, then reclaim and charge it.
+  BitWriter::Allotment allotment(writer, 32 * tokens.size() + 32 * 1024 * 4);
+  const size_t extra = WriteTokens(tokens, codes, context_map, writer);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  if (aux_out == nullptr) return;
+  aux_out->layers[layer].extra_bits += extra;
+}
+
+void SetANSFuzzerFriendly(bool ans_fuzzer_friendly) {
+#if JXL_IS_DEBUG_BUILD  // Guard against accidental / malicious changes.
+  ans_fuzzer_friendly_ = ans_fuzzer_friendly;
+#endif  // JXL_IS_DEBUG_BUILD; the call has no effect in other builds.
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans.h b/third_party/jpeg-xl/lib/jxl/enc_ans.h
new file mode 100644
index 0000000000..a4afb19b4e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ans.h
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ANS_H_
+#define LIB_JXL_ENC_ANS_H_
+
+// Library to encode the ANS population counts to the bit-stream and encode
+// symbols based on the respective distributions.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_ans_params.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/huffman_table.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+#define USE_MULT_BY_RECIPROCAL
+
+// precision must be equal to:  #bits(state_) + #bits(freq)
+#define RECIPROCAL_PRECISION (32 + ANS_LOG_TAB_SIZE)
+
+// Data structure representing one element of the encoding table built
+// from a distribution.
+// TODO(veluca): split this up, or use an union.
+struct ANSEncSymbolInfo {
+  // ANS
+  uint16_t freq_;  // Symbol frequency; PutSymbol() divides the state by it.
+  std::vector<uint16_t> reverse_map_;  // Indexed by (state mod freq_).
+#ifdef USE_MULT_BY_RECIPROCAL
+  uint64_t ifreq_;  // Multiplier standing in for the division by freq_.
+#endif
+  // Prefix coding.
+  uint8_t depth;  // Code length in bits.
+  uint16_t bits;  // Code bits.
+};
+
+class ANSCoder {
+ public:
+  ANSCoder() : state_(ANS_SIGNATURE << 16) {}
+  // Adds symbol `t`; returns the bits to output, *nbits of them (0 or 16).
+  uint32_t PutSymbol(const ANSEncSymbolInfo& t, uint8_t* nbits) {
+    uint32_t bits = 0;
+    *nbits = 0;
+    if ((state_ >> (32 - ANS_LOG_TAB_SIZE)) >= t.freq_) {  // Flush 16 bits.
+      bits = state_ & 0xffff;
+      state_ >>= 16;
+      *nbits = 16;
+    }
+#ifdef USE_MULT_BY_RECIPROCAL
+    // state_ / freq_ via multiplication by ifreq_; needs 64-bit arithmetic.
+    const uint32_t v = (state_ * t.ifreq_) >> RECIPROCAL_PRECISION;
+    const uint32_t offset = t.reverse_map_[state_ - v * t.freq_];
+    state_ = (v << ANS_LOG_TAB_SIZE) + offset;
+#else
+    state_ = ((state_ / t.freq_) << ANS_LOG_TAB_SIZE) +
+             t.reverse_map_[state_ % t.freq_];
+#endif
+    return bits;
+  }
+
+  uint32_t GetState() const { return state_; }
+
+ private:
+  uint32_t state_;
+};
+
+// RebalanceHistogram requires a signed type.
+using ANSHistBin = int32_t;
+
+struct EntropyEncodingData {
+  std::vector<std::vector<ANSEncSymbolInfo>> encoding_info;  // [histo][token]
+  bool use_prefix_code;  // Prefix codes instead of ANS (see WriteTokens).
+  std::vector<HybridUintConfig> uint_config;  // Indexed by histogram.
+  LZ77Params lz77;
+};
+
+// Integer to be encoded by an entropy coder, either ANS or Huffman.
+struct Token {
+  Token() {}  // Leaves all fields uninitialized.
+  Token(uint32_t c, uint32_t value)
+      : is_lz77_length(false), context(c), value(value) {}
+  uint32_t is_lz77_length : 1;  // Set when `value` is an LZ77 length.
+  uint32_t context : 31;
+  uint32_t value;
+};
+
+// Returns an estimate of the number of bits required to encode the given
+// histogram (header bits plus data bits).
+float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size);
+
+// Apply context clustering, compute histograms and encode them. Returns an
+// estimate of the total bits used for encoding the stream. If `writer` ==
+// nullptr, the bit estimate will not take into account the context map (which
+// does not get written if `num_contexts` == 1).
+size_t BuildAndEncodeHistograms(const HistogramParams& params,
+                                size_t num_contexts,
+                                std::vector<std::vector<Token>>& tokens,
+                                EntropyEncodingData* codes,
+                                std::vector<uint8_t>* context_map,
+                                BitWriter* writer, size_t layer,
+                                AuxOut* aux_out);
+
+// Writes the tokens to `writer`; the allotment is charged to `layer`.
+void WriteTokens(const std::vector<Token>& tokens,
+                 const EntropyEncodingData& codes,
+                 const std::vector<uint8_t>& context_map, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out);
+
+// Same as above, but assumes allotment created by caller.
+size_t WriteTokens(const std::vector<Token>& tokens,
+                   const EntropyEncodingData& codes,
+                   const std::vector<uint8_t>& context_map, BitWriter* writer);
+
+// Exposed for tests; to be used with Writer=BitWriter only.
+template <typename Writer>
+void EncodeUintConfigs(const std::vector<HybridUintConfig>& uint_config,
+                       Writer* writer, size_t log_alpha_size);
+extern template void EncodeUintConfigs(const std::vector<HybridUintConfig>&,
+                                       BitWriter*, size_t);
+
+// Globally set the option to create fuzzer-friendly ANS streams. Negatively
+// impacts compression. Not thread-safe.
+void SetANSFuzzerFriendly(bool ans_fuzzer_friendly);
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ANS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans_params.h b/third_party/jpeg-xl/lib/jxl/enc_ans_params.h
new file mode 100644
index 0000000000..50ca31dc03
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ans_params.h
@@ -0,0 +1,76 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ANS_PARAMS_H_
+#define LIB_JXL_ENC_ANS_PARAMS_H_
+
+// Encoder-only parameter needed for ANS entropy encoding methods.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct HistogramParams {
+  enum class ClusteringType {
+    kFastest,  // Only 4 clusters.
+    kFast,
+    kBest,
+  };
+
+  enum class HybridUintMethod {
+    kNone,        // just use kHybridUint420Config.
+    k000,         // force the fastest option.
+    kFast,        // just try a couple of options.
+    kContextMap,  // fast choice for ctx map.
+    kBest,
+  };
+
+  enum class LZ77Method {
+    kNone,     // do not try lz77.
+    kRLE,      // only try doing RLE.
+    kLZ77,     // try lz77 with backward references.
+    kOptimal,  // optimal-matching LZ77 parsing.
+  };
+
+  enum class ANSHistogramStrategy {
+    kFast,         // Only try some methods, early exit.
+    kApproximate,  // Only try some methods.
+    kPrecise,      // Try all methods.
+  };
+
+  HistogramParams() = default;
+
+  HistogramParams(SpeedTier tier, size_t num_ctx) {  // num_ctx is unused.
+    if (tier > SpeedTier::kFalcon) {
+      clustering = ClusteringType::kFastest;
+      lz77_method = LZ77Method::kNone;
+    } else if (tier > SpeedTier::kTortoise) {
+      clustering = ClusteringType::kFast;
+    } else {
+      clustering = ClusteringType::kBest;
+    }
+    if (tier > SpeedTier::kTortoise) {
+      uint_method = HybridUintMethod::kNone;
+    }
+    if (tier >= SpeedTier::kSquirrel) {
+      ans_histogram_strategy = ANSHistogramStrategy::kApproximate;
+    }
+  }
+
+  ClusteringType clustering = ClusteringType::kBest;
+  HybridUintMethod uint_method = HybridUintMethod::kBest;
+  LZ77Method lz77_method = LZ77Method::kRLE;
+  ANSHistogramStrategy ans_histogram_strategy = ANSHistogramStrategy::kPrecise;
+  std::vector<size_t> image_widths;  // Per-stream LZ77 distance multipliers.
+  size_t max_histograms = ~0;  // Converts to the largest size_t value.
+  bool force_huffman = false;  // Forces prefix codes when true.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ANS_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
new file mode 100644
index 0000000000..9030430e2b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
@@ -0,0 +1,325 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ar_control_field.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <algorithm>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_ar_control_field.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state,
+                 const Rect& rect,
+                 ArControlFieldHeuristics::TempImages* temp_image) {
+  constexpr size_t N = kBlockDim;
+  ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness;
+  ImageF* JXL_RESTRICT quant = &enc_state->initial_quant_field;
+  JXL_ASSERT(
+      epf_sharpness->xsize() == enc_state->shared.frame_dim.xsize_blocks &&
+      epf_sharpness->ysize() == enc_state->shared.frame_dim.ysize_blocks);
+  // Fast path: store the constant 4 for the whole rect and skip the analysis.
+  if (enc_state->cparams.butteraugli_distance < kMinButteraugliForDynamicAR ||
+      enc_state->cparams.speed_tier > SpeedTier::kWombat ||
+      enc_state->shared.frame_header.loop_filter.epf_iters == 0) {
+    FillPlane(static_cast<uint8_t>(4), epf_sharpness, rect);
+    return;
+  }
+
+  // Likely better to have a higher X weight, like:
+  // const float kChannelWeights[3] = {47.0f, 4.35f, 0.287f};
+  const float kChannelWeights[3] = {4.35f, 4.35f, 0.287f};
+  const float kChannelWeightsLapNeg[3] = {-0.125f * kChannelWeights[0],
+                                          -0.125f * kChannelWeights[1],
+                                          -0.125f * kChannelWeights[2]};
+  const size_t sharpness_stride =
+      static_cast<size_t>(epf_sharpness->PixelsPerRow());
+  // Tile bounds in block units; multiplied by N below to get pixel positions.
+  size_t by0 = rect.y0();
+  size_t by1 = rect.y0() + rect.ysize();
+  size_t bx0 = rect.x0();
+  size_t bx1 = rect.x0() + rect.xsize();
+  temp_image->InitOnce();
+  ImageF& laplacian_sqrsum = temp_image->laplacian_sqrsum;
+  // Calculate the L2 of the 3x3 Laplacian in an integral transform
+  // (for example 32x32 dct). This relates to the transform's ability
+  // to propagate artefacts.
+  size_t y0 = by0 == 0 ? 2 : 0;
+  size_t y1 = by1 * N + 4 <= opsin.ysize() + 2 ? (by1 - by0) * N + 4
+                                               : opsin.ysize() + 2 - by0 * N;
+  size_t x0 = bx0 == 0 ? 2 : 0;
+  size_t x1 = bx1 * N + 4 <= opsin.xsize() + 2 ? (bx1 - bx0) * N + 4
+                                               : opsin.xsize() + 2 - bx0 * N;
+  HWY_FULL(float) df;
+  for (size_t y = y0; y < y1; y++) {
+    float* JXL_RESTRICT laplacian_sqrsum_row = laplacian_sqrsum.Row(y);
+    size_t cy = y + by0 * N - 2;  // Row of opsin that buffer row y maps to.
+    const float* JXL_RESTRICT in_row_t[3];
+    const float* JXL_RESTRICT in_row[3];
+    const float* JXL_RESTRICT in_row_b[3];
+    for (size_t c = 0; c < 3; c++) {
+      in_row_t[c] = opsin.PlaneRow(c, cy > 0 ? cy - 1 : cy);
+      in_row[c] = opsin.PlaneRow(c, cy);
+      in_row_b[c] = opsin.PlaneRow(c, cy + 1 < opsin.ysize() ? cy + 1 : cy);
+    }
+    auto compute_laplacian_scalar = [&](size_t x) {
+      size_t cx = x + bx0 * N - 2;  // Column of opsin for buffer column x.
+      const size_t prevX = cx >= 1 ? cx - 1 : cx;  // Clamped at the left edge.
+      const size_t nextX = cx + 1 < opsin.xsize() ? cx + 1 : cx;
+      float sumsqr = 0;
+      for (size_t c = 0; c < 3; c++) {
+        float laplacian =
+            kChannelWeights[c] * in_row[c][cx] +
+            kChannelWeightsLapNeg[c] *
+                (in_row[c][prevX] + in_row[c][nextX] + in_row_b[c][prevX] +
+                 in_row_b[c][cx] + in_row_b[c][nextX] + in_row_t[c][prevX] +
+                 in_row_t[c][cx] + in_row_t[c][nextX]);
+        sumsqr += laplacian * laplacian;
+      }
+      laplacian_sqrsum_row[x] = sumsqr;
+    };
+    size_t x = x0;
+    for (; x + bx0 * N < 3; x++) {  // cx == 0: no left neighbour to load.
+      compute_laplacian_scalar(x);
+    }
+    // Interior. One extra pixel of border as the last pixel is special.
+    for (; x + Lanes(df) <= x1 && x + Lanes(df) + bx0 * N - 1 <= opsin.xsize();
+         x += Lanes(df)) {
+      size_t cx = x + bx0 * N - 2;
+      auto sumsqr = Zero(df);
+      for (size_t c = 0; c < 3; c++) {
+        auto laplacian =
+            Mul(LoadU(df, in_row[c] + cx), Set(df, kChannelWeights[c]));
+        auto sum_oth0 = LoadU(df, in_row[c] + cx - 1);
+        auto sum_oth1 = LoadU(df, in_row[c] + cx + 1);
+        auto sum_oth2 = LoadU(df, in_row_t[c] + cx - 1);
+        auto sum_oth3 = LoadU(df, in_row_t[c] + cx);
+        sum_oth0 = Add(sum_oth0, LoadU(df, in_row_t[c] + cx + 1));
+        sum_oth1 = Add(sum_oth1, LoadU(df, in_row_b[c] + cx - 1));
+        sum_oth2 = Add(sum_oth2, LoadU(df, in_row_b[c] + cx));
+        sum_oth3 = Add(sum_oth3, LoadU(df, in_row_b[c] + cx + 1));
+        sum_oth0 = Add(sum_oth0, sum_oth1);
+        sum_oth2 = Add(sum_oth2, sum_oth3);
+        sum_oth0 = Add(sum_oth0, sum_oth2);
+        laplacian =
+            MulAdd(Set(df, kChannelWeightsLapNeg[c]), sum_oth0, laplacian);
+        sumsqr = MulAdd(laplacian, laplacian, sumsqr);
+      }
+      StoreU(sumsqr, df, laplacian_sqrsum_row + x);
+    }
+    for (; x < x1; x++) {  // Scalar tail, including the clamped right edge.
+      compute_laplacian_scalar(x);
+    }
+  }
+  HWY_CAPPED(float, 4) df4;
+  // Calculate the L2 of the 3x3 Laplacian in 4x4 blocks within the area
+  // of the integral transform. Sample them within the integral transform
+  // with two offsets (0,0) and (-2, -2) pixels (sqrsum_00 and sqrsum_22,
+  // respectively).
+  ImageF& sqrsum_00 = temp_image->sqrsum_00;
+  size_t sqrsum_00_stride = sqrsum_00.PixelsPerRow();
+  float* JXL_RESTRICT sqrsum_00_row = sqrsum_00.Row(0);
+  for (size_t y = 0; y < (by1 - by0) * 2; y++) {
+    const float* JXL_RESTRICT rows_in[4];
+    for (size_t iy = 0; iy < 4; iy++) {
+      rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy + 2);
+    }
+    float* JXL_RESTRICT row_out = sqrsum_00_row + y * sqrsum_00_stride;
+    for (size_t x = 0; x < (bx1 - bx0) * 2; x++) {
+      auto sum = Zero(df4);
+      for (size_t iy = 0; iy < 4; iy++) {
+        for (size_t ix = 0; ix < 4; ix += Lanes(df4)) {
+          sum = Add(sum, LoadU(df4, rows_in[iy] + x * 4 + ix + 2));
+        }
+      }
+      row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
+    }
+  }
+  // Indexing iy and ix is a bit tricky as we include a 2 pixel border
+  // around the block for evenness calculations. This is similar to what
+  // we did in guetzli for the observability of artefacts, except there
+  // the element is a sliding 5x5, not sparsely sampled 4x4 box like here.
+  ImageF& sqrsum_22 = temp_image->sqrsum_22;
+  size_t sqrsum_22_stride = sqrsum_22.PixelsPerRow();
+  float* JXL_RESTRICT sqrsum_22_row = sqrsum_22.Row(0);
+  for (size_t y = 0; y < (by1 - by0) * 2 + 1; y++) {
+    const float* JXL_RESTRICT rows_in[4];
+    for (size_t iy = 0; iy < 4; iy++) {
+      rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy);
+    }
+    float* JXL_RESTRICT row_out = sqrsum_22_row + y * sqrsum_22_stride;
+    // ignore pixels outside the image.
+    // Y coordinates are relative to by0*8+y*4.
+    size_t sy = y * 4 + by0 * 8 > 0 ? 0 : 2;
+    size_t ey = y * 4 + by0 * 8 + 4 <= opsin.ysize() + 2
+                    ? 4
+                    : opsin.ysize() - y * 4 - by0 * 8 + 2;
+    for (size_t x = 0; x < (bx1 - bx0) * 2 + 1; x++) {
+      // ignore pixels outside the image.
+      // X coordinates are relative to bx0*8.
+      size_t sx = x * 4 + bx0 * 8 > 0 ? x * 4 : x * 4 + 2;
+      size_t ex = x * 4 + bx0 * 8 + 4 <= opsin.xsize() + 2
+                      ? x * 4 + 4
+                      : opsin.xsize() - bx0 * 8 + 2;
+      if (ex - sx == 4 && ey - sy == 4) {
+        auto sum = Zero(df4);
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 4; ix += Lanes(df4)) {
+            sum = Add(sum, Load(df4, rows_in[iy] + sx + ix));
+          }
+        }
+        row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
+      } else {
+        float sum = 0;
+        for (size_t iy = sy; iy < ey; iy++) {
+          for (size_t ix = sx; ix < ex; ix++) {
+            sum += rows_in[iy][ix];
+          }
+        }
+        row_out[x] = std::sqrt(sum / ((ex - sx) * (ey - sy)));
+      }
+    }
+  }
+  for (size_t by = by0; by < by1; by++) {
+    AcStrategyRow acs_row = enc_state->shared.ac_strategy.ConstRow(by);
+    uint8_t* JXL_RESTRICT out_row = epf_sharpness->Row(by);
+    float* JXL_RESTRICT quant_row = quant->Row(by);
+    for (size_t bx = bx0; bx < bx1; bx++) {
+      AcStrategy acs = acs_row[bx];
+      if (!acs.IsFirstBlock()) continue;
+      // The errors are going to be linear to the quantization value in this
+      // locality. We only have access to the initial quant field here.
+      float quant_val = 1.0f / quant_row[bx];
+
+      const auto sq00 = [&](size_t y, size_t x) {
+        return sqrsum_00_row[((by - by0) * 2 + y) * sqrsum_00_stride +
+                             (bx - bx0) * 2 + x];
+      };
+      const auto sq22 = [&](size_t y, size_t x) {
+        return sqrsum_22_row[((by - by0) * 2 + y) * sqrsum_22_stride +
+                             (bx - bx0) * 2 + x];
+      };
+      float sqrsum_integral_transform = 0;
+      for (size_t iy = 0; iy < acs.covered_blocks_y() * 2; iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x() * 2; ix++) {
+          sqrsum_integral_transform += sq00(iy, ix) * sq00(iy, ix);
+        }
+      }
+      sqrsum_integral_transform /=
+          4 * acs.covered_blocks_x() * acs.covered_blocks_y();
+      sqrsum_integral_transform = std::sqrt(sqrsum_integral_transform);
+      // If masking is high or amplitude of the artefacts is low, then no
+      // smoothing is needed.
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          // Five 4x4 blocks for masking estimation, all within the
+          // 8x8 area.
+          float minval_1 = std::min(sq00(2 * iy + 0, 2 * ix + 0),
+                                    sq00(2 * iy + 0, 2 * ix + 1));
+          float minval_2 = std::min(sq00(2 * iy + 1, 2 * ix + 0),
+                                    sq00(2 * iy + 1, 2 * ix + 1));
+          float minval = std::min(minval_1, minval_2);
+          minval = std::min(minval, sq22(2 * iy + 1, 2 * ix + 1));
+          // Nine more 4x4 blocks for masking estimation, includes
+          // the 2 pixel area around the 8x8 block being controlled.
+          float minval2_1 = std::min(sq22(2 * iy + 0, 2 * ix + 0),
+                                     sq22(2 * iy + 0, 2 * ix + 1));
+          float minval2_2 = std::min(sq22(2 * iy + 0, 2 * ix + 2),
+                                     sq22(2 * iy + 1, 2 * ix + 0));
+          float minval2_3 = std::min(sq22(2 * iy + 1, 2 * ix + 1),
+                                     sq22(2 * iy + 1, 2 * ix + 2));
+          float minval2_4 = std::min(sq22(2 * iy + 2, 2 * ix + 0),
+                                     sq22(2 * iy + 2, 2 * ix + 1));
+          float minval2_5 = std::min(minval2_1, minval2_2);
+          float minval2_6 = std::min(minval2_3, minval2_4);
+          float minval2 = std::min(minval2_5, minval2_6);
+          minval2 = std::min(minval2, sq22(2 * iy + 2, 2 * ix + 2));
+          float minval3 = std::min(minval, minval2);
+          minval *= 0.125f;
+          minval += 0.625f * minval3;
+          minval +=
+              0.125f * std::min(1.5f * minval3, sq22(2 * iy + 1, 2 * ix + 1));
+          minval += 0.125f * minval2;
+          // Larger kBias, less smoothing for low intensity changes.
+          float kDeltaLimit = 3.2;
+          float bias = 0.0625f * quant_val;
+          float delta =
+              (sqrsum_integral_transform + (kDeltaLimit + 0.05) * bias) /
+              (minval + bias);
+          int out = 4;
+          if (delta > kDeltaLimit) {
+            out = 4;  // smooth
+          } else {
+            out = 0;
+          }
+          // 'threshold' is separate from 'bias' for easier tuning of these
+          // heuristics.
+          float threshold = 0.0625f * quant_val;
+          const float kSmoothLimit = 0.085f;
+          float smooth = 0.20f * (sq00(2 * iy + 0, 2 * ix + 0) +
+                                  sq00(2 * iy + 0, 2 * ix + 1) +
+                                  sq00(2 * iy + 1, 2 * ix + 0) +
+                                  sq00(2 * iy + 1, 2 * ix + 1) + minval);
+          if (smooth < kSmoothLimit * threshold) {
+            out = 4;
+          }
+          out_row[bx + sharpness_stride * iy + ix] = out;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ProcessTile);
+
+void ArControlFieldHeuristics::RunRect(const Rect& block_rect,
+                                       const Image3F& opsin,
+                                       PassesEncoderState* enc_state,
+                                       size_t thread) {
+  TempImages* scratch = &temp_images[thread];  // Buffers of this thread index.
+  HWY_DYNAMIC_DISPATCH(ProcessTile)(opsin, enc_state, block_rect, scratch);
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
new file mode 100644
index 0000000000..aabe71f46f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_AR_CONTROL_FIELD_H_
+#define LIB_JXL_ENC_AR_CONTROL_FIELD_H_
+
+#include <stddef.h>
+
+#include <vector>
+
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+struct PassesEncoderState;
+
+struct ArControlFieldHeuristics {
+  struct TempImages {
+    void InitOnce() {
+      if (laplacian_sqrsum.xsize() != 0) return;  // Already allocated.
+      laplacian_sqrsum = ImageF(kEncTileDim + 4, kEncTileDim + 4);
+      sqrsum_00 = ImageF(kEncTileDim / 4, kEncTileDim / 4);
+      sqrsum_22 = ImageF(kEncTileDim / 4 + 1, kEncTileDim / 4 + 1);
+    }
+    // Scratch buffers; they stay empty until the first InitOnce() call.
+    ImageF laplacian_sqrsum;
+    ImageF sqrsum_00;
+    ImageF sqrsum_22;
+  };
+
+  void PrepareForThreads(size_t num_threads) {
+    temp_images.resize(num_threads);  // One TempImages per thread index.
+  }
+
+  void RunRect(const Rect& block_rect, const Image3F& opsin,
+               PassesEncoderState* enc_state, size_t thread);
+
+  std::vector<TempImages> temp_images;  // Indexed by RunRect's |thread|.
+  ImageB* epf_sharpness = nullptr;  // Explicit default: never indeterminate.
+  ImageF* quant = nullptr;          // Explicit default: never indeterminate.
+  bool all_default = false;         // Explicit default: never indeterminate.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_AR_CONTROL_FIELD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
new file mode 100644
index 0000000000..1c141d1727
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
@@ -0,0 +1,205 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_aux_out.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <numeric>  // accumulate
+#include <sstream>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+const char* LayerName(size_t layer) {
+  // Label for each AuxOut layer id, as printed by AuxOut::Print(). Entries
+  // are matched by id rather than by position, so neither their order nor the
+  // numeric values of the enum matter.
+  struct LayerLabel {
+    size_t id;
+    const char* label;
+  };
+
+  static const LayerLabel kLabels[] = {
+      {kLayerHeader, "Headers"},
+      {kLayerTOC, "TOC"},
+      {kLayerDictionary, "Patches"},
+      {kLayerSplines, "Splines"},
+      {kLayerNoise, "Noise"},
+      {kLayerQuant, "Quantizer"},
+      {kLayerModularTree, "ModularTree"},
+      {kLayerModularGlobal, "ModularGlobal"},
+      {kLayerDC, "DC"},
+      {kLayerModularDcGroup, "ModularDcGroup"},
+      {kLayerControlFields, "ControlFields"},
+      {kLayerOrder, "CoeffOrder"},
+      {kLayerAC, "ACHistograms"},
+      {kLayerACTokens, "ACTokens"},
+      {kLayerModularAcGroup, "ModularAcGroup"},
+  };
+
+  for (const LayerLabel& entry : kLabels) {
+    if (entry.id == layer) {
+      return entry.label;
+    }
+  }
+
+  // Anything else (kNumImageLayers included) is not a valid layer.
+  JXL_ABORT("Invalid layer %d\n", static_cast<int>(layer));
+}
+
+void AuxOut::LayerTotals::Print(size_t num_inputs) const {
+  printf("%10" PRId64, static_cast<int64_t>(total_bits));
+  if (histogram_bits != 0) {  // Bracketed details only with histogram bits.
+    const double per_input = num_clustered_histograms * 1.0 / num_inputs;
+    const double bytes = (histogram_bits + clustered_entropy + extra_bits) / 8.0;
+    printf("   [c/i:%6.2f | hst:%8" PRId64 " | ex:%8" PRId64 " | h+c+e:%12.3f",
+           per_input, static_cast<int64_t>(histogram_bits >> 3),
+           static_cast<int64_t>(extra_bits >> 3), bytes);
+    printf("]");
+  }
+  printf("\n");
+}
+
+void AuxOut::Assimilate(const AuxOut& victim) {  // Merges victim into *this.
+  for (size_t i = 0; i < layers.size(); ++i) {
+    layers[i].Assimilate(victim.layers[i]);
+  }
+  num_blocks += victim.num_blocks;  // Counters are summed.
+  num_small_blocks += victim.num_small_blocks;
+  num_dct4x8_blocks += victim.num_dct4x8_blocks;
+  num_afv_blocks += victim.num_afv_blocks;
+  num_dct8_blocks += victim.num_dct8_blocks;
+  num_dct8x16_blocks += victim.num_dct8x16_blocks;
+  num_dct8x32_blocks += victim.num_dct8x32_blocks;
+  num_dct16_blocks += victim.num_dct16_blocks;
+  num_dct16x32_blocks += victim.num_dct16x32_blocks;
+  num_dct32_blocks += victim.num_dct32_blocks;
+  num_dct32x64_blocks += victim.num_dct32x64_blocks;
+  num_dct64_blocks += victim.num_dct64_blocks;
+  num_butteraugli_iters += victim.num_butteraugli_iters;
+  for (size_t i = 0; i < dc_pred_usage.size(); ++i) {  // Element-wise sums.
+    dc_pred_usage[i] += victim.dc_pred_usage[i];
+    dc_pred_usage_xb[i] += victim.dc_pred_usage_xb[i];
+  }
+  // The min/max pairs below are widened so that they cover both objects.
+  max_quant_rescale = std::max(max_quant_rescale, victim.max_quant_rescale);
+  min_quant_rescale = std::min(min_quant_rescale, victim.min_quant_rescale);
+  max_bitrate_error = std::max(max_bitrate_error, victim.max_bitrate_error);
+  min_bitrate_error = std::min(min_bitrate_error, victim.min_bitrate_error);
+}
+
+// Prints a summary of the gathered statistics to stdout; num_inputs is the
+// divisor for per-input averages. Does nothing when num_inputs is 0.
+void AuxOut::Print(size_t num_inputs) const {
+  if (num_inputs == 0) return;
+
+  LayerTotals all_layers;
+  for (size_t i = 0; i < layers.size(); ++i) {
+    all_layers.Assimilate(layers[i]);
+  }
+
+  printf("Average butteraugli iters: %10.2f\n",
+         num_butteraugli_iters * 1.0 / num_inputs);
+  if (min_quant_rescale != 1.0 || max_quant_rescale != 1.0) {
+    printf("quant rescale range: %f .. %f\n", min_quant_rescale,
+           max_quant_rescale);
+    printf("bitrate error range: %.3f%% .. %.3f%%\n",
+           100.0f * min_bitrate_error, 100.0f * max_bitrate_error);
+  }
+
+  for (size_t i = 0; i < layers.size(); ++i) {
+    if (layers[i].total_bits != 0) {
+      printf("Total layer bits %-10s\t", LayerName(i));
+      printf("%10f%%", 100.0 * layers[i].total_bits / all_layers.total_bits);
+      layers[i].Print(num_inputs);
+    }
+  }
+  printf("Total image size           ");
+  all_layers.Print(num_inputs);
+
+  // Share of |part| in |total| as a percentage. Yields 0 for an empty total so
+  // that a column without any DC prediction counts prints 0.00%, not NaN.
+  const auto percent = [](uint32_t part, uint32_t total) {
+    return total == 0 ? 0.0 : 100.0 * part / total;
+  };
+
+  const uint32_t dc_pred_total =
+      std::accumulate(dc_pred_usage.begin(), dc_pred_usage.end(), 0u);
+  const uint32_t dc_pred_total_xb =
+      std::accumulate(dc_pred_usage_xb.begin(), dc_pred_usage_xb.end(), 0u);
+  // Test the totals separately: their 32-bit sum could wrap around to zero.
+  if (dc_pred_total != 0 || dc_pred_total_xb != 0) {
+    printf("\nDC pred     Y                XB:\n");
+    for (size_t i = 0; i < dc_pred_usage.size(); ++i) {
+      printf("  %6u (%5.2f%%)    %6u (%5.2f%%)\n", dc_pred_usage[i],
+             percent(dc_pred_usage[i], dc_pred_total), dc_pred_usage_xb[i],
+             percent(dc_pred_usage_xb[i], dc_pred_total_xb));
+    }
+  }
+}
+
+template <typename T>
+void AuxOut::DumpImage(const char* label, const Image3<T>& image) const {
+  // Dumping needs both a callback and a non-empty file name prefix.
+  if (!dump_image || debug_prefix.empty()) return;
+  // Output path: <debug_prefix><label>.png
+  const std::string path = debug_prefix + label + ".png";
+  // The Status returned by the callback is deliberately discarded.
+  (void)dump_image(ConvertToFloat(image), ColorEncoding::SRGB(), path);
+}
+template void AuxOut::DumpImage(const char* label,
+                                const Image3<float>& image) const;
+template void AuxOut::DumpImage(const char* label,
+                                const Image3<uint8_t>& image) const;
+
+template <typename T>
+void AuxOut::DumpPlaneNormalized(const char* label,
+                                 const Plane<T>& image) const {
+  if (!dump_image || debug_prefix.empty()) return;  // DumpImage would no-op.
+  T min, max;
+  ImageMinMax(image, &min, &max);
+  const float mul = min == max ? 0 : (255.0f / (max - min));  // 0 if constant.
+  Image3B normalized(image.xsize(), image.ysize());
+  for (size_t c = 0; c < 3; ++c) {  // Same scaled values in every channel.
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT row_in = image.ConstRow(y);
+      uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        row_out[x] = static_cast<uint8_t>((row_in[x] - min) * mul);
+      }
+    }
+  }
+  DumpImage(label, normalized);
+}
+template void AuxOut::DumpPlaneNormalized(const char* label,
+                                          const Plane<float>& image) const;
+template void AuxOut::DumpPlaneNormalized(const char* label,
+                                          const Plane<uint8_t>& image) const;
+
+void AuxOut::DumpXybImage(const char* label, const Image3F& image) const {
+  // Nothing to do without a dump callback and a file name prefix.
+  if (!dump_image || debug_prefix.empty()) return;
+  const std::string path = debug_prefix + label + ".png";
+
+  // Run OpsinToLinear into a same-sized buffer, using default-intensity params.
+  OpsinParams opsin_params;
+  opsin_params.Init(kDefaultIntensityTarget);
+  Image3F linear(image.xsize(), image.ysize());
+  OpsinToLinear(image, Rect(linear), nullptr, &linear, opsin_params);
+
+  // The Status returned by the callback is deliberately discarded.
+  (void)dump_image(std::move(linear), ColorEncoding::LinearSRGB(), path);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_aux_out.h b/third_party/jpeg-xl/lib/jxl/enc_aux_out.h
new file mode 100644
index 0000000000..78222823ae
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.h
@@ -0,0 +1,163 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AUX_OUT_H_
+#define LIB_JXL_AUX_OUT_H_
+
+// Optional output information for debugging and analyzing size usage.
+
+#include <stddef.h>
+
+#include <array>
+#include <functional>
+#include <string>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/jxl_inspection.h"
+
+namespace jxl {
+
+struct ColorEncoding;
+
+// For LayerName and AuxOut::layers[] index. Order does not matter.
+enum {
+  kLayerHeader = 0,  // LayerName(): "Headers"
+  kLayerTOC,
+  kLayerDictionary,  // LayerName(): "Patches"
+  kLayerSplines,
+  kLayerNoise,
+  kLayerQuant,  // LayerName(): "Quantizer"
+  kLayerModularTree,
+  kLayerModularGlobal,
+  kLayerDC,
+  kLayerModularDcGroup,
+  kLayerControlFields,
+  kLayerOrder,  // LayerName(): "CoeffOrder"
+  kLayerAC,     // LayerName(): "ACHistograms"
+  kLayerACTokens,
+  kLayerModularAcGroup,
+  kNumImageLayers  // Not a layer: the count, used to size AuxOut::layers.
+};
+
+const char* LayerName(size_t layer);
+
+// Statistics gathered during compression or decompression.
+struct AuxOut {
+ private:
+  struct LayerTotals {
+    void Assimilate(const LayerTotals& victim) {  // Adds victim's totals.
+      num_clustered_histograms += victim.num_clustered_histograms;
+      histogram_bits += victim.histogram_bits;
+      extra_bits += victim.extra_bits;
+      total_bits += victim.total_bits;
+      clustered_entropy += victim.clustered_entropy;
+    }
+    void Print(size_t num_inputs) const;
+
+    size_t num_clustered_histograms = 0;
+    size_t extra_bits = 0;
+
+    // Set via BitsWritten (NOTE(review): not declared in this struct).
+    size_t histogram_bits = 0;
+    size_t total_bits = 0;
+
+    double clustered_entropy = 0.0;
+  };
+
+ public:
+  AuxOut() = default;
+  AuxOut(const AuxOut&) = default;
+
+  void Assimilate(const AuxOut& victim);
+
+  void Print(size_t num_inputs) const;
+
+  size_t TotalBits() const {  // Sum of total_bits over all layers.
+    size_t total = 0;
+    for (const auto& layer : layers) {
+      total += layer.total_bits;
+    }
+    return total;
+  }
+
+  template <typename T>
+  void DumpImage(const char* label, const Image3<T>& image) const;
+
+  void DumpXybImage(const char* label, const Image3F& image) const;
+
+  template <typename T>
+  void DumpPlaneNormalized(const char* label, const Plane<T>& image) const;
+
+  void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) {
+    inspector_image3f_ = inspector;
+  }
+
+  // Allows hooking intermediate data inspection into various places of the
+  // processing pipeline. Returns true iff processing should proceed.
+  bool InspectImage3F(const char* label, const Image3F& image) {
+    if (inspector_image3f_ != nullptr) {
+      return inspector_image3f_(label, image);
+    }
+    return true;  // No inspector installed.
+  }
+
+  std::array<LayerTotals, kNumImageLayers> layers;  // Indexed by kLayer* id.
+  size_t num_blocks = 0;
+
+  // Number of blocks that use larger DCT (set by ac_strategy).
+  size_t num_small_blocks = 0;
+  size_t num_dct4x8_blocks = 0;
+  size_t num_afv_blocks = 0;
+  size_t num_dct8_blocks = 0;
+  size_t num_dct8x16_blocks = 0;
+  size_t num_dct8x32_blocks = 0;
+  size_t num_dct16_blocks = 0;
+  size_t num_dct16x32_blocks = 0;
+  size_t num_dct32_blocks = 0;
+  size_t num_dct32x64_blocks = 0;
+  size_t num_dct64_blocks = 0;
+
+  std::array<uint32_t, 8> dc_pred_usage = {{0}};
+  std::array<uint32_t, 8> dc_pred_usage_xb = {{0}};
+
+  int num_butteraugli_iters = 0;
+
+  float max_quant_rescale = 1.0f;
+  float min_quant_rescale = 1.0f;
+  float min_bitrate_error = 0.0f;
+  float max_bitrate_error = 0.0f;
+
+  // If not empty, additional debugging information (e.g. debug images) is
+  // saved in files with this prefix.
+  std::string debug_prefix;
+
+  // By how much the decoded image was downsampled relative to the encoded
+  // image.
+  size_t downsampling = 1;
+
+  jxl::InspectorImage3F inspector_image3f_;  // Hook used by InspectImage3F().
+
+  std::function<Status(Image3F&&, const ColorEncoding&, const std::string&)>
+      dump_image = nullptr;  // Callback of the Dump* methods; unset by default.
+};
+
+extern template void AuxOut::DumpImage(const char* label,
+                                       const Image3<float>& image) const;
+extern template void AuxOut::DumpImage(const char* label,
+                                       const Image3<uint8_t>& image) const;
+extern template void AuxOut::DumpPlaneNormalized(
+    const char* label, const Plane<float>& image) const;
+extern template void AuxOut::DumpPlaneNormalized(
+    const char* label, const Plane<uint8_t>& image) const;
+
+// Used to skip image creation if they won't be written to debug directory.
+static inline bool WantDebugOutput(const AuxOut* aux_out) {
+  if (aux_out == nullptr) return false;   // Needs a valid pointer...
+  return !aux_out->debug_prefix.empty();  // ...and a non-empty prefix.
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AUX_OUT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc
new file mode 100644
index 0000000000..7964c28f76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc
@@ -0,0 +1,201 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_bit_writer.h"
+
+#include <string.h>  // memcpy
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_aux_out.h"
+
+namespace jxl {
+
+BitWriter::Allotment::Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits)
+    : prev_bits_written_(0), max_bits_(max_bits), parent_(nullptr) {
+  if (writer == nullptr) return;  // Nothing to reserve; members stay defined.
+  prev_bits_written_ = writer->BitsWritten();
+  const size_t prev_bytes = writer->storage_.size();
+  const size_t next_bytes = DivCeil(max_bits, kBitsPerByte);
+  writer->storage_.resize(prev_bytes + next_bytes);  // Room for max_bits more.
+  parent_ = writer->current_allotment_;  // Remember the enclosing allotment.
+  writer->current_allotment_ = this;
+}
+
+BitWriter::Allotment::~Allotment() {
+  if (!called_) {  // called_ is only set by PrivateReclaim (ReclaimAndCharge).
+    // Never reclaiming is a bug - the reserved storage would not be shrunk.
+    JXL_ABORT("Did not call Allotment::ReclaimUnused");
+  }
+}
+
+void BitWriter::Allotment::FinishedHistogram(BitWriter* JXL_RESTRICT writer) {
+  if (writer == nullptr) return;  // No writer: histogram_bits_ stays 0.
+  JXL_ASSERT(!called_);              // Call before ReclaimAndCharge
+  JXL_ASSERT(histogram_bits_ == 0);  // Do not call twice
+  JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_);
+  histogram_bits_ = writer->BitsWritten() - prev_bits_written_;
+}
+
+void BitWriter::Allotment::ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
+                                            size_t layer,
+                                            AuxOut* JXL_RESTRICT aux_out) {
+  size_t used_bits = 0, unused_bits = 0;  // Stay 0 when writer is null.
+  PrivateReclaim(writer, &used_bits, &unused_bits);
+
+#if 0
+  printf("Layer %s bits: max %" PRIuS " used %" PRIuS " unused %" PRIuS "\n",
+         LayerName(layer), MaxBits(), used_bits, unused_bits);
+#endif
+
+  // This may be a nested call with aux_out == null, in which case the storage
+  // is still reclaimed above but no layer statistics are charged.
+  if (aux_out != nullptr) {
+    aux_out->layers[layer].total_bits += used_bits;
+    aux_out->layers[layer].histogram_bits += HistogramBits();
+  }
+}
+
+void BitWriter::Allotment::PrivateReclaim(BitWriter* JXL_RESTRICT writer,
+                                          size_t* JXL_RESTRICT used_bits,
+                                          size_t* JXL_RESTRICT unused_bits) {
+  JXL_ASSERT(!called_);  // Do not call twice
+  called_ = true;
+  if (writer == nullptr) return;  // NOTE: out-params are not written here.
+
+  JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_);
+  *used_bits = writer->BitsWritten() - prev_bits_written_;
+  JXL_ASSERT(*used_bits <= max_bits_);
+  *unused_bits = max_bits_ - *used_bits;
+
+  // Reclaim unused whole bytes from writer's allotment.
+  const size_t unused_bytes = *unused_bits / kBitsPerByte;  // truncate
+  JXL_ASSERT(writer->storage_.size() >= unused_bytes);
+  writer->storage_.resize(writer->storage_.size() - unused_bytes);
+  writer->current_allotment_ = parent_;
+  // Ensure we don't also charge the parent for these bits.
+  auto parent = parent_;
+  while (parent != nullptr) {
+    parent->prev_bits_written_ += *used_bits;
+    parent = parent->parent_;
+  }
+}
+
+void BitWriter::AppendByteAligned(const Span<const uint8_t>& span) {
+  if (span.empty()) return;  // Nothing to copy; storage_ is left unchanged.
+  storage_.resize(storage_.size() + span.size() + 1);  // extra zero padding
+
+  // Concatenate by copying bytes because both source and destination are bytes.
+  JXL_ASSERT(BitsWritten() % kBitsPerByte == 0);
+  size_t pos = BitsWritten() / kBitsPerByte;
+  memcpy(storage_.data() + pos, span.data(), span.size());
+  pos += span.size();
+  storage_[pos++] = 0;  // for next Write
+  JXL_ASSERT(pos <= storage_.size());
+  bits_written_ += span.size() * kBitsPerByte;
+}
+
+void BitWriter::AppendByteAligned(const BitWriter& other) {
+  JXL_ASSERT(other.BitsWritten() % kBitsPerByte == 0);
+  JXL_ASSERT(other.BitsWritten() / kBitsPerByte != 0);  // Must not be empty.
+
+  AppendByteAligned(other.GetSpan());  // Delegates to the Span overload.
+}
+
+void BitWriter::AppendByteAligned(const std::vector<BitWriter>& others) {
+  // Sum the sizes of all inputs first so storage_ is resized only once.
+  size_t other_bytes = 0;
+  for (const BitWriter& writer : others) {
+    JXL_ASSERT(writer.BitsWritten() % kBitsPerByte == 0);
+    other_bytes += writer.BitsWritten() / kBitsPerByte;
+  }
+  if (other_bytes == 0) {
+    // No bytes to append: this happens for example when creating per-group
+    // storage for groups, but not writing anything in them for e.g. lossless
+    // images with no alpha. Do nothing.
+    return;
+  }
+  storage_.resize(storage_.size() + other_bytes + 1);  // extra zero padding
+
+  // Concatenate by copying bytes because both source and destination are bytes.
+  JXL_ASSERT(BitsWritten() % kBitsPerByte == 0);
+  size_t pos = BitsWritten() / kBitsPerByte;
+  for (const BitWriter& writer : others) {
+    const Span<const uint8_t> span = writer.GetSpan();
+    if (!span.empty()) {  // Writers that produced no bytes are skipped.
+      memcpy(storage_.data() + pos, span.data(), span.size());
+      pos += span.size();
+    }
+  }
+  storage_[pos++] = 0;  // for next Write
+  JXL_ASSERT(pos <= storage_.size());
+  bits_written_ += other_bytes * kBitsPerByte;
+}
+
+// TODO(lode): avoid code duplication
+void BitWriter::AppendByteAligned(
+    const std::vector<std::unique_ptr<BitWriter>>& others) {
+  // Total size to add so we can preallocate
+  size_t other_bytes = 0;
+  for (const auto& writer : others) {
+    JXL_ASSERT(writer->BitsWritten() % kBitsPerByte == 0);
+    other_bytes += writer->BitsWritten() / kBitsPerByte;
+  }
+  if (other_bytes == 0) {
+    // No bytes to append, e.g. per-group writers that were created but never
+    // written to (lossless images with no alpha). Do nothing.
+    return;
+  }
+  storage_.resize(storage_.size() + other_bytes + 1);  // extra zero padding
+
+  // Concatenate by copying bytes because both source and destination are bytes.
+  JXL_ASSERT(BitsWritten() % kBitsPerByte == 0);
+  size_t pos = BitsWritten() / kBitsPerByte;
+  for (const auto& writer : others) {
+    const Span<const uint8_t> span = writer->GetSpan();
+    if (span.empty()) continue;  // data() may be null; keep it out of memcpy.
+    memcpy(storage_.data() + pos, span.data(), span.size());
+    pos += span.size();
+  }
+  storage_[pos++] = 0;  // for next Write
+  JXL_ASSERT(pos <= storage_.size());
+  bits_written_ += other_bytes * kBitsPerByte;
+}
+
+// Example: let's assume that 3 bits (Rs below) have been written already:
+// BYTE+0       BYTE+1       BYTE+2
+// 0000 0RRR    ???? ????    ???? ????
+//
+// Now, we could write up to 5 bits by just shifting them left by 3 bits and
+// OR'ing to BYTE+0.
+//
+// For n > 5 bits, we write the lowest 5 bits as above, then write the next
+// lowest bits into BYTE+1 starting from its lower bits and so on.
+void BitWriter::Write(size_t n_bits, uint64_t bits) {
+  JXL_DASSERT(n_bits <= kMaxBitsPerCall);  // Checked first: bounds the shift.
+  JXL_DASSERT((bits >> n_bits) == 0);
+  uint8_t* p = &storage_[bits_written_ / kBitsPerByte];
+  const size_t bits_in_first_byte = bits_written_ % kBitsPerByte;
+  bits <<= bits_in_first_byte;
+#if JXL_BYTE_ORDER_LITTLE
+  uint64_t v = *p;
+  // Last (partial) or next byte to write must be zero-initialized!
+  // PaddedBytes initializes the first, and Write/Append maintain this.
+  JXL_DASSERT(v >> bits_in_first_byte == 0);
+  v |= bits;
+  memcpy(p, &v, sizeof(v));  // Write bytes: possibly more than n_bits/8
+#else
+  *p++ |= static_cast<uint8_t>(bits & 0xFF);
+  for (size_t bits_left_to_write = n_bits + bits_in_first_byte;
+       bits_left_to_write >= 9; bits_left_to_write -= 8) {
+    bits >>= 8;
+    *p++ = static_cast<uint8_t>(bits & 0xFF);
+  }
+  *p = 0;  // Keep the byte after the written ones zeroed.
+#endif
+  bits_written_ += n_bits;
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h
new file mode 100644
index 0000000000..d3fac15a68
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_BIT_WRITER_H_
+#define LIB_JXL_ENC_BIT_WRITER_H_
+
+// BitWriter class: unbuffered writes using unaligned 64-bit stores.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+struct BitWriter {
+  // Upper bound on `n_bits` in each call to Write. We shift a 64-bit word by
+  // 7 bits (max already valid bits in the last byte) and at least 1 bit is
+  // needed to zero-initialize the bit-stream ahead (i.e. if 7 bits are valid
+  // and we write 57 bits, then the next write will access a byte that was not
+  // yet zero-initialized).
+  static constexpr size_t kMaxBitsPerCall = 56;
+
+  BitWriter() : bits_written_(0) {}
+
+  // Move-only: copying is disallowed because it may lead to bugs.
+  BitWriter(const BitWriter&) = delete;
+  BitWriter& operator=(const BitWriter&) = delete;
+  BitWriter(BitWriter&&) = default;
+  BitWriter& operator=(BitWriter&&) = default;
+
+  size_t BitsWritten() const { return bits_written_; }
+
+  Span<const uint8_t> GetSpan() const {
+    // Callers must ensure byte alignment to avoid uninitialized bits.
+    JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
+    return Span<const uint8_t>(storage_.data(), bits_written_ / kBitsPerByte);
+  }
+
+  // Example usage: bytes = std::move(writer).TakeBytes(); Useful for the
+  // top-level encoder which returns PaddedBytes, not a BitWriter.
+  // *this must be an rvalue reference and is invalid afterwards.
+  PaddedBytes&& TakeBytes() && {
+    // Callers must ensure byte alignment to avoid uninitialized bits.
+    JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
+    storage_.resize(bits_written_ / kBitsPerByte);  // Trim to the written bytes.
+    return std::move(storage_);
+  }
+
+ private:
+  // Must be byte-aligned before calling.
+  void AppendByteAligned(const Span<const uint8_t>& span);
+
+ public:
+  // NOTE: no allotment needed, the other BitWriters have already been charged.
+  void AppendByteAligned(const BitWriter& other);
+  void AppendByteAligned(const std::vector<std::unique_ptr<BitWriter>>& others);
+  void AppendByteAligned(const std::vector<BitWriter>& others);
+
+  class Allotment {
+   public:
+    // Expands a BitWriter's storage. Must happen before calling Write or
+    // ZeroPadToByte. Must call ReclaimAndCharge after writing to reclaim the
+    // unused storage so that BitWriter memory use remains tightly bounded.
+    Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits);
+    ~Allotment();
+
+    size_t MaxBits() const { return max_bits_; }
+
+    // Call after writing a histogram, but before ReclaimAndCharge.
+    void FinishedHistogram(BitWriter* JXL_RESTRICT writer);
+
+    size_t HistogramBits() const {
+      JXL_ASSERT(called_);  // Only valid once the allotment was reclaimed.
+      return histogram_bits_;
+    }
+
+    void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* JXL_RESTRICT aux_out);
+
+   private:
+    void PrivateReclaim(BitWriter* JXL_RESTRICT writer,
+                        size_t* JXL_RESTRICT used_bits,
+                        size_t* JXL_RESTRICT unused_bits);
+
+    size_t prev_bits_written_;  // Baseline for used-bit accounting.
+    const size_t max_bits_;     // Reserved capacity, in bits.
+    size_t histogram_bits_ = 0;
+    bool called_ = false;  // Set once PrivateReclaim has run.
+    Allotment* parent_;    // Allotment that was current at construction.
+  };
+
+  // Writes bits into bytes in increasing addresses, and within a byte
+  // least-significant-bit first.
+  //
+  // The function can write up to 56 bits in one go.
+  void Write(size_t n_bits, uint64_t bits);
+
+  // This should only rarely be used - e.g. when the current location will be
+  // referenced via byte offset (TOCs point to groups), or byte-aligned reading
+  // is required for speed.
+  void ZeroPadToByte() {
+    const size_t remainder_bits =
+        RoundUpBitsToByteMultiple(bits_written_) - bits_written_;
+    if (remainder_bits == 0) return;  // Already byte-aligned.
+    Write(remainder_bits, 0);
+    JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
+  }
+
+ private:
+  size_t bits_written_;
+  PaddedBytes storage_;
+  Allotment* current_allotment_ = nullptr;  // Innermost open allotment, if any.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_BIT_WRITER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc
new file mode 100644
index 0000000000..5711f45884
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc
@@ -0,0 +1,99 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_butteraugli_comparator.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_image_bundle.h"
+
+namespace jxl {
+
+JxlButteraugliComparator::JxlButteraugliComparator(
+    const ButteraugliParams& params, const JxlCmsInterface& cms)
+    : params_(params), cms_(cms) {}  // Copies both; no comparator yet.
+
+Status JxlButteraugliComparator::SetReferenceImage(const ImageBundle& ref) {
+  const ImageBundle* ref_linear_srgb;  // Out-param of TransformIfNeeded.
+  ImageMetadata metadata = *ref.metadata();
+  ImageBundle store(&metadata);  // Backing bundle handed to TransformIfNeeded.
+  if (!TransformIfNeeded(ref, ColorEncoding::LinearSRGB(ref.IsGray()), cms_,
+                         /*pool=*/nullptr, &store, &ref_linear_srgb)) {
+    return false;
+  }
+
+  comparator_.reset(
+      new ButteraugliComparator(ref_linear_srgb->color(), params_));
+  xsize_ = ref.xsize();  // CompareWith checks sizes against these.
+  ysize_ = ref.ysize();
+  return true;
+}
+
+Status JxlButteraugliComparator::CompareWith(const ImageBundle& actual,
+                                             ImageF* diffmap, float* score) {
+  if (!comparator_) {  // comparator_ is only created by SetReferenceImage.
+    return JXL_FAILURE("Must set reference image first");
+  }
+  if (xsize_ != actual.xsize() || ysize_ != actual.ysize()) {
+    return JXL_FAILURE("Images must have same size");
+  }
+
+  const ImageBundle* actual_linear_srgb;  // Out-param of TransformIfNeeded.
+  ImageMetadata metadata = *actual.metadata();
+  ImageBundle store(&metadata);
+  if (!TransformIfNeeded(actual, ColorEncoding::LinearSRGB(actual.IsGray()),
+                         cms_,
+                         /*pool=*/nullptr, &store, &actual_linear_srgb)) {
+    return false;
+  }
+
+  ImageF temp_diffmap(xsize_, ysize_);
+  comparator_->Diffmap(actual_linear_srgb->color(), temp_diffmap);
+
+  if (score != nullptr) {  // Both outputs are optional.
+    *score = ButteraugliScoreFromDiffmap(temp_diffmap, &params_);
+  }
+  if (diffmap != nullptr) {
+    diffmap->Swap(temp_diffmap);  // Swapped into the caller's image.
+  }
+
+  return true;
+}
+
+float JxlButteraugliComparator::GoodQualityScore() const {
+  return ButteraugliFuzzyInverse(1.5);  // Fixed; does not depend on params_.
+}
+
+float JxlButteraugliComparator::BadQualityScore() const {
+  return ButteraugliFuzzyInverse(0.5);  // Fixed; does not depend on params_.
+}
+
+float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap,
+                          ThreadPool* pool) {
+  JxlButteraugliComparator comparator(params, cms);  // Fresh one per call.
+  return ComputeScore(rgb0, rgb1, &comparator, cms, distmap, pool);
+}
+
+float ButteraugliDistance(const std::vector<ImageBundle>& frames0,
+                          const std::vector<ImageBundle>& frames1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap,
+                          ThreadPool* pool) {
+  JxlButteraugliComparator comparator(params, cms);  // Shared by all frames.
+  JXL_ASSERT(frames0.size() == frames1.size());
+  float worst = 0.0f;  // Largest per-frame score seen so far.
+  for (size_t f = 0; f < frames0.size(); ++f) {
+    const float score =
+        ComputeScore(frames0[f], frames1[f], &comparator, cms, distmap, pool);
+    if (worst < score) worst = score;
+  }
+  return worst;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
new file mode 100644
index 0000000000..6c37d1dc7d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
@@ -0,0 +1,59 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
+#define LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+
+#include <memory>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/enc_comparator.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+class JxlButteraugliComparator : public Comparator {
+ public:
+  explicit JxlButteraugliComparator(const ButteraugliParams& params,
+                                    const JxlCmsInterface& cms);
+
+  Status SetReferenceImage(const ImageBundle& ref) override;
+
+  Status CompareWith(const ImageBundle& actual, ImageF* diffmap,
+                     float* score) override;
+
+  float GoodQualityScore() const override;
+  float BadQualityScore() const override;
+
+ private:
+  ButteraugliParams params_;  // Copy of the constructor argument.
+  JxlCmsInterface cms_;       // Copy of the constructor argument.
+  std::unique_ptr<ButteraugliComparator> comparator_;  // Null until set.
+  size_t xsize_ = 0;  // Reference image width; 0 until one is set.
+  size_t ysize_ = 0;  // Reference image height; 0 until one is set.
+};
+
+// Returns the butteraugli distance between rgb0 and rgb1.
+// If distmap is not null, it must be the same size as rgb0 and rgb1.
+float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap = nullptr,
+                          ThreadPool* pool = nullptr);
+
+float ButteraugliDistance(const std::vector<ImageBundle>& frames0,
+                          const std::vector<ImageBundle>& frames1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap = nullptr,
+                          ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc
new file mode 100644
index 0000000000..fe5629dcda
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc
@@ -0,0 +1,211 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+
+#include <math.h>
+#include <stdlib.h>
+
+#include <atomic>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_butteraugli_pnorm.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+
+double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
+                        double p) {
+  PROFILER_FUNC;
+  // Averages three norms of distmap: powers p, 2p and 4p of each pixel.
+  const double onePerPixels = 1.0 / (distmap.ysize() * distmap.xsize());
+  if (std::abs(p - 3.0) < 1E-6) {  // Vectorized path, only taken for p == 3.
+    double sum1[3] = {0.0};  // Scalar sums for the leftover pixels of each row.
+
+// Prefer double if possible, but otherwise use float rather than scalar.
+#if HWY_CAP_FLOAT64
+    using T = double;
+    const Rebind<float, HWY_FULL(double)> df;
+#else
+    using T = float;
+#endif
+    const HWY_FULL(T) d;
+    constexpr size_t N = MaxLanes(HWY_FULL(T)());
+    // Manually aligned storage to avoid asan crash on clang-7 due to
+    // unaligned spill.
+    HWY_ALIGN T sum_totals0[N] = {0};
+    HWY_ALIGN T sum_totals1[N] = {0};
+    HWY_ALIGN T sum_totals2[N] = {0};
+
+    for (size_t y = 0; y < distmap.ysize(); ++y) {
+      const float* JXL_RESTRICT row = distmap.ConstRow(y);
+
+      auto sums0 = Zero(d);
+      auto sums1 = Zero(d);
+      auto sums2 = Zero(d);
+
+      size_t x = 0;
+      for (; x + Lanes(d) <= distmap.xsize(); x += Lanes(d)) {
+#if HWY_CAP_FLOAT64
+        const auto d1 = PromoteTo(d, Load(df, row + x));
+#else
+        const auto d1 = Load(d, row + x);
+#endif
+        const auto d2 = Mul(d1, Mul(d1, d1));  // d^3
+        sums0 = Add(sums0, d2);
+        const auto d3 = Mul(d2, d2);  // d^6
+        sums1 = Add(sums1, d3);
+        const auto d4 = Mul(d3, d3);  // d^12
+        sums2 = Add(sums2, d4);
+      }
+
+      Store(Add(sums0, Load(d, sum_totals0)), d, sum_totals0);
+      Store(Add(sums1, Load(d, sum_totals1)), d, sum_totals1);
+      Store(Add(sums2, Load(d, sum_totals2)), d, sum_totals2);
+
+      for (; x < distmap.xsize(); ++x) {  // Pixels past the last full vector.
+        const double d1 = row[x];
+        double d2 = d1 * d1 * d1;
+        sum1[0] += d2;
+        d2 *= d2;
+        sum1[1] += d2;
+        d2 *= d2;
+        sum1[2] += d2;
+      }
+    }
+    double v = 0;
+    v += pow(
+        onePerPixels * (sum1[0] + GetLane(SumOfLanes(d, Load(d, sum_totals0)))),
+        1.0 / (p * 1.0));
+    v += pow(
+        onePerPixels * (sum1[1] + GetLane(SumOfLanes(d, Load(d, sum_totals1)))),
+        1.0 / (p * 2.0));
+    v += pow(
+        onePerPixels * (sum1[2] + GetLane(SumOfLanes(d, Load(d, sum_totals2)))),
+        1.0 / (p * 4.0));
+    v /= 3.0;  // Mean of the three norms.
+    return v;
+  } else {
+    static std::atomic<int> once{0};  // Makes the warning fire only once.
+    if (once.fetch_add(1, std::memory_order_relaxed) == 0) {
+      JXL_WARNING("WARNING: using slow ComputeDistanceP");
+    }
+    double sum1[3] = {0.0};
+    for (size_t y = 0; y < distmap.ysize(); ++y) {
+      const float* JXL_RESTRICT row = distmap.ConstRow(y);
+      for (size_t x = 0; x < distmap.xsize(); ++x) {
+        double d2 = std::pow(row[x], p);  // d^p, squared twice below.
+        sum1[0] += d2;
+        d2 *= d2;
+        sum1[1] += d2;
+        d2 *= d2;
+        sum1[2] += d2;
+      }
+    }
+    double v = 0;
+    for (int i = 0; i < 3; ++i) {
+      v += pow(onePerPixels * (sum1[i]), 1.0 / (p * (1 << i)));
+    }
+    v /= 3.0;  // Mean of the three norms.
+    return v;
+  }
+}
+
+// TODO(lode): take alpha into account when needed
+double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
+                        const JxlCmsInterface& cms) {
+  PROFILER_FUNC;
+  // Convert to sRGB - closer to perception than linear.
+  const Image3F* srgb1 = &ib1.color();  // Used as-is if ib1 is already sRGB.
+  Image3F copy1;  // Receives the converted pixels otherwise.
+  if (!ib1.IsSRGB()) {
+    JXL_CHECK(
+        ib1.CopyTo(Rect(ib1), ColorEncoding::SRGB(ib1.IsGray()), cms, &copy1));
+    srgb1 = &copy1;
+  }
+  const Image3F* srgb2 = &ib2.color();  // Same handling for the second image.
+  Image3F copy2;
+  if (!ib2.IsSRGB()) {
+    JXL_CHECK(
+        ib2.CopyTo(Rect(ib2), ColorEncoding::SRGB(ib2.IsGray()), cms, &copy2));
+    srgb2 = &copy2;
+  }
+
+  JXL_CHECK(SameSize(*srgb1, *srgb2));
+
+  // TODO(veluca): SIMD.
+  float yuvmatrix[3][3] = {{0.299, 0.587, 0.114},
+                           {-0.14713, -0.28886, 0.436},
+                           {0.615, -0.51499, -0.10001}};
+  double sum_of_squares[3] = {};  // One total per YUV channel.
+  for (size_t y = 0; y < srgb1->ysize(); ++y) {
+    const float* JXL_RESTRICT row1[3];
+    const float* JXL_RESTRICT row2[3];
+    for (size_t j = 0; j < 3; j++) {
+      row1[j] = srgb1->ConstPlaneRow(j, y);
+      row2[j] = srgb2->ConstPlaneRow(j, y);
+    }
+    for (size_t x = 0; x < srgb1->xsize(); ++x) {
+      float cdiff[3] = {};
+      // YUV conversion is linear, so we can run it on the difference.
+      for (size_t j = 0; j < 3; j++) {
+        cdiff[j] = row1[j][x] - row2[j][x];
+      }
+      float yuvdiff[3] = {};
+      for (size_t j = 0; j < 3; j++) {
+        for (size_t k = 0; k < 3; k++) {
+          yuvdiff[j] += yuvmatrix[j][k] * cdiff[k];
+        }
+      }
+      for (size_t j = 0; j < 3; j++) {
+        sum_of_squares[j] += yuvdiff[j] * yuvdiff[j];
+      }
+    }
+  }
+  // Weighted PSNR as in JPEG-XL: chroma counts 1/8.
+  const float weights[3] = {6.0f / 8, 1.0f / 8, 1.0f / 8};
+  // Avoid squaring the weight - 1/64 is too extreme.
+  double norm = 0;
+  for (size_t i = 0; i < 3; i++) {
+    norm += std::sqrt(sum_of_squares[i]) * weights[i];
+  }
+  // This function returns distance *squared*.
+  return norm * norm;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ComputeDistanceP);  // Per-target table used by the dispatch below.
+double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
+                        double p) {
+  return HWY_DYNAMIC_DISPATCH(ComputeDistanceP)(distmap, params, p);
+}
+
+HWY_EXPORT(ComputeDistance2);  // Per-target table used by the dispatch below.
+double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
+                        const JxlCmsInterface& cms) {
+  return HWY_DYNAMIC_DISPATCH(ComputeDistance2)(ib1, ib2, cms);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h
new file mode 100644
index 0000000000..cf6872e5d0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
+#define LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Computes p-norm given the butteraugli distmap.
+double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
+                        double p);
+
+double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
+                        const JxlCmsInterface& cms);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cache.cc b/third_party/jpeg-xl/lib/jxl/enc_cache.cc
new file mode 100644
index 0000000000..fc3e5c9f30
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.cc
@@ -0,0 +1,218 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_cache.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/enc_group.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
+                               ThreadPool* pool, PassesEncoderState* enc_state,
+                               ModularFrameEncoder* modular_frame_encoder,
+                               AuxOut* aux_out) {
+  PROFILER_FUNC;
+  // Fills enc_state for one frame: coefficients, then DC, then AC metadata.
+  PassesSharedState& JXL_RESTRICT shared = enc_state->shared;
+
+  enc_state->histogram_idx.resize(shared.frame_dim.num_groups);
+
+  enc_state->x_qm_multiplier =
+      std::pow(1.25f, shared.frame_header.x_qm_scale - 2.0f);
+  enc_state->b_qm_multiplier =
+      std::pow(1.25f, shared.frame_header.b_qm_scale - 2.0f);
+
+  if (enc_state->coeffs.size() < shared.frame_header.passes.num_passes) {
+    enc_state->coeffs.reserve(shared.frame_header.passes.num_passes);
+    for (size_t i = enc_state->coeffs.size();
+         i < shared.frame_header.passes.num_passes; i++) {
+      // Allocate enough coefficients for each group on every row.
+      enc_state->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
+          kGroupDim * kGroupDim, shared.frame_dim.num_groups));
+    }
+  }
+  while (enc_state->coeffs.size() > shared.frame_header.passes.num_passes) {
+    enc_state->coeffs.pop_back();  // Drop buffers beyond num_passes.
+  }
+
+  float scale =
+      shared.quantizer.ScaleGlobalScale(enc_state->cparams.quant_ac_rescale);
+  DequantMatricesScaleDC(&shared.matrices, scale);
+  shared.quantizer.RecomputeFromGlobalScale();
+
+  Image3F dc(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, shared.frame_dim.num_groups, ThreadPool::NoInit,
+      [&](size_t group_idx, size_t _) {
+        ComputeCoefficients(group_idx, enc_state, opsin, &dc);
+      },
+      "Compute coeffs"));
+
+  if (shared.frame_header.flags & FrameHeader::kUseDcFrame) {
+    CompressParams cparams = enc_state->cparams;  // Local copy to modify.
+    cparams.dots = Override::kOff;
+    cparams.noise = Override::kOff;
+    cparams.patches = Override::kOff;
+    cparams.gaborish = Override::kOff;
+    cparams.epf = 0;
+    cparams.resampling = 1;
+    cparams.ec_resampling = 1;
+    // The DC frame will have alpha=0. Don't erase its contents.
+    cparams.keep_invisible = Override::kOn;
+    JXL_ASSERT(cparams.progressive_dc > 0);
+    cparams.progressive_dc--;
+    // Use kVarDCT in max_error_mode for intermediate progressive DC,
+    // and kModular for the smallest DC (first in the bitstream)
+    if (cparams.progressive_dc == 0) {
+      cparams.modular_mode = true;
+      cparams.speed_tier =
+          SpeedTier(std::max(static_cast<int>(SpeedTier::kTortoise),
+                             static_cast<int>(cparams.speed_tier) - 1));
+      cparams.butteraugli_distance =
+          std::max(kMinButteraugliDistance,
+                   enc_state->cparams.butteraugli_distance * 0.02f);
+    } else {
+      cparams.max_error_mode = true;
+      for (size_t c = 0; c < 3; c++) {
+        cparams.max_error[c] = shared.quantizer.MulDC()[c];
+      }
+      // Guess a distance that produces good initial results.
+      cparams.butteraugli_distance =
+          std::max(kMinButteraugliDistance,
+                   enc_state->cparams.butteraugli_distance * 0.1f);
+    }
+    ImageBundle ib(&shared.metadata->m);
+    // This is a lie - dc is in XYB
+    // (but EncodeFrame will skip RGB->XYB conversion anyway)
+    ib.SetFromImage(
+        std::move(dc),
+        ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray()));
+    if (!ib.metadata()->extra_channel_info.empty()) {
+      // Add dummy extra channels to the patch image: dc_level frames do not yet
+      // support extra channels, but the codec expects that the amount of extra
+      // channels in frames matches that in the metadata of the codestream.
+      std::vector<ImageF> extra_channels;
+      extra_channels.reserve(ib.metadata()->extra_channel_info.size());
+      for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) {
+        extra_channels.emplace_back(ib.xsize(), ib.ysize());
+        // Must initialize the image with data to not affect blending with
+        // uninitialized memory.
+        // TODO(lode): dc_level must copy and use the real extra channels
+        // instead.
+        ZeroFillImage(&extra_channels.back());
+      }
+      ib.SetExtraChannels(std::move(extra_channels));
+    }
+    std::unique_ptr<PassesEncoderState> state =
+        jxl::make_unique<PassesEncoderState>();
+
+    auto special_frame = std::unique_ptr<BitWriter>(new BitWriter());
+    FrameInfo dc_frame_info;
+    dc_frame_info.frame_type = FrameType::kDCFrame;
+    dc_frame_info.dc_level = shared.frame_header.dc_level + 1;
+    dc_frame_info.ib_needs_color_transform = false;
+    dc_frame_info.save_before_color_transform = true;  // Implicitly true
+    AuxOut dc_aux_out;
+    if (aux_out) {
+      dc_aux_out.debug_prefix = aux_out->debug_prefix;
+    }
+    JXL_CHECK(EncodeFrame(cparams, dc_frame_info, shared.metadata, ib,
+                          state.get(), cms, pool, special_frame.get(),
+                          aux_out ? &dc_aux_out : nullptr));
+    if (aux_out) {
+      for (const auto& l : dc_aux_out.layers) {
+        aux_out->layers[kLayerDC].Assimilate(l);
+      }
+    }
+    const Span<const uint8_t> encoded = special_frame->GetSpan();
+    enc_state->special_frames.emplace_back(std::move(special_frame));
+
+    ImageBundle decoded(&shared.metadata->m);
+    std::unique_ptr<PassesDecoderState> dec_state =
+        jxl::make_unique<PassesDecoderState>();
+    JXL_CHECK(
+        dec_state->output_encoding_info.SetFromMetadata(*shared.metadata));
+    const uint8_t* frame_start = encoded.data();
+    size_t encoded_size = encoded.size();
+    for (int i = 0; i <= cparams.progressive_dc; ++i) {
+      JXL_CHECK(DecodeFrame(dec_state.get(), pool, frame_start, encoded_size,
+                            &decoded, *shared.metadata));
+      frame_start += decoded.decoded_bytes();
+      encoded_size -= decoded.decoded_bytes();
+    }
+    // TODO(lode): shared.frame_header.dc_level should be equal to
+    // dec_state.shared->frame_header.dc_level - 1 here, since above we set
+    // dc_frame_info.dc_level = shared.frame_header.dc_level + 1, and
+    // dc_frame_info.dc_level is used by EncodeFrame. However, if EncodeFrame
+    // outputs multiple frames, this assumption could be wrong.
+    shared.dc_storage =
+        CopyImage(dec_state->shared->dc_frames[shared.frame_header.dc_level]);
+    ZeroFillImage(&shared.quant_dc);
+    shared.dc = &shared.dc_storage;
+    JXL_CHECK(encoded_size == 0);  // Every encoded byte must have been decoded.
+  } else {
+    auto compute_dc_coeffs = [&](int group_index, int /* thread */) {
+      modular_frame_encoder->AddVarDCTDC(
+          dc, group_index, enc_state->cparams.speed_tier < SpeedTier::kFalcon,
+          enc_state, /*jpeg_transcode=*/false);
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,
+                                  ThreadPool::NoInit, compute_dc_coeffs,
+                                  "Compute DC coeffs"));
+    // TODO(veluca): this is only useful in tests and if inspection is enabled.
+    if (!(shared.frame_header.flags & FrameHeader::kSkipAdaptiveDCSmoothing)) {
+      AdaptiveDCSmoothing(shared.quantizer.MulDC(), &shared.dc_storage, pool);
+    }
+  }
+  auto compute_ac_meta = [&](int group_index, int /* thread */) {
+    modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/false,
+                                         enc_state);
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,
+                                ThreadPool::NoInit, compute_ac_meta,
+                                "Compute AC Metadata"));
+
+  if (aux_out != nullptr) {
+    aux_out->InspectImage3F("compressed_image:InitializeFrameEncCache:dc_dec",
+                            shared.dc_storage);
+  }
+  return true;
+}
+
+void EncCache::InitOnce() {
+  PROFILER_FUNC;
+
+  // Allocate only while the image is still empty (zero width).
+  if (num_nzeroes.xsize() != 0) return;
+  num_nzeroes = Image3I(kGroupDimInBlocks, kGroupDimInBlocks);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cache.h b/third_party/jpeg-xl/lib/jxl/enc_cache.h
new file mode 100644
index 0000000000..6c7870ba00
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.h
@@ -0,0 +1,93 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_CACHE_H_
+#define LIB_JXL_ENC_CACHE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_heuristics.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_progressive_split.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Contains encoder state.
+struct PassesEncoderState {
+  PassesSharedState shared;
+
+  ImageF initial_quant_field;    // Invalid in Falcon mode.
+  ImageF initial_quant_masking;  // Invalid in Falcon mode.
+
+  // Per-pass DCT coefficients for the image. One row per group.
+  std::vector<std::unique_ptr<ACImage>> coeffs;
+
+  // Raw data for special (reference+DC) frames.
+  std::vector<std::unique_ptr<BitWriter>> special_frames;
+
+  // For splitting into passes.
+  ProgressiveSplitter progressive_splitter;
+
+  CompressParams cparams;
+
+  struct PassData {
+    std::vector<std::vector<Token>> ac_tokens;
+    std::vector<uint8_t> context_map;
+    EntropyEncodingData codes;
+  };
+
+  std::vector<PassData> passes;
+  std::vector<uint8_t> histogram_idx;
+
+  // Coefficient orders that are non-default.
+  std::vector<uint32_t> used_orders;
+
+  // Multipliers for the quant matrices of the x channel (and, per its name, b).
+  float x_qm_multiplier = 1.0f;
+  float b_qm_multiplier = 1.0f;
+
+  // Heuristics to be used by the encoder.
+  std::unique_ptr<EncoderHeuristics> heuristics =
+      make_unique<DefaultEncoderHeuristics>();
+};
+
+// Initialize per-frame information.
+class ModularFrameEncoder;
+Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
+                               ThreadPool* pool,
+                               PassesEncoderState* passes_enc_state,
+                               ModularFrameEncoder* modular_frame_encoder,
+                               AuxOut* aux_out);
+
+// Working area for ComputeCoefficients (per-group!)
+struct EncCache {
+  // Allocates num_nzeroes on the first call; later calls leave it unchanged.
+  void InitOnce();
+
+  // TokenizeCoefficients
+  Image3I num_nzeroes;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CACHE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
new file mode 100644
index 0000000000..0cdd2a7823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
@@ -0,0 +1,409 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_chroma_from_luma.h"
+
+#include <float.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc"
+#include <hwy/aligned_allocator.h>
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/quantizer.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Lt;
+
+static HWY_FULL(float) df;
+
+struct CFLFunction {
+  static constexpr float kCoeff = 1.f / 3;
+  static constexpr float kThres = 100.0f;
+  static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
+  CFLFunction(const float* values_m, const float* values_s, size_t num,
+              float base, float distance_mul)
+      : values_m(values_m),
+        values_s(values_s),
+        num(num),
+        base(base),
+        distance_mul(distance_mul) {}
+
+  // Returns f'(x); stores f'(x+eps), f'(x-eps) in *fpeps, *fmeps. f is 1/3 *
+  // sum ((|color residual| + 1)^2-1) + distance_mul * x^2 * num.
+  float Compute(float x, float eps, float* fpeps, float* fmeps) const {
+    float first_derivative = 2 * distance_mul * num * x;
+    float first_derivative_peps = 2 * distance_mul * num * (x + eps);
+    float first_derivative_meps = 2 * distance_mul * num * (x - eps);
+
+    const auto inv_color_factor = Set(df, kInvColorFactor);
+    const auto thres = Set(df, kThres);
+    const auto coeffx2 = Set(df, kCoeff * 2.0f);
+    const auto one = Set(df, 1.0f);
+    const auto zero = Set(df, 0.0f);
+    const auto base_v = Set(df, base);
+    const auto x_v = Set(df, x);
+    const auto xpe_v = Set(df, x + eps);
+    const auto xme_v = Set(df, x - eps);
+    auto fd_v = Zero(df);
+    auto fdpe_v = Zero(df);
+    auto fdme_v = Zero(df);
+    JXL_ASSERT(num % Lanes(df) == 0);
+
+    for (size_t i = 0; i < num; i += Lanes(df)) {
+      // color residual = ax + b
+      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
+      const auto b =
+          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
+      const auto v = MulAdd(a, x_v, b);
+      const auto vpe = MulAdd(a, xpe_v, b);
+      const auto vme = MulAdd(a, xme_v, b);
+      const auto av = Abs(v);
+      const auto avpe = Abs(vpe);
+      const auto avme = Abs(vme);
+      const auto acoeffx2 = Mul(coeffx2, a);
+      auto d = Mul(acoeffx2, Add(av, one));
+      auto dpe = Mul(acoeffx2, Add(avpe, one));
+      auto dme = Mul(acoeffx2, Add(avme, one));
+      d = IfThenElse(Lt(v, zero), Sub(zero, d), d);
+      dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe);
+      dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme);
+      const auto above = Ge(av, thres);  // Decided at x; reused for x +- eps.
+      // TODO(eustas): use IfThenElseZero
+      fd_v = Add(fd_v, IfThenElse(above, zero, d));
+      fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe));
+      fdme_v = Add(fdme_v, IfThenElse(above, zero, dme));
+    }
+
+    *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v));
+    *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v));
+    return first_derivative + GetLane(SumOfLanes(df, fd_v));
+  }
+
+  const float* JXL_RESTRICT values_m;
+  const float* JXL_RESTRICT values_s;
+  size_t num;
+  float base;
+  float distance_mul;
+};
+
+// Chroma-from-luma search: values_m holds luma and values_s chroma. `num` must
+// be a multiple of Lanes(df), as both paths consume whole vectors.
+int32_t FindBestMultiplier(const float* values_m, const float* values_s,
+                           size_t num, float base, float distance_mul,
+                           bool fast) {
+  if (num == 0) return 0;
+  // Both paths load whole vectors, so a partial trailing vector is invalid.
+  JXL_ASSERT(num % Lanes(df) == 0);
+  float x;
+  if (fast) {
+    static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
+    auto ca = Zero(df);
+    auto cb = Zero(df);
+    const auto inv_color_factor = Set(df, kInvColorFactor);
+    const auto base_v = Set(df, base);
+    for (size_t i = 0; i < num; i += Lanes(df)) {
+      // color residual = ax + b
+      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
+      const auto b =
+          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
+      ca = MulAdd(a, a, ca);
+      cb = MulAdd(a, b, cb);
+    }
+    // + distance_mul * x^2 * num
+    x = -GetLane(SumOfLanes(df, cb)) /
+        (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
+  } else {
+    constexpr float eps = 100;
+    constexpr float kClamp = 20.0f;
+    CFLFunction fn(values_m, values_s, num, base, distance_mul);
+    x = 0;
+    // Up to 20 Newton iterations. The second derivative is a central difference
+    // of f', since the exact derivatives are very noisy.
+    for (size_t i = 0; i < 20; i++) {
+      float dfpeps, dfmeps;
+      const float deriv = fn.Compute(x, eps, &dfpeps, &dfmeps);
+      const float second_deriv = (dfpeps - dfmeps) / (2 * eps);
+      constexpr float kExperimentalInsignificantStabilizer = 0.85;
+      const float step =
+          deriv / (second_deriv + kExperimentalInsignificantStabilizer);
+      x -= std::min(kClamp, std::max(-kClamp, step));
+      if (std::abs(step) < 3e-3) break;
+    }
+  }
+  // CFL seems to be tricky for larger transforms for HF components close to
+  // zero. This heuristic pulls solutions towards zero; reduces red-green
+  // oscillations.
+  const float towards_zero = 2.6;
+  if (x >= towards_zero) {
+    x -= towards_zero;
+  } else if (x <= -towards_zero) {
+    x += towards_zero;
+  } else {
+    x = 0;
+  }
+  return std::max(-128.0f, std::min(127.0f, roundf(x)));
+}
+
+void InitDCStorage(size_t num_blocks, ImageF* dc_values) {
+  // Row 0: Y channel, scaled with the same DC factor as X (see ComputeTile)
+  // Row 1: X channel
+  // Row 2: Y channel, scaled with the same DC factor as B (see ComputeTile)
+  // Row 3: B channel
+  *dc_values = ImageF(RoundUpTo(num_blocks, Lanes(df)), 4);
+
+  JXL_ASSERT(dc_values->xsize() != 0);
+  // Zero-fill the last Lanes(df) entries of each row (covers the padding).
+  for (size_t y = 0; y < 4; y++) {
+    for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize();
+         x++) {
+      dc_values->Row(y)[x] = 0;
+    }
+  }
+}
+
+void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x,
+               int32_t* dc_b) {
+  constexpr float kDistanceMultiplierDC = 1e-5f;
+  const size_t num = dc_values.xsize();
+  // Rows 0/1 pair luma with X; rows 2/3 pair luma with B.
+  const float* JXL_RESTRICT luma_x = dc_values.Row(0);
+  const float* JXL_RESTRICT luma_b = dc_values.Row(2);
+  *dc_x = FindBestMultiplier(luma_x, dc_values.Row(1), num, /*base=*/0.0f,
+                             kDistanceMultiplierDC, fast);
+  *dc_b = FindBestMultiplier(luma_b, dc_values.Row(3), num, kYToBRatio,
+                             kDistanceMultiplierDC, fast);
+}
+
+void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
+                 const AcStrategyImage* ac_strategy,
+                 const ImageI* raw_quant_field, const Quantizer* quantizer,
+                 const Rect& r, bool fast, bool use_dct8, ImageSB* map_x,
+                 ImageSB* map_b, ImageF* dc_values, float* mem) {
+  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
+                "Invalid color tile dim");
+  size_t xsize_blocks = opsin.xsize() / kBlockDim;
+  constexpr float kDistanceMultiplierAC = 1e-9f;
+
+  const size_t y0 = r.y0();
+  const size_t x0 = r.x0();
+  const size_t x1 = r.x0() + r.xsize();
+  const size_t y1 = r.y0() + r.ysize();
+
+  int ty = y0 / kColorTileDimInBlocks;
+  int tx = x0 / kColorTileDimInBlocks;
+
+  int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty);
+  int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty);
+
+  float* JXL_RESTRICT dc_values_yx = dc_values->Row(0);
+  float* JXL_RESTRICT dc_values_x = dc_values->Row(1);
+  float* JXL_RESTRICT dc_values_yb = dc_values->Row(2);
+  float* JXL_RESTRICT dc_values_b = dc_values->Row(3);
+
+  // Carve `mem` into consecutive sub-buffers. All are aligned.
+  float* HWY_RESTRICT block_y = mem;
+  float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea;
+  float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea;
+  float* HWY_RESTRICT coeffs_yx = block_b + AcStrategy::kMaxCoeffArea;
+  float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim;
+  float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
+  float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
+  float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim;
+  JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea ==
+              block_y + CfLHeuristics::kItemsPerThread);
+
+  // Small (~256 bytes each)
+  HWY_ALIGN_MAX float
+      dc_y[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {};
+  HWY_ALIGN_MAX float
+      dc_x[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {};
+  HWY_ALIGN_MAX float
+      dc_b[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {};
+  size_t num_ac = 0;
+
+  for (size_t y = y0; y < y1; ++y) {
+    const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(1, y * kBlockDim);
+    const float* JXL_RESTRICT row_x = opsin.ConstPlaneRow(0, y * kBlockDim);
+    const float* JXL_RESTRICT row_b = opsin.ConstPlaneRow(2, y * kBlockDim);
+    size_t stride = opsin.PixelsPerRow();
+
+    for (size_t x = x0; x < x1; x++) {
+      AcStrategy acs = use_dct8
+                           ? AcStrategy::FromRawStrategy(AcStrategy::Type::DCT)
+                           : ac_strategy->ConstRow(y)[x];
+      if (!acs.IsFirstBlock()) continue;
+      size_t xs = acs.covered_blocks_x();
+      TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride,
+                          block_y, scratch_space);
+      DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs);
+      TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride,
+                          block_x, scratch_space);
+      DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs);
+      TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride,
+                          block_b, scratch_space);
+      DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs);
+      const float* const JXL_RESTRICT qm_x =
+          dequant.InvMatrix(acs.Strategy(), 0);
+      const float* const JXL_RESTRICT qm_b =
+          dequant.InvMatrix(acs.Strategy(), 2);
+      float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
+      float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
+
+      // Copy DCs in dc_values.
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < xs; ix++) {
+          dc_values_yx[(iy + y) * xsize_blocks + ix + x] =
+              dc_y[iy * xs + ix] * q_dc_x;
+          dc_values_x[(iy + y) * xsize_blocks + ix + x] =
+              dc_x[iy * xs + ix] * q_dc_x;
+          dc_values_yb[(iy + y) * xsize_blocks + ix + x] =
+              dc_y[iy * xs + ix] * q_dc_b;
+          dc_values_b[(iy + y) * xsize_blocks + ix + x] =
+              dc_b[iy * xs + ix] * q_dc_b;
+        }
+      }
+
+      // Blocks wider/taller than this rect are not used for AC CfL.
+      if (acs.covered_blocks_x() + x0 > x1 ||
+          acs.covered_blocks_y() + y0 > y1) {
+        continue;
+      }
+
+      // Copy AC coefficients in the local block. The order in which
+      // coefficients get stored does not matter.
+      size_t cx = acs.covered_blocks_x();
+      size_t cy = acs.covered_blocks_y();
+      CoefficientLayout(&cy, &cx);
+      // Zero out LFs. This introduces terms in the optimization loop that
+      // don't affect the result, as they are all 0, but allow for simpler
+      // SIMDfication.
+      for (size_t iy = 0; iy < cy; iy++) {
+        for (size_t ix = 0; ix < cx; ix++) {
+          block_y[cx * kBlockDim * iy + ix] = 0;
+          block_x[cx * kBlockDim * iy + ix] = 0;
+          block_b[cx * kBlockDim * iy + ix] = 0;
+        }
+      }
+      // Unclear why this is like it is. (This works slightly better
+      // than the previous approach which was also a hack.)
+      const float qq =
+          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
+      // Experimentally values 128-130 seem best -- I don't know why we
+      // need this multiplier.
+      const float kStrangeMultiplier = 128;
+      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
+      const auto qv = Set(df, q);
+      for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
+        const auto b_y = Load(df, block_y + i);
+        const auto b_x = Load(df, block_x + i);
+        const auto b_b = Load(df, block_b + i);
+        const auto qqm_x = Mul(qv, Load(df, qm_x + i));
+        const auto qqm_b = Mul(qv, Load(df, qm_b + i));
+        Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac);
+        Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac);
+        Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac);
+        Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac);
+        num_ac += Lanes(df);
+      }
+    }
+  }
+  JXL_CHECK(num_ac % Lanes(df) == 0);
+  row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
+                                     kDistanceMultiplierAC, fast);
+  row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio,
+                                     kDistanceMultiplierAC, fast);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(InitDCStorage);
+HWY_EXPORT(ComputeDC);
+HWY_EXPORT(ComputeTile);
+
+void CfLHeuristics::Init(const Image3F& opsin) {
+  // Size the DC storage for one entry per whole block of `opsin`.
+  const size_t num_blocks =
+      (opsin.xsize() / kBlockDim) * (opsin.ysize() / kBlockDim);
+  HWY_DYNAMIC_DISPATCH(InitDCStorage)(num_blocks, &dc_values);
+}
+
+void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
+                                const DequantMatrices& dequant,
+                                const AcStrategyImage* ac_strategy,
+                                const ImageI* raw_quant_field,
+                                const Quantizer* quantizer, bool fast,
+                                size_t thread, ColorCorrelationMap* cmap) {
+  const bool use_dct8 = (ac_strategy == nullptr);  // No strategy image given.
+  float* const thread_mem = mem.get() + thread * kItemsPerThread;
+  HWY_DYNAMIC_DISPATCH(ComputeTile)
+  (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8,
+   &cmap->ytox_map, &cmap->ytob_map, &dc_values, thread_mem);
+}
+
+void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) {
+  // Fit both DC multipliers, then store them in the correlation map.
+  int32_t ytox_dc = 0, ytob_dc = 0;
+  HWY_DYNAMIC_DISPATCH(ComputeDC)(dc_values, fast, &ytox_dc, &ytob_dc);
+  cmap->SetYToBDC(ytob_dc);
+  cmap->SetYToXDC(ytox_dc);
+}
+
+void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
+                                 size_t layer, AuxOut* aux_out) {
+  const float color_factor = map->GetColorFactor();
+  const float base_correlation_x = map->GetBaseCorrelationX();
+  const float base_correlation_b = map->GetBaseCorrelationB();
+  const int32_t ytox_dc = map->GetYToXDC();
+  const int32_t ytob_dc = map->GetYToBDC();
+  // A single set bit signals that every field has its default value.
+  const bool all_default =
+      ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor &&
+      base_correlation_x == 0.0f && base_correlation_b == kYToBRatio;
+  BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32);
+  writer->Write(1, all_default ? 1 : 0);
+  if (!all_default) {
+    JXL_CHECK(U32Coder::Write(kColorFactorDist, color_factor, writer));
+    JXL_CHECK(F16Coder::Write(base_correlation_x, writer));
+    JXL_CHECK(F16Coder::Write(base_correlation_b, writer));
+    const int32_t bias = std::numeric_limits<int8_t>::min();
+    writer->Write(kBitsPerByte, ytox_dc - bias);
+    writer->Write(kBitsPerByte, ytob_dc - bias);
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
new file mode 100644
index 0000000000..899b91b041
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
@@ -0,0 +1,68 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_CHROMA_FROM_LUMA_H_
+#define LIB_JXL_ENC_CHROMA_FROM_LUMA_H_
+
+// Chroma-from-luma, computed using heuristics to determine the best linear
+// model for the X and B channels from the Y channel.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+struct AuxOut;
+class Quantizer;
+
+void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
+                                 size_t layer, AuxOut* aux_out);
+
+struct CfLHeuristics {
+  void Init(const Image3F& opsin);
+
+  void PrepareForThreads(size_t num_threads) {
+    mem = hwy::AllocateAligned<float>(num_threads * kItemsPerThread);
+  }
+
+  void ComputeTile(const Rect& r, const Image3F& opsin,
+                   const DequantMatrices& dequant,
+                   const AcStrategyImage* ac_strategy,
+                   const ImageI* raw_quant_field, const Quantizer* quantizer,
+                   bool fast, size_t thread, ColorCorrelationMap* cmap);
+
+  void ComputeDC(bool fast, ColorCorrelationMap* cmap);
+
+  ImageF dc_values;
+  hwy::AlignedFreeUniquePtr<float[]> mem;  // kItemsPerThread floats per thread.
+
+  // Floats per thread in `mem`; too large for the stack, so heap-allocated.
+  constexpr static size_t kItemsPerThread =
+      AcStrategy::kMaxCoeffArea * 3        // Blocks
+      + kColorTileDim * kColorTileDim * 4  // AC coeff storage
+      + AcStrategy::kMaxCoeffArea * 2;     // Scratch space
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CHROMA_FROM_LUMA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cluster.cc b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc
new file mode 100644
index 0000000000..c79b3ac834
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc
@@ -0,0 +1,295 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_cluster.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <queue>
+#include <tuple>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_cluster.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/fast_math-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <class V>
+V Entropy(V count, V inv_total, V total) {
+  const HWY_CAPPED(float, Histogram::kRounding) d;
+  // Per-lane -count * log2(count * inv_total); lanes where count == total
+  // yield zero. TODO(eustas): why (0 - x) instead of Neg(x)?
+  const auto log2_prob = FastLog2f(d, Mul(inv_total, count));
+  const auto neg_bits = Sub(Set(d, 0.0f), Mul(count, log2_prob));
+  return IfThenZeroElse(Eq(count, total), neg_bits);
+}
+
+void HistogramEntropy(const Histogram& a) {
+  // entropy_ is mutable, so it can be refreshed through a const reference.
+  a.entropy_ = 0.0f;
+  if (a.total_count_ == 0) return;
+
+  const HWY_CAPPED(float, Histogram::kRounding) df;
+  const HWY_CAPPED(int32_t, Histogram::kRounding) di;
+
+  const auto total = Set(df, a.total_count_);
+  const auto inv_total = Set(df, 1.0f / a.total_count_);
+  auto sum = Zero(df);
+  const size_t num_bins = a.data_.size();
+  for (size_t pos = 0; pos < num_bins; pos += Lanes(di)) {
+    const auto bin_counts = ConvertTo(df, LoadU(di, &a.data_[pos]));
+    sum = Add(sum, Entropy(bin_counts, inv_total, total));
+  }
+  a.entropy_ += GetLane(SumOfLanes(df, sum));
+}
+
+float HistogramDistance(const Histogram& a, const Histogram& b) {
+  // Entropy of the merged histogram minus the cached entropies of a and b.
+  if (a.total_count_ == 0 || b.total_count_ == 0) return 0;
+
+  const HWY_CAPPED(float, Histogram::kRounding) df;
+  const HWY_CAPPED(int32_t, Histogram::kRounding) di;
+
+  const size_t merged_count = a.total_count_ + b.total_count_;
+  const auto total = Set(df, merged_count);
+  const auto inv_total = Set(df, 1.0f / merged_count);
+  const size_t size_a = a.data_.size();
+  const size_t size_b = b.data_.size();
+  auto sum = Zero(df);
+  for (size_t pos = 0; pos < std::max(size_a, size_b); pos += Lanes(di)) {
+    // The shorter histogram contributes zeros past its end.
+    const auto from_a = pos < size_a ? LoadU(di, &a.data_[pos]) : Zero(di);
+    const auto from_b = pos < size_b ? LoadU(di, &b.data_[pos]) : Zero(di);
+    const auto merged = ConvertTo(df, Add(from_a, from_b));
+    sum = Add(sum, Entropy(merged, inv_total, total));
+  }
+  return GetLane(SumOfLanes(df, sum)) - a.entropy_ - b.entropy_;
+}
+
+// k-means-style first step: greedy seeding, then nearest-seed assignment.
+void FastClusterHistograms(const std::vector<Histogram>& in,
+                           size_t max_histograms, std::vector<Histogram>* out,
+                           std::vector<uint32_t>* histogram_symbols) {
+  PROFILER_FUNC;
+  out->clear();
+  out->reserve(max_histograms);
+  histogram_symbols->clear();
+  histogram_symbols->resize(in.size(), max_histograms);
+
+  std::vector<float> dists(in.size(), std::numeric_limits<float>::max());
+  size_t largest_idx = 0;
+  for (size_t i = 0; i < in.size(); i++) {
+    if (in[i].total_count_ == 0) {
+      (*histogram_symbols)[i] = 0;
+      dists[i] = 0.0f;
+      continue;
+    }
+    HistogramEntropy(in[i]);
+    if (in[i].total_count_ > in[largest_idx].total_count_) {
+      largest_idx = i;
+    }
+  }
+
+  constexpr float kMinDistanceForDistinct = 48.0f;
+  while (out->size() < max_histograms) {
+    (*histogram_symbols)[largest_idx] = out->size();
+    out->push_back(in[largest_idx]);
+    dists[largest_idx] = 0.0f;
+    largest_idx = 0;
+    for (size_t i = 0; i < in.size(); i++) {
+      if (dists[i] == 0.0f) continue;
+      dists[i] = std::min(HistogramDistance(in[i], out->back()), dists[i]);
+      if (dists[i] > dists[largest_idx]) largest_idx = i;
+    }
+    if (dists[largest_idx] < kMinDistanceForDistinct) break;
+  }
+
+  for (size_t i = 0; i < in.size(); i++) {
+    if ((*histogram_symbols)[i] != max_histograms) continue;
+    size_t best = 0;
+    float best_dist = HistogramDistance(in[i], (*out)[best]);
+    for (size_t j = 1; j < out->size(); j++) {
+      float dist = HistogramDistance(in[i], (*out)[j]);
+      if (dist < best_dist) {
+        best = j;
+        best_dist = dist;
+      }
+    }
+    (*out)[best].AddHistogram(in[i]);
+    HistogramEntropy((*out)[best]);
+    (*histogram_symbols)[i] = best;
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(FastClusterHistograms);  // Local function
+HWY_EXPORT(HistogramEntropy);       // Local function
+
+float Histogram::ShannonEntropy() const {
+  HWY_DYNAMIC_DISPATCH(HistogramEntropy)(*this);  // Recomputes entropy_.
+  return entropy_;
+}
+
+namespace {
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// Reorder histograms in *out so that the new symbols in *symbols come in
+// increasing order.
+void HistogramReindex(std::vector<Histogram>* out,
+                      std::vector<uint32_t>* symbols) {
+  // Work from a snapshot, as *out is overwritten in place below.
+  const std::vector<Histogram> snapshot(*out);
+  std::map<int, int> remap;
+  int num_used = 0;
+  for (const uint32_t old_symbol : *symbols) {
+    // Symbols are renumbered in order of first appearance.
+    if (!remap.emplace(old_symbol, num_used).second) continue;
+    (*out)[num_used] = snapshot[old_symbol];
+    ++num_used;
+  }
+  out->resize(num_used);
+  for (uint32_t& symbol : *symbols) {
+    symbol = remap[symbol];
+  }
+}
+
+}  // namespace
+
+// Clusters similar histograms in 'in' together, the selected histograms are
+// placed in 'out', and for each index in 'in', *histogram_symbols will
+// indicate which of the 'out' histograms is the best approximation.
+void ClusterHistograms(const HistogramParams params,
+                       const std::vector<Histogram>& in, size_t max_histograms,
+                       std::vector<Histogram>* out,
+                       std::vector<uint32_t>* histogram_symbols) {
+  max_histograms = std::min(max_histograms, params.max_histograms);
+  max_histograms = std::min(max_histograms, in.size());
+  if (params.clustering == HistogramParams::ClusteringType::kFastest) {
+    max_histograms = std::min(max_histograms, static_cast<size_t>(4));
+  }
+
+  HWY_DYNAMIC_DISPATCH(FastClusterHistograms)
+  (in, max_histograms, out, histogram_symbols);
+
+  if (params.clustering == HistogramParams::ClusteringType::kBest) {
+    for (size_t i = 0; i < out->size(); i++) {
+      (*out)[i].entropy_ =
+          ANSPopulationCost((*out)[i].data_.data(), (*out)[i].data_.size());
+    }
+    uint32_t next_version = 2;
+    std::vector<uint32_t> version(out->size(), 1);
+    std::vector<uint32_t> renumbering(out->size());
+    std::iota(renumbering.begin(), renumbering.end(), 0);
+
+    // Try to pair up clusters if doing so reduces the total cost.
+
+    struct HistogramPair {
+      // validity of a pair: p.version == max(version[i], version[j])
+      float cost;
+      uint32_t first;
+      uint32_t second;
+      uint32_t version;
+      // We use > because priority queues sort in *decreasing* order, but we
+      // want lower cost elements to appear first.
+      bool operator<(const HistogramPair& other) const {
+        return std::make_tuple(cost, first, second, version) >
+               std::make_tuple(other.cost, other.first, other.second,
+                               other.version);
+      }
+    };
+
+    // Create list of all pairs by increasing merging cost.
+    std::priority_queue<HistogramPair> pairs_to_merge;
+    for (uint32_t i = 0; i < out->size(); i++) {
+      for (uint32_t j = i + 1; j < out->size(); j++) {
+        Histogram histo;
+        histo.AddHistogram((*out)[i]);
+        histo.AddHistogram((*out)[j]);
+        float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) -
+                     (*out)[i].entropy_ - (*out)[j].entropy_;
+        // Avoid enqueueing pairs that are not advantageous to merge.
+        if (cost >= 0) continue;
+        pairs_to_merge.push(
+            HistogramPair{cost, i, j, std::max(version[i], version[j])});
+      }
+    }
+
+    // Merge the best pair to merge, add new pairs that get formed as a
+    // consequence.
+    while (!pairs_to_merge.empty()) {
+      uint32_t first = pairs_to_merge.top().first;
+      uint32_t second = pairs_to_merge.top().second;
+      uint32_t ver = pairs_to_merge.top().version;
+      pairs_to_merge.pop();
+      if (ver != std::max(version[first], version[second]) ||
+          version[first] == 0 || version[second] == 0) {
+        continue;
+      }
+      (*out)[first].AddHistogram((*out)[second]);
+      (*out)[first].entropy_ = ANSPopulationCost((*out)[first].data_.data(),
+                                                 (*out)[first].data_.size());
+      for (size_t i = 0; i < renumbering.size(); i++) {
+        if (renumbering[i] == second) {
+          renumbering[i] = first;
+        }
+      }
+      version[second] = 0;  // 0 marks a cluster that was merged away.
+      version[first] = next_version++;
+      for (uint32_t j = 0; j < out->size(); j++) {
+        if (j == first) continue;
+        if (version[j] == 0) continue;
+        Histogram histo;
+        histo.AddHistogram((*out)[first]);
+        histo.AddHistogram((*out)[j]);
+        float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) -
+                     (*out)[first].entropy_ - (*out)[j].entropy_;
+        // Avoid enqueueing pairs that are not advantageous to merge.
+        if (cost >= 0) continue;
+        pairs_to_merge.push(
+            HistogramPair{cost, std::min(first, j), std::max(first, j),
+                          std::max(version[first], version[j])});
+      }
+    }
+    std::vector<uint32_t> reverse_renumbering(out->size(), -1);
+    size_t num_alive = 0;
+    for (size_t i = 0; i < out->size(); i++) {
+      if (version[i] == 0) continue;
+      (*out)[num_alive++] = (*out)[i];  // Compact the surviving clusters.
+      reverse_renumbering[i] = num_alive - 1;
+    }
+    out->resize(num_alive);
+    for (size_t i = 0; i < histogram_symbols->size(); i++) {
+      (*histogram_symbols)[i] =
+          reverse_renumbering[renumbering[(*histogram_symbols)[i]]];
+    }
+  }
+
+  // Convert the context map to a canonical form.
+  HistogramReindex(out, histogram_symbols);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cluster.h b/third_party/jpeg-xl/lib/jxl/enc_cluster.h
new file mode 100644
index 0000000000..4b062e820c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.h
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Functions for clustering similar histograms together.
+
+#ifndef LIB_JXL_ENC_CLUSTER_H_
+#define LIB_JXL_ENC_CLUSTER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/enc_ans.h"
+
+namespace jxl {
+
+struct Histogram {
+  // Starts with no bins, a zero total count and a zero cached entropy.
+  Histogram() : total_count_(0), entropy_(0.0) {}
+  // Drops all bins and the total count; entropy_ is left as it was.
+  void Clear() {
+    total_count_ = 0;
+    data_.clear();
+  }
+  // Counts `symbol` once; the bin array is resized first when it is too short.
+  void Add(size_t symbol) {
+    if (symbol >= data_.size()) {
+      data_.resize(DivCeil(symbol + 1, kRounding) * kRounding);
+    }
+    ++data_[symbol];
+    ++total_count_;
+  }
+  // Adds the bins and the total count of `other` to this histogram.
+  void AddHistogram(const Histogram& other) {
+    const size_t num_bins = other.data_.size();
+    if (data_.size() < num_bins) data_.resize(num_bins);
+    for (size_t bin = 0; bin < num_bins; ++bin) {
+      data_[bin] += other.data_[bin];
+    }
+    total_count_ += other.total_count_;
+  }
+  float PopulationCost() const {
+    return ANSPopulationCost(data_.data(), data_.size());
+  }
+  float ShannonEntropy() const;
+
+  std::vector<ANSHistBin> data_;
+  size_t total_count_;
+  mutable float entropy_;  // WARNING: not kept up-to-date.
+  static constexpr size_t kRounding = 8;
+};
+
+void ClusterHistograms(HistogramParams params, const std::vector<Histogram>& in,
+                       size_t max_histograms, std::vector<Histogram>* out,
+                       std::vector<uint32_t>* histogram_symbols);
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CLUSTER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc
new file mode 100644
index 0000000000..389b53598a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc
@@ -0,0 +1,291 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/lehmer_code.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+std::pair<uint32_t, uint32_t> ComputeUsedOrders(
+    const SpeedTier speed, const AcStrategyImage& ac_strategy,
+    const Rect& rect) {
+  // For speed >= kFalcon only DCT8 (order 0) is used, so only bit 0 is set.
+  if (speed >= SpeedTier::kFalcon) return {1, 1};
+
+  // Bit `ord` of `used` is set when any block in `rect` maps to that order;
+  // `customizable` keeps only the orders with index <= 6.
+  uint32_t used = 0;
+  uint32_t customizable = 0;
+  const size_t num_rows = rect.ysize();
+  const size_t num_cols = rect.xsize();
+  // TODO(veluca): precompute when doing DCT.
+  for (size_t by = 0; by < num_rows; ++by) {
+    AcStrategyRow row = ac_strategy.ConstRow(rect, by);
+    for (size_t bx = 0; bx < num_cols; ++bx) {
+      const int ord = kStrategyOrder[row[bx].RawStrategy()];
+      const uint32_t bit = 1u << ord;
+      used |= bit;
+      // Do not customize coefficient orders for blocks bigger than 32x32.
+      if (ord <= 6) customizable |= bit;
+    }
+  }
+  // Use default orders for small images.
+  const bool is_small = ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5;
+  return {used, is_small ? 0u : customizable};
+}
+
+void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs,
+                       const AcStrategyImage& ac_strategy,
+                       const FrameDimensions& frame_dim, uint32_t& used_orders,
+                       uint16_t used_acs, coeff_order_t* JXL_RESTRICT order) {
+  std::vector<int32_t> num_zeros(kCoeffOrderMaxSize);  // per-slot zero counts
+  // If compressing at high speed and only using 8x8 DCTs, only consider a
+  // subset of blocks.
+  double block_fraction = 1.0f;
+  // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes
+  // encoding significantly less dense.
+  if (speed >= SpeedTier::kSquirrel && used_orders == 1) {
+    block_fraction = 0.5f;
+  }
+  // Zero counts are only needed if some order may become non-default.
+  if (used_orders != 0) {
+    uint64_t threshold =
+        (std::numeric_limits<uint64_t>::max() >> 32) * block_fraction;
+    uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
+                     static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
+    // Xorshift128+ adapted from xorshift128+-inl.h; true = sample this block.
+    auto use_sample = [&]() {
+      auto s1 = s[0];
+      const auto s0 = s[1];
+      const auto bits = s1 + s0;  // b, c
+      s[0] = s0;
+      s1 ^= s1 << 23;
+      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+      s[1] = s1;
+      return (bits >> 32) <= threshold;
+    };
+
+    // Count number of zero coefficients, separately for each DCT band.
+    // TODO(veluca): precompute when doing DCT.
+    for (size_t group_index = 0; group_index < frame_dim.num_groups;
+         group_index++) {
+      const size_t gx = group_index % frame_dim.xsize_groups;
+      const size_t gy = group_index / frame_dim.xsize_groups;
+      const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks,
+                      kGroupDimInBlocks, kGroupDimInBlocks,
+                      frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+      ConstACPtr rows[3];
+      ACType type = acs.Type();
+      for (size_t c = 0; c < 3; c++) {
+        rows[c] = acs.PlaneRow(c, group_index, 0);
+      }
+      size_t ac_offset = 0;
+
+      // TODO(veluca): SIMDfy.
+      for (size_t by = 0; by < rect.ysize(); ++by) {
+        AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
+        for (size_t bx = 0; bx < rect.xsize(); ++bx) {
+          AcStrategy acs = acs_row[bx];
+          if (!acs.IsFirstBlock()) continue;
+          size_t size = kDCTBlockSize << acs.log2_covered_blocks();
+          if (!use_sample()) {
+            // A skipped block still occupies `size` coefficients per row.
+            ac_offset += size;
+            continue;
+          }
+          for (size_t c = 0; c < 3; ++c) {
+            const size_t order_offset =
+                CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c);
+            if (type == ACType::k16) {
+              for (size_t k = 0; k < size; k++) {
+                bool is_zero = rows[c].ptr16[ac_offset + k] == 0;
+                num_zeros[order_offset + k] += is_zero ? 1 : 0;
+              }
+            } else {
+              for (size_t k = 0; k < size; k++) {
+                bool is_zero = rows[c].ptr32[ac_offset + k] == 0;
+                num_zeros[order_offset + k] += is_zero ? 1 : 0;
+              }
+            }
+            // Ensure LLFs are first in the order.
+            size_t cx = acs.covered_blocks_x();
+            size_t cy = acs.covered_blocks_y();
+            CoefficientLayout(&cy, &cx);
+            for (size_t iy = 0; iy < cy; iy++) {
+              for (size_t ix = 0; ix < cx; ix++) {
+                num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1;
+              }
+            }
+          }
+          ac_offset += size;
+        }
+      }
+    }
+  }
+  struct PosAndCount {
+    uint32_t pos;    // coefficient position taken from the natural order
+    uint32_t count;  // quantized zero count, used as the sort key
+  };
+  auto mem = hwy::AllocateAligned<PosAndCount>(AcStrategy::kMaxCoeffArea);
+
+  std::vector<coeff_order_t> natural_order_buffer;
+
+  uint16_t computed = 0;  // bitmask of orders already handled
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    uint8_t ord = kStrategyOrder[o];
+    if (computed & (1 << ord)) continue;
+    computed |= 1 << ord;
+    AcStrategy acs = AcStrategy::FromRawStrategy(o);
+    size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y();
+
+    // Do nothing for transforms that don't appear.
+    if ((1 << ord) & ~used_acs) continue;
+
+    if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz);
+    acs.ComputeNaturalCoeffOrder(natural_order_buffer.data());
+
+    // Orders that are not transmitted must stay in natural order.
+    if ((1 << ord) & ~used_orders) {
+      for (size_t c = 0; c < 3; c++) {
+        size_t offset = CoeffOrderOffset(ord, c);
+        JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz);
+        memcpy(&order[offset], natural_order_buffer.data(),
+               sz * sizeof(*order));
+      }
+      continue;
+    }
+
+    bool is_nondefault = false;
+    for (uint8_t c = 0; c < 3; c++) {
+      // Apply zig-zag order.
+      PosAndCount* pos_and_val = mem.get();
+      size_t offset = CoeffOrderOffset(ord, c);
+      JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz);
+      float inv_sqrt_sz = 1.0f / std::sqrt(sz);
+      for (size_t i = 0; i < sz; ++i) {
+        size_t pos = natural_order_buffer[i];
+        pos_and_val[i].pos = pos;
+        // Quantize the zero count: exact values don't matter, fewer swaps do.
+        pos_and_val[i].count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f;
+      }
+
+      // Stable sort: entries with equal counts keep their relative order.
+      auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool {
+        return a.count < b.count;
+      };
+      std::stable_sort(pos_and_val, pos_and_val + sz, comparator);
+
+      // Grab indices.
+      for (size_t i = 0; i < sz; ++i) {
+        order[offset + i] = pos_and_val[i].pos;
+        is_nondefault |= natural_order_buffer[i] != pos_and_val[i].pos;
+      }
+    }
+    if (!is_nondefault) {  // all three channels kept the natural order
+      used_orders &= ~(1 << ord);
+    }
+  }
+}
+
+namespace {
+
+void TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
+                         size_t size, std::vector<Token>* tokens) {
+  std::vector<LehmerT> code(size);
+  std::vector<uint32_t> scratch(size + 1);
+  ComputeLehmerCode(order, scratch.data(), size, code.data());
+  // Trailing zeros are not emitted; only the count of kept entries is.
+  size_t used = size;
+  while (used > skip && code[used - 1] == 0) --used;
+  tokens->emplace_back(CoeffOrderContext(size), used - skip);
+  // Every kept entry is tokenized in a context derived from its predecessor.
+  uint32_t prev = 0;
+  for (size_t pos = skip; pos < used; ++pos) {
+    tokens->emplace_back(CoeffOrderContext(prev), code[pos]);
+    prev = code[pos];
+  }
+}
+
+}  // namespace
+
+void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
+                       size_t size, BitWriter* writer, int layer,
+                       AuxOut* aux_out) {
+  std::vector<std::vector<Token>> streams(1);  // one stream for everything
+  TokenizePermutation(order, skip, size, &streams.front());
+  EntropyEncodingData entropy;
+  std::vector<uint8_t> ctx_map;
+  BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, streams,
+                           &entropy, &ctx_map, writer, layer, aux_out);
+  WriteTokens(streams.front(), entropy, ctx_map, writer, layer, aux_out);
+}
+
+namespace {
+void EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs,
+                      std::vector<Token>* tokens, coeff_order_t* order_zigzag,
+                      std::vector<coeff_order_t>& natural_order_lut) {
+  const size_t num_llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+  const size_t num_coeffs = kDCTBlockSize * num_llf;
+  // Map every entry of `order` through `natural_order_lut`.
+  std::transform(order, order + num_coeffs, order_zigzag,
+                 [&](coeff_order_t pos) { return natural_order_lut[pos]; });
+  TokenizePermutation(order_zigzag, num_llf, num_coeffs, tokens);
+}
+}  // namespace
+
+void EncodeCoeffOrders(uint16_t used_orders,
+                       const coeff_order_t* JXL_RESTRICT order,
+                       BitWriter* writer, size_t layer,
+                       AuxOut* JXL_RESTRICT aux_out) {
+  // Do not write anything if no order is used.
+  if (used_orders == 0) return;
+  auto buf = hwy::AllocateAligned<coeff_order_t>(AcStrategy::kMaxCoeffArea);
+  std::vector<coeff_order_t> lut;
+  std::vector<std::vector<Token>> streams(1);
+  uint16_t done = 0;  // orders already visited (strategies can share one)
+  for (uint8_t raw = 0; raw < AcStrategy::kNumValidStrategies; ++raw) {
+    const uint8_t ord = kStrategyOrder[raw];
+    const int bit = 1 << ord;
+    if (done & bit) continue;
+    done |= bit;
+    if (!(used_orders & bit)) continue;
+    AcStrategy acs = AcStrategy::FromRawStrategy(raw);
+    const size_t num_llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+    const size_t area = kDCTBlockSize * num_llf;
+    if (lut.size() < area) lut.resize(area);
+    acs.ComputeNaturalCoeffOrderLut(lut.data());
+    for (size_t c = 0; c < 3; c++) {
+      EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &streams[0],
+                       buf.get(), lut);
+    }
+  }
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, streams,
+                           &codes, &context_map, writer, layer, aux_out);
+  WriteTokens(streams[0], codes, context_map, writer, layer, aux_out);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h
new file mode 100644
index 0000000000..3a43f4f986
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h
@@ -0,0 +1,54 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_COEFF_ORDER_H_
+#define LIB_JXL_ENC_COEFF_ORDER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Orders that are actually used in part of image. `rect` is in block units.
+// Returns {orders that are used, orders that might be made non-default}.
+std::pair<uint32_t, uint32_t> ComputeUsedOrders(
+    SpeedTier speed, const AcStrategyImage& ac_strategy, const Rect& rect);
+
+// Modify zig-zag order, so that DCT bands with more zeros go later.
+// Order of DCT bands with same number of zeros is untouched, so
+// permutation will be cheaper to encode.
+void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs,
+                       const AcStrategyImage& ac_strategy,
+                       const FrameDimensions& frame_dim, uint32_t& used_orders,
+                       uint16_t used_acs, coeff_order_t* JXL_RESTRICT order);
+
+void EncodeCoeffOrders(uint16_t used_orders,
+                       const coeff_order_t* JXL_RESTRICT order,
+                       BitWriter* writer, size_t layer,
+                       AuxOut* JXL_RESTRICT aux_out);
+
+// Encoding of a single permutation. `size`: number of elements in the
+// permutation. `skip`: number of elements to skip from the *beginning* of the
+// permutation.
+void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
+                       size_t size, BitWriter* writer, int layer,
+                       AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COEFF_ORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_color_management.cc b/third_party/jpeg-xl/lib/jxl/enc_color_management.cc
new file mode 100644
index 0000000000..8a23ead473
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_color_management.cc
@@ -0,0 +1,1293 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_color_management.h"
+
+#ifndef JPEGXL_ENABLE_SKCMS
+#define JPEGXL_ENABLE_SKCMS 0
+#endif
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_color_management.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/matrix_ops.h"
+#include "lib/jxl/transfer_functions-inl.h"
+#if JPEGXL_ENABLE_SKCMS
+#include "lib/jxl/enc_jxl_skcms.h"
+#else  // JPEGXL_ENABLE_SKCMS
+#include "lcms2.h"
+#include "lcms2_plugin.h"
+#endif  // JPEGXL_ENABLE_SKCMS
+
+#define JXL_CMS_VERBOSE 0
+
+// Define these only once. We can't use HWY_ONCE here because it is defined as
+// 1 only on the last pass.
+#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+#define LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+
+namespace jxl {
+namespace {
+struct JxlCms {
+#if JPEGXL_ENABLE_SKCMS
+  PaddedBytes icc_src, icc_dst;
+  skcms_ICCProfile profile_src, profile_dst;
+#else
+  void* lcms_transform = nullptr;
+#endif
+
+  // These fields are used when the HLG OOTF or inverse OOTF must be applied.
+  bool apply_hlg_ootf = false;
+  size_t hlg_ootf_num_channels = 0;
+  // Y component of the primaries.
+  std::array<float, 3> hlg_ootf_luminances = {};
+
+  size_t channels_src = 0;  // floats per pixel on the input side
+  size_t channels_dst = 0;  // floats per pixel on the output side
+  ImageF buf_src;           // scratch rows, indexed by thread
+  ImageF buf_dst;           // scratch rows, indexed by thread
+  float intensity_target = 0.f;  // scales PQ values in Before/AfterTransform
+  bool skip_lcms = false;        // copy pixels instead of running the CMS
+  ExtraTF preprocess = ExtraTF::kNone;
+  ExtraTF postprocess = ExtraTF::kNone;
+};
+
+Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize,
+                    bool forward);
+}  // namespace
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+#if JXL_CMS_VERBOSE >= 2
+const size_t kX = 0;  // pixel index, multiplied by 3 for RGB
+#endif
+
+// Undoes the transfer function selected by t->preprocess: reads buf_size
+// floats from buf_src and stores the decoded values in xform_src.
+Status BeforeTransform(JxlCms* t, const float* buf_src, float* xform_src,
+                       size_t buf_size) {
+  switch (t->preprocess) {
+    case ExtraTF::kNone:
+      JXL_DASSERT(false);  // unreachable
+      break;
+
+    case ExtraTF::kPQ: {
+      // An intensity target of exactly 10000 (the PQ default) needs no scaling.
+      const float scale = t->intensity_target == 10000.f
+                              ? 1.0f
+                              : 10000.f / t->intensity_target;
+      HWY_FULL(float) df;
+      const auto vscale = Set(df, scale);
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto encoded = Load(df, buf_src + i);
+        const auto linear = TF_PQ().DisplayFromEncoded(df, encoded);
+        Store(Mul(vscale, linear), df, xform_src + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("pre in %.4f %.4f %.4f undoPQ %.4f %.4f %.4f\n", buf_src[3 * kX],
+             buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX],
+             xform_src[3 * kX + 1], xform_src[3 * kX + 2]);
+#endif
+      break;
+    }
+
+    case ExtraTF::kHLG:
+      for (size_t i = 0; i < buf_size; ++i) {
+        const double encoded = static_cast<double>(buf_src[i]);
+        xform_src[i] = static_cast<float>(TF_HLG().DisplayFromEncoded(encoded));
+      }
+      if (t->apply_hlg_ootf) {
+        JXL_RETURN_IF_ERROR(
+            ApplyHlgOotf(t, xform_src, buf_size, /*forward=*/true));
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("pre in %.4f %.4f %.4f undoHLG %.4f %.4f %.4f\n", buf_src[3 * kX],
+             buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX],
+             xform_src[3 * kX + 1], xform_src[3 * kX + 2]);
+#endif
+      break;
+
+    case ExtraTF::kSRGB: {
+      HWY_FULL(float) df;
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto linear = TF_SRGB().DisplayFromEncoded(Load(df, buf_src + i));
+        Store(linear, df, xform_src + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("pre in %.4f %.4f %.4f undoSRGB %.4f %.4f %.4f\n", buf_src[3 * kX],
+             buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX],
+             xform_src[3 * kX + 1], xform_src[3 * kX + 2]);
+#endif
+      break;
+    }
+  }
+  return true;
+}
+
+// Re-applies the transfer function selected by t->postprocess to the buf_size
+// floats in buf_dst, in place (gamma compression).
+Status AfterTransform(JxlCms* t, float* JXL_RESTRICT buf_dst, size_t buf_size) {
+  switch (t->postprocess) {
+    case ExtraTF::kNone:
+      JXL_DASSERT(false);  // unreachable
+      break;
+    case ExtraTF::kPQ: {
+      // An intensity target of exactly 10000 needs no scaling.
+      const float scale = t->intensity_target == 10000.f
+                              ? 1.0f
+                              : t->intensity_target * 1e-4f;
+      HWY_FULL(float) df;
+      const auto vscale = Set(df, scale);
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto linear = Mul(vscale, Load(df, buf_dst + i));
+        Store(TF_PQ().EncodedFromDisplay(df, linear), df, buf_dst + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("after PQ enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
+             buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+      break;
+    }
+    case ExtraTF::kHLG:
+      if (t->apply_hlg_ootf) {
+        JXL_RETURN_IF_ERROR(
+            ApplyHlgOotf(t, buf_dst, buf_size, /*forward=*/false));
+      }
+      for (size_t i = 0; i < buf_size; ++i) {
+        const double linear = static_cast<double>(buf_dst[i]);
+        buf_dst[i] = static_cast<float>(TF_HLG().EncodedFromDisplay(linear));
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("after HLG enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
+             buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+      break;
+    case ExtraTF::kSRGB: {
+      HWY_FULL(float) df;
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto linear = Load(df, buf_dst + i);
+        Store(TF_SRGB().EncodedFromDisplay(df, linear), df, buf_dst + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("after SRGB enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
+             buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+      break;
+    }
+  }
+  return true;
+}
+
+Status DoColorSpaceTransform(void* cms_data, const size_t thread,
+                             const float* buf_src, float* buf_dst,
+                             size_t xsize) {
+  // Runs preprocess -> CMS transform (or plain copy) -> postprocess on one row.
+  JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);  // No lock needed.
+
+  const float* xform_src = buf_src;  // Read-only.
+  if (t->preprocess != ExtraTF::kNone) {
+    float* mutable_xform_src = t->buf_src.Row(thread);  // Writable buffer.
+    JXL_RETURN_IF_ERROR(BeforeTransform(t, buf_src, mutable_xform_src,
+                                        xsize * t->channels_src));
+    xform_src = mutable_xform_src;
+  }
+
+#if JPEGXL_ENABLE_SKCMS
+  if (t->channels_src == 1 && !t->skip_lcms) {
+    // Expand from 1 to 3 channels, starting from the end in case
+    // xform_src == t->buf_src.Row(thread).
+    float* mutable_xform_src = t->buf_src.Row(thread);
+    for (size_t i = 0; i < xsize; ++i) {
+      const size_t x = xsize - i - 1;
+      mutable_xform_src[x * 3] = mutable_xform_src[x * 3 + 1] =
+          mutable_xform_src[x * 3 + 2] = xform_src[x];
+    }
+    xform_src = mutable_xform_src;
+  }
+#else
+  if (t->channels_src == 4 && !t->skip_lcms) {
+    // LCMS does CMYK in a weird way: 0 = white, 100 = max ink
+    float* mutable_xform_src = t->buf_src.Row(thread);  // output only
+    for (size_t x = 0; x < xsize * 4; ++x) {  // in-place when the two alias
+      mutable_xform_src[x] = 100.f - 100.f * xform_src[x];
+    }
+    xform_src = mutable_xform_src;
+  }
+#endif
+
+#if JXL_CMS_VERBOSE >= 2
+  // Save inputs for printing before in-place transforms overwrite them.
+  const float in0 = xform_src[3 * kX + 0];
+  const float in1 = xform_src[3 * kX + 1];
+  const float in2 = xform_src[3 * kX + 2];
+#endif
+
+  if (t->skip_lcms) {
+    if (buf_dst != xform_src) {
+      memcpy(buf_dst, xform_src, xsize * t->channels_src * sizeof(*buf_dst));
+    }  // else: in-place, no need to copy
+  } else {
+#if JPEGXL_ENABLE_SKCMS
+    JXL_CHECK(
+        skcms_Transform(xform_src,
+                        (t->channels_src == 4 ? skcms_PixelFormat_RGBA_ffff
+                                              : skcms_PixelFormat_RGB_fff),
+                        skcms_AlphaFormat_Opaque, &t->profile_src, buf_dst,
+                        skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque,
+                        &t->profile_dst, xsize));
+#else   // JPEGXL_ENABLE_SKCMS
+    cmsDoTransform(t->lcms_transform, xform_src, buf_dst,
+                   static_cast<cmsUInt32Number>(xsize));
+#endif  // JPEGXL_ENABLE_SKCMS
+  }
+#if JXL_CMS_VERBOSE >= 2
+  printf("xform skip%d: %.4f %.4f %.4f (%p) -> (%p) %.4f %.4f %.4f\n",
+         t->skip_lcms, in0, in1, in2, xform_src, buf_dst, buf_dst[3 * kX],
+         buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+
+#if JPEGXL_ENABLE_SKCMS
+  if (t->channels_dst == 1 && !t->skip_lcms) {
+    // Contract back from 3 to 1 channel, this time forward.
+    float* grayscale_buf_dst = t->buf_dst.Row(thread);
+    for (size_t x = 0; x < xsize; ++x) {
+      grayscale_buf_dst[x] = buf_dst[x * 3];
+    }
+    buf_dst = grayscale_buf_dst;
+  }
+#endif
+
+  if (t->postprocess != ExtraTF::kNone) {
+    JXL_RETURN_IF_ERROR(AfterTransform(t, buf_dst, xsize * t->channels_dst));
+  }
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(DoColorSpaceTransform);  // per-target table (presumably; confirm)
+int DoColorSpaceTransform(void* t, size_t thread, const float* buf_src,
+                          float* buf_dst, size_t xsize) {  // Status -> int
+  return HWY_DYNAMIC_DISPATCH(DoColorSpaceTransform)(t, thread, buf_src,
+                                                     buf_dst, xsize);
+}
+
+// Define to 1 on OS X as a workaround for older LCMS lacking MD5.
+#define JXL_CMS_OLD_VERSION 0
+
+#if JPEGXL_ENABLE_SKCMS
+
+JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const float XYZ[3]) {
+  const float inv_sum = 1.f / (XYZ[0] + XYZ[1] + XYZ[2]);  // 1 / (X + Y + Z)
+  CIExy chromaticity;
+  chromaticity.x = XYZ[0] * inv_sum;
+  chromaticity.y = XYZ[1] * inv_sum;
+  return chromaticity;
+}
+
+#else  // JPEGXL_ENABLE_SKCMS
+// (LCMS interface requires xyY but we omit the Y for white points/primaries.)
+
+JXL_MUST_USE_RESULT CIExy CIExyFromxyY(const cmsCIExyY& xyY) {
+  CIExy chromaticity;  // only x and y are copied from xyY
+  chromaticity.x = xyY.x;
+  chromaticity.y = xyY.y;
+  return chromaticity;
+}
+
+JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const cmsCIEXYZ& XYZ) {
+  cmsCIExyY converted;  // filled in by LCMS below
+  cmsXYZ2xyY(/*Dest=*/&converted, /*Source=*/&XYZ);
+  return CIExyFromxyY(converted);
+}
+
+JXL_MUST_USE_RESULT cmsCIEXYZ D50_XYZ() {
+  // Quantized D50 as stored in ICC profiles.
+  return {0.96420288, 1.0, 0.82490540};  // presumably X, Y, Z order; confirm
+}
+
+// RAII: unique_ptr aliases that release LCMS objects via custom deleters.
+
+struct ProfileDeleter {
+  void operator()(void* p) { cmsCloseProfile(p); }
+};
+using Profile = std::unique_ptr<void, ProfileDeleter>;  // owns a profile
+
+struct TransformDeleter {
+  void operator()(void* p) { cmsDeleteTransform(p); }
+};
+using Transform = std::unique_ptr<void, TransformDeleter>;  // owns a transform
+
+struct CurveDeleter {
+  void operator()(cmsToneCurve* p) { cmsFreeToneCurve(p); }
+};
+using Curve = std::unique_ptr<cmsToneCurve, CurveDeleter>;  // owns a tone curve
+
+Status CreateProfileXYZ(const cmsContext context,
+                        Profile* JXL_RESTRICT profile) {
+  profile->reset(cmsCreateXYZProfileTHR(context));  // takes ownership
+  if (*profile) return true;
+  return JXL_FAILURE("Failed to create XYZ");
+}
+
+#endif  // !JPEGXL_ENABLE_SKCMS
+
+#if JPEGXL_ENABLE_SKCMS
+// Parses the `size` bytes at `icc` into `profile`.
+// IMPORTANT: icc must outlive profile.
+Status DecodeProfile(const uint8_t* icc, size_t size,
+                     skcms_ICCProfile* const profile) {
+  const bool parsed = skcms_Parse(icc, size, profile);
+  if (parsed) return true;
+  return JXL_FAILURE("Failed to parse ICC profile with %" PRIuS " bytes",
+                     size);
+}
+#else  // JPEGXL_ENABLE_SKCMS
+Status DecodeProfile(const cmsContext context, const PaddedBytes& icc,
+                     Profile* profile) {
+  // `profile` takes ownership of the handle LCMS returns; a null handle is
+  // reported as a failure.
+  profile->reset(cmsOpenProfileFromMemTHR(context, icc.data(), icc.size()));
+  if (!*profile) return JXL_FAILURE("Failed to decode profile");
+
+  // WARNING: due to the LCMS MD5 issue mentioned above, many existing
+  // profiles have incorrect MD5, so do not even bother checking them nor
+  // generating warning clutter.
+
+  return true;
+}
+#endif  // JPEGXL_ENABLE_SKCMS
+
+#if JPEGXL_ENABLE_SKCMS
+
+ColorSpace ColorSpaceFromProfile(const skcms_ICCProfile& profile) {
+  switch (profile.data_color_space) {  // signature of the profile's data space
+    case skcms_Signature_RGB:
+    case skcms_Signature_CMYK:
+      // spec says CMYK is encoded as RGB (the kBlack extra channel signals that
+      // it is actually CMYK)
+      return ColorSpace::kRGB;
+    case skcms_Signature_Gray:
+      return ColorSpace::kGray;
+    default:
+      return ColorSpace::kUnknown;  // every other signature
+  }
+}
+
+// vector_out := matmul(matrix, vector_in)
+void MatrixProduct(const skcms_Matrix3x3& matrix, const float vector_in[3],
+                   float vector_out[3]) {
+  for (int row = 0; row < 3; ++row) {
+    const float* const weights = matrix.vals[row];
+    float& sum = vector_out[row];
+    sum = 0;
+    for (int col = 0; col < 3; ++col) sum += weights[col] * vector_in[col];
+  }
+}
+
+// Returns white point that was specified when creating the profile. Fails if
+// the profile has no white point tag or its CHAD matrix cannot be inverted.
+JXL_MUST_USE_RESULT Status UnadaptedWhitePoint(const skcms_ICCProfile& profile,
+                                               CIExy* out) {
+  float wtpt_XYZ[3];
+  if (!skcms_GetWTPT(&profile, wtpt_XYZ)) {
+    return JXL_FAILURE("ICC profile does not contain WhitePoint tag");
+  }
+  // If there is no chromatic adaptation matrix, it means that the white point
+  // is already unadapted, so it is passed through unchanged.
+  float result_XYZ[3] = {wtpt_XYZ[0], wtpt_XYZ[1], wtpt_XYZ[2]};
+  skcms_Matrix3x3 chad;
+  if (skcms_GetCHAD(&profile, &chad)) {
+    // Otherwise, it has been adapted to the PCS white point using said
+    // matrix, and the adaptation needs to be undone.
+    skcms_Matrix3x3 chad_inverse;
+    if (!skcms_Matrix3x3_invert(&chad, &chad_inverse)) {
+      return JXL_FAILURE("Non-invertible ChromaticAdaptation matrix");
+    }
+    MatrixProduct(chad_inverse, wtpt_XYZ, result_XYZ);
+  }
+
+  *out = CIExyFromXYZ(result_XYZ);
+  return true;
+}
+
+Status IdentifyPrimaries(const skcms_ICCProfile& profile,
+                         const CIExy& wp_unadapted, ColorEncoding* c) {
+  if (!c->HasPrimaries()) return true;
+  // inverse_CHAD undoes the profile's chromatic adaptation of XYZ values.
+  skcms_Matrix3x3 CHAD, inverse_CHAD;
+  if (skcms_GetCHAD(&profile, &CHAD)) {
+    JXL_RETURN_IF_ERROR(skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD));
+  } else {  // No CHAD tag: scale in LMS space from D50 to wp_unadapted instead.
+    static constexpr skcms_Matrix3x3 kLMSFromXYZ = {
+        {{0.8951, 0.2664, -0.1614},
+         {-0.7502, 1.7135, 0.0367},
+         {0.0389, -0.0685, 1.0296}}};
+    static constexpr skcms_Matrix3x3 kXYZFromLMS = {
+        {{0.9869929, -0.1470543, 0.1599627},
+         {0.4323053, 0.5183603, 0.0492912},
+         {-0.0085287, 0.0400428, 0.9684867}}};
+    static constexpr float kWpD50XYZ[3] = {0.96420288, 1.0, 0.82490540};
+    float wp_unadapted_XYZ[3];
+    JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(wp_unadapted, wp_unadapted_XYZ));
+    float wp_D50_LMS[3], wp_unadapted_LMS[3];
+    MatrixProduct(kLMSFromXYZ, kWpD50XYZ, wp_D50_LMS);
+    MatrixProduct(kLMSFromXYZ, wp_unadapted_XYZ, wp_unadapted_LMS);
+    inverse_CHAD = {{{wp_unadapted_LMS[0] / wp_D50_LMS[0], 0, 0},
+                     {0, wp_unadapted_LMS[1] / wp_D50_LMS[1], 0},
+                     {0, 0, wp_unadapted_LMS[2] / wp_D50_LMS[2]}}};
+    inverse_CHAD = skcms_Matrix3x3_concat(&kXYZFromLMS, &inverse_CHAD);
+    inverse_CHAD = skcms_Matrix3x3_concat(&inverse_CHAD, &kLMSFromXYZ);
+  }
+  // Convert each unit RGB primary to xy; a failed skcms transform is an error.
+  PrimariesCIExy primaries;
+  CIExy* const chromaticities[] = {&primaries.r, &primaries.g, &primaries.b};
+  for (int i = 0; i < 3; ++i) {
+    float RGB[3] = {};
+    RGB[i] = 1;
+    float XYZ[3], unadapted_XYZ[3];
+    JXL_RETURN_IF_ERROR(skcms_Transform(
+        RGB, skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque, &profile, XYZ,
+        skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque,
+        skcms_XYZD50_profile(), 1));
+    MatrixProduct(inverse_CHAD, XYZ, unadapted_XYZ);
+    *chromaticities[i] = CIExyFromXYZ(unadapted_XYZ);
+  }
+  return c->SetPrimaries(primaries);
+}
+
+// Tries to describe the transfer function of `profile` in `c->tf`.
+//
+// Nothing is done when `c->tf.SetImplicit()` reports success. Otherwise a
+// pure gamma is tried first (only when all three TRC curves agree on one
+// value), followed by every known TransferFunction in turn. A candidate is
+// accepted only if the ICC profile generated from `*c` is approximately equal
+// to `profile` according to skcms. If nothing matches, the transfer function
+// is set to kUnknown.
+void DetectTransferFunction(const skcms_ICCProfile& profile,
+                            ColorEncoding* JXL_RESTRICT c) {
+  if (c->tf.SetImplicit()) return;
+
+  // Per-channel candidate gamma (reciprocal of the curve exponent); 0 means
+  // the channel's curve is not a pure power function.
+  float gamma[3] = {};
+  if (profile.has_trc) {
+    const auto IsGamma = [](const skcms_TransferFunction& tf) {
+      return tf.a == 1 && tf.b == 0 &&
+             /* if b and d are zero, it is fine for c not to be */ tf.d == 0 &&
+             tf.e == 0 && tf.f == 0;
+    };
+    for (int i = 0; i < 3; ++i) {
+      // Inspect the curve of channel i itself. Reading trc[0] for every
+      // channel would hide differences between the channels.
+      const auto& curve = profile.trc[i];
+      if (curve.table_entries == 0 && IsGamma(curve.parametric)) {
+        gamma[i] = 1.f / curve.parametric.g;
+      } else {
+        skcms_TransferFunction approximate_tf;
+        float max_error;
+        if (skcms_ApproximateCurve(&curve, &approximate_tf, &max_error)) {
+          if (IsGamma(approximate_tf)) {
+            gamma[i] = 1.f / approximate_tf.g;
+          }
+        }
+      }
+    }
+  }
+  if (gamma[0] != 0 && std::abs(gamma[0] - gamma[1]) < 1e-4f &&
+      std::abs(gamma[1] - gamma[2]) < 1e-4f) {
+    if (c->tf.SetGamma(gamma[0])) {
+      // Accept the gamma only if it reproduces the original profile.
+      skcms_ICCProfile profile_test;
+      PaddedBytes bytes;
+      if (MaybeCreateProfile(*c, &bytes) &&
+          DecodeProfile(bytes.data(), bytes.size(), &profile_test) &&
+          skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) {
+        return;
+      }
+    }
+  }
+
+  for (TransferFunction tf : Values<TransferFunction>()) {
+    // Can only create profile from known transfer function.
+    if (tf == TransferFunction::kUnknown) continue;
+
+    c->tf.SetTransferFunction(tf);
+
+    skcms_ICCProfile profile_test;
+    PaddedBytes bytes;
+    if (MaybeCreateProfile(*c, &bytes) &&
+        DecodeProfile(bytes.data(), bytes.size(), &profile_test) &&
+        skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) {
+      return;
+    }
+  }
+
+  // No candidate reproduced the profile.
+  c->tf.SetTransferFunction(TransferFunction::kUnknown);
+}
+
+#else  // JPEGXL_ENABLE_SKCMS
+
+// Returns the LCMS pixel format for interleaved 32-bit float samples: CMYK
+// when `cmyk` is set, otherwise gray or RGB depending on `c`.
+uint32_t Type32(const ColorEncoding& c, bool cmyk) {
+  if (cmyk) return TYPE_CMYK_FLT;
+  return c.IsGray() ? TYPE_GRAY_FLT : TYPE_RGB_FLT;
+}
+
+// Returns the LCMS pixel format for 64-bit float (double) samples: gray or
+// RGB depending on `c`.
+uint32_t Type64(const ColorEncoding& c) {
+  return c.IsGray() ? TYPE_GRAY_DBL : TYPE_RGB_DBL;
+}
+
+// Maps the data color space declared by an LCMS profile to a ColorSpace.
+// RGB and CMYK profiles both map to kRGB, gray profiles to kGray and
+// everything else to kUnknown.
+ColorSpace ColorSpaceFromProfile(const Profile& profile) {
+  const auto data_space = cmsGetColorSpace(profile.get());
+  if (data_space == cmsSigGrayData) return ColorSpace::kGray;
+  if (data_space == cmsSigRgbData || data_space == cmsSigCmykData) {
+    return ColorSpace::kRGB;
+  }
+  return ColorSpace::kUnknown;
+}
+
+// "profile1" is pre-decoded to save time in DetectTransferFunction.
+// Returns true when `profile1` and the profile encoded in `icc` convert a
+// grid of sample colors to XYZ values that agree within a tolerance of 2E-4.
+// Returns a failure if `icc` cannot be decoded or a transform cannot be
+// created, and false on the first mismatching sample. `c` selects gray vs.
+// RGB sampling.
+Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1,
+                              const PaddedBytes& icc, const ColorEncoding& c) {
+  Profile profile2;
+  JXL_RETURN_IF_ERROR(DecodeProfile(context, icc, &profile2));
+
+  Profile xyz;
+  JXL_RETURN_IF_ERROR(CreateProfileXYZ(context, &xyz));
+
+  const uint32_t sample_type = Type64(c);
+  const uint32_t intent = INTENT_RELATIVE_COLORIMETRIC;
+  const uint32_t flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_BLACKPOINTCOMPENSATION |
+                         cmsFLAGS_HIGHRESPRECALC;
+  // Both profiles are compared through identically configured transforms.
+  const auto make_xyz_transform = [&](const Profile& from) {
+    return cmsCreateTransformTHR(context, from.get(), sample_type, xyz.get(),
+                                 TYPE_XYZ_DBL, intent, flags);
+  };
+  Transform xform1(make_xyz_transform(profile1));
+  Transform xform2(make_xyz_transform(profile2));
+  if (!xform1 || !xform2) {
+    return JXL_FAILURE("Failed to create transform");
+  }
+
+  double in[3];
+  double out1[3];
+  double out2[3];
+
+  // Converts the current `in` with both transforms and compares the first
+  // `num_components` output components.
+  const auto outputs_agree = [&](size_t num_components) {
+    cmsDoTransform(xform1.get(), in, out1, 1);
+    cmsDoTransform(xform2.get(), in, out2, 1);
+    for (size_t i = 0; i < num_components; ++i) {
+      if (!ApproxEq(out1[i], out2[i], 2E-4)) return false;
+    }
+    return true;
+  };
+
+  // Uniformly spaced samples from very dark to almost fully bright.
+  const double init = 1E-3;
+  const double step = 0.2;
+
+  if (c.IsGray()) {
+    // Single channel: sample eight times finer and compare only the first
+    // output component.
+    for (in[0] = init; in[0] < 1.0; in[0] += step / 8) {
+      if (!outputs_agree(1)) return false;
+    }
+    return true;
+  }
+
+  for (in[0] = init; in[0] < 1.0; in[0] += step) {
+    for (in[1] = init; in[1] < 1.0; in[1] += step) {
+      for (in[2] = init; in[2] < 1.0; in[2] += step) {
+        if (!outputs_agree(3)) return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Returns the XYZ white point that was specified when the profile was
+// created, i.e. before chromatic adaptation.
+// NOTE: cmsSigMediaWhitePointTag alone cannot be trusted because its
+// interpretation differs between ICC versions.
+// Falls back to {1, 1, 1} when the helper XYZ profile or the transform cannot
+// be created.
+JXL_MUST_USE_RESULT cmsCIEXYZ UnadaptedWhitePoint(const cmsContext context,
+                                                  const Profile& profile,
+                                                  const ColorEncoding& c) {
+  const auto* media_white = static_cast<const cmsCIEXYZ*>(
+      cmsReadTag(profile.get(), cmsSigMediaWhitePointTag));
+  if (media_white != nullptr) {
+    const bool has_adaptation_matrix =
+        cmsReadTag(profile.get(), cmsSigChromaticAdaptationTag) != nullptr;
+    // No chromatic adaptation matrix: the white point is already unadapted.
+    if (!has_adaptation_matrix) return *media_white;
+  }
+
+  cmsCIEXYZ result = {1.0, 1.0, 1.0};
+  Profile xyz_profile;
+  if (!CreateProfileXYZ(context, &xyz_profile)) return result;
+
+  // Array arguments are one per profile.
+  cmsHPROFILE chain[2] = {profile.get(), xyz_profile.get()};
+  // Absolute colorimetric leaves the white point unchanged - that is what we
+  // are trying to extract.
+  cmsUInt32Number chain_intents[2] = {INTENT_ABSOLUTE_COLORIMETRIC,
+                                      INTENT_ABSOLUTE_COLORIMETRIC};
+  cmsBool chain_bpc[2] = {0, 0};
+  cmsFloat64Number chain_adaptation[2] = {0.0, 0.0};
+  // Only transforming a single pixel, so skip expensive optimizations.
+  const cmsUInt32Number flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC;
+  Transform to_xyz(cmsCreateExtendedTransform(
+      context, 2, chain, chain_bpc, chain_intents, chain_adaptation, nullptr, 0,
+      Type64(c), TYPE_XYZ_DBL, flags));
+  if (!to_xyz) return result;  // TODO(lode): return error
+
+  // xy are relative, so magnitude does not matter if we ignore output Y.
+  const cmsFloat64Number white_in[3] = {1.0, 1.0, 1.0};
+  cmsDoTransform(to_xyz.get(), white_in, &result.X, 1);
+  return result;
+}
+
+// Determines the RGB primaries of `profile` as xy chromaticities relative to
+// the unadapted white point `wp_unadapted` and stores them in `c`.
+//
+// Succeeds without doing anything when `c` has no primaries or the profile's
+// color space is unknown. The primaries come from the colorant tags when all
+// three are present, otherwise from converting pure red, green and blue
+// through the profile.
+Status IdentifyPrimaries(const cmsContext context, const Profile& profile,
+                         const cmsCIEXYZ& wp_unadapted, ColorEncoding* c) {
+  if (!c->HasPrimaries()) return true;
+  if (ColorSpaceFromProfile(profile) == ColorSpace::kUnknown) return true;
+
+  // These were adapted to the profile illuminant before storing in the profile.
+  const cmsTagSignature colorant_tags[3] = {
+      cmsSigRedColorantTag, cmsSigGreenColorantTag, cmsSigBlueColorantTag};
+  const cmsCIEXYZ* adapted[3];
+  bool have_all_colorants = true;
+  for (size_t i = 0; i < 3; ++i) {
+    adapted[i] = static_cast<const cmsCIEXYZ*>(
+        cmsReadTag(profile.get(), colorant_tags[i]));
+    if (adapted[i] == nullptr) have_all_colorants = false;
+  }
+
+  // Storage for the fallback path; must outlive the `adapted` pointers.
+  cmsCIEXYZ converted[3];
+  if (!have_all_colorants) {
+    // At least one colorant tag is missing: determine the XYZ coordinates of
+    // the primaries by converting from the colorspace.
+    Profile xyz_profile;
+    if (!CreateProfileXYZ(context, &xyz_profile)) {
+      return JXL_FAILURE("Failed to retrieve colorants");
+    }
+    // Array arguments are one per profile.
+    cmsHPROFILE chain[2] = {profile.get(), xyz_profile.get()};
+    cmsUInt32Number chain_intents[2] = {INTENT_RELATIVE_COLORIMETRIC,
+                                        INTENT_RELATIVE_COLORIMETRIC};
+    cmsBool chain_bpc[2] = {0, 0};
+    cmsFloat64Number chain_adaptation[2] = {0.0, 0.0};
+    // Only transforming three pixels, so skip expensive optimizations.
+    const cmsUInt32Number flags =
+        cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC;
+    Transform to_xyz(cmsCreateExtendedTransform(
+        context, 2, chain, chain_bpc, chain_intents, chain_adaptation, nullptr,
+        0, Type64(*c), TYPE_XYZ_DBL, flags));
+    if (!to_xyz) return JXL_FAILURE("Failed to retrieve colorants");
+
+    // Three pixels: pure red, pure green, pure blue.
+    const cmsFloat64Number unit_rgb[9] = {1.0, 0.0, 0.0, 0.0, 1.0,
+                                          0.0, 0.0, 0.0, 1.0};
+    cmsDoTransform(to_xyz.get(), unit_rgb, &converted[0].X, 3);
+    for (size_t i = 0; i < 3; ++i) adapted[i] = &converted[i];
+  }
+
+  // TODO(janwas): no longer assume Bradford and D50.
+  // Undo the chromatic adaptation.
+  const cmsCIEXYZ d50 = D50_XYZ();
+  cmsCIEXYZ unadapted[3];
+  for (size_t i = 0; i < 3; ++i) {
+    cmsAdaptToIlluminant(&unadapted[i], &d50, &wp_unadapted, adapted[i]);
+  }
+
+  const PrimariesCIExy rgb = {CIExyFromXYZ(unadapted[0]),
+                              CIExyFromXYZ(unadapted[1]),
+                              CIExyFromXYZ(unadapted[2])};
+  return c->SetPrimaries(rgb);
+}
+
+// Tries to describe the transfer function of `profile` in `c->tf`.
+//
+// Nothing is done when `c->tf.SetImplicit()` reports success. Otherwise a
+// gamma estimated from the profile's tone curves is tried first, followed by
+// every known TransferFunction in turn. A candidate is accepted only if the
+// ICC profile generated from `*c` is equivalent to `profile` (see
+// ProfileEquivalentToICC). If nothing matches, the transfer function is set
+// to kUnknown.
+void DetectTransferFunction(const cmsContext context, const Profile& profile,
+                            ColorEncoding* JXL_RESTRICT c) {
+  if (c->tf.SetImplicit()) return;
+
+  // Reciprocal of the gamma LCMS estimates for `curve`, or 0 when LCMS
+  // returns a non-positive estimate.
+  const auto inverse_gamma = [](const cmsToneCurve* curve) -> float {
+    const double estimate = cmsEstimateGamma(curve, /*precision=*/1e-4);
+    return estimate > 0 ? static_cast<float>(1. / estimate) : 0.f;
+  };
+  // True when the profile generated from the current `*c` matches `profile`.
+  const auto reproduces_profile = [&]() {
+    PaddedBytes icc_test;
+    return MaybeCreateProfile(*c, &icc_test) &&
+           ProfileEquivalentToICC(context, profile, icc_test, *c);
+  };
+
+  float gamma = 0;
+  const auto* gray_trc = static_cast<const cmsToneCurve*>(
+      cmsReadTag(profile.get(), cmsSigGrayTRCTag));
+  if (gray_trc != nullptr) {
+    gamma = inverse_gamma(gray_trc);
+  } else {
+    const cmsTagSignature trc_tags[3] = {cmsSigRedTRCTag, cmsSigGreenTRCTag,
+                                         cmsSigBlueTRCTag};
+    float rgb_gamma[3] = {};
+    for (size_t i = 0; i < 3; ++i) {
+      const auto* trc = static_cast<const cmsToneCurve*>(
+          cmsReadTag(profile.get(), trc_tags[i]));
+      if (trc != nullptr) rgb_gamma[i] = inverse_gamma(trc);
+    }
+    // Only use the gamma if all three channels agree on it.
+    const bool channels_agree =
+        rgb_gamma[0] != 0 && std::abs(rgb_gamma[0] - rgb_gamma[1]) < 1e-4f &&
+        std::abs(rgb_gamma[1] - rgb_gamma[2]) < 1e-4f;
+    if (channels_agree) gamma = rgb_gamma[0];
+  }
+
+  if (gamma != 0 && c->tf.SetGamma(gamma) && reproduces_profile()) return;
+
+  for (TransferFunction tf : Values<TransferFunction>()) {
+    // Can only create profile from known transfer function.
+    if (tf == TransferFunction::kUnknown) continue;
+
+    c->tf.SetTransferFunction(tf);
+    if (reproduces_profile()) return;
+  }
+
+  // No candidate reproduced the profile.
+  c->tf.SetTransferFunction(TransferFunction::kUnknown);
+}
+
+// LCMS log callback (installed in GetContext): forwards the error code and
+// message to JXL_WARNING. The context argument is not used.
+void ErrorHandler(cmsContext /*context*/, cmsUInt32Number code,
+                  const char* text) {
+  JXL_WARNING("LCMS error %u: %s", code, text);
+}
+
+// Returns the LCMS context of the calling thread. The context is created on
+// first use, with ErrorHandler installed as its log callback, and is cached in
+// a thread_local variable; this function never releases it.
+cmsContext GetContext() {
+  static thread_local void* context_;
+  if (context_ != nullptr) return static_cast<cmsContext>(context_);
+
+  context_ = cmsCreateContext(nullptr, nullptr);
+  JXL_ASSERT(context_ != nullptr);
+
+  const cmsContext created = static_cast<cmsContext>(context_);
+  cmsSetLogErrorHandlerTHR(created, &ErrorHandler);
+  return created;
+}
+
+#endif  // JPEGXL_ENABLE_SKCMS
+
+// Computes, for each primary of `encoding`, the luminance (Y) it contributes
+// to the encoding's white point and writes the three values (R, G, B order)
+// to `luminances`. Fails if the white point cannot be converted to XYZ or the
+// chromaticity matrix is not invertible.
+Status GetPrimariesLuminances(const ColorEncoding& encoding,
+                              float luminances[3]) {
+  // Derivation:
+  // The three primaries at full intensity must add up to white:
+  //
+  // [Xr, Xg, Xb;     [1;     [Xw;
+  //  Yr, Yg, Yb;  ×   1;  =   Yw;
+  //  Zr, Zg, Zb]      1]      Zw]
+  //
+  // With S = X+Y+Z we have X = x·S, Y = y·S and Z = z·S (lower case denotes
+  // chromaticity). Moving each primary's S out of the matrix and into the
+  // all-ones vector gives:
+  //
+  // [xr, xg, xb;     [Sr;     [Xw;
+  //  yr, yg, yb;  ×   Sg;  =   Yw;
+  //  zr, zg, zb]      Sb]      Zw]
+  //
+  // so the totals follow from inverting the chromaticity matrix:
+  //
+  //                  [Sr;     inv([xr, xg, xb;      [Xw;
+  //                   Sg;  =       yr, yg, yb;   ×   Yw;
+  //                   Sb]          zr, zg, zb])      Zw]
+  //
+  // Multiplying each total by the y of its primary yields that primary's Y.
+
+  float wp_XYZ[3];
+  JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(encoding.GetWhitePoint(), wp_XYZ));
+
+  const PrimariesCIExy primaries = encoding.GetPrimaries();
+  const double ys[3] = {primaries.r.y, primaries.g.y, primaries.b.y};
+  // Rows are x, y and z = 1 - x - y; columns are the R, G and B primaries.
+  double matrix[3][3] = {
+      {primaries.r.x, primaries.g.x, primaries.b.x},
+      {primaries.r.y, primaries.g.y, primaries.b.y},
+      {1 - primaries.r.x - primaries.r.y, 1 - primaries.g.x - primaries.g.y,
+       1 - primaries.b.x - primaries.b.y}};
+  // Inverted in place.
+  JXL_RETURN_IF_ERROR(Inv3x3Matrix(&matrix[0][0]));
+
+  for (size_t i = 0; i < 3; ++i) {
+    const double* row = matrix[i];
+    const double total =
+        row[0] * wp_XYZ[0] + row[1] * wp_XYZ[1] + row[2] * wp_XYZ[2];
+    luminances[i] = ys[i] * total;
+  }
+  return true;
+}
+
+// Applies the HLG OOTF (forward) or its inverse (!forward) in place to the
+// first `xsize` floats of `buf`.
+//
+// The exponent is derived from `t->intensity_target`; for targets between 295
+// and 305 it is approximately 1 and the buffer is left untouched. With one
+// channel every sample is raised to the exponent. With three channels the
+// samples are read as interleaved RGB and each pixel is scaled by
+// luminance^(gamma - 1), where the luminance uses `t->hlg_ootf_luminances`.
+// Any other channel count is an error.
+Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize,
+                    bool forward) {
+  const auto target = t->intensity_target;
+  if (295 <= target && target <= 305) {
+    // The gamma is approximately 1 so this can essentially be skipped.
+    return true;
+  }
+  const float forward_gamma =
+      1.2f * std::pow(1.111f, std::log2(target * 1e-3f));
+  const float gamma = forward ? forward_gamma : 1.f / forward_gamma;
+
+  if (t->hlg_ootf_num_channels == 1) {
+    for (size_t x = 0; x < xsize; ++x) {
+      buf[x] = std::pow(buf[x], gamma);
+    }
+    return true;
+  }
+  if (t->hlg_ootf_num_channels != 3) {
+    return JXL_FAILURE("HLG OOTF not implemented for %" PRIuS " channels",
+                       t->hlg_ootf_num_channels);
+  }
+
+  for (size_t x = 0; x < xsize; x += 3) {
+    float* px = buf + x;
+    const float luminance = px[0] * t->hlg_ootf_luminances[0] +
+                            px[1] * t->hlg_ootf_luminances[1] +
+                            px[2] * t->hlg_ootf_luminances[2];
+    const float ratio = std::pow(luminance, gamma - 1);
+    // Pixels with a non-finite ratio are left as they are.
+    if (!std::isfinite(ratio)) continue;
+    px[0] *= ratio;
+    px[1] *= ratio;
+    px[2] *= ratio;
+
+    // If gamma < 1, the ratio above will be > 1 which can push bright
+    // saturated highlights out of gamut. There are several possible ways to
+    // bring them back in-gamut; this one preserves hue and saturation at the
+    // slight expense of luminance. If !forward, the previously-applied forward
+    // OOTF with gamma > 1 already pushed those highlights down and we are
+    // simply putting them back where they were so this is not necessary.
+    if (!forward || !(gamma < 1)) continue;
+    const float maximum = std::max(px[0], std::max(px[1], px[2]));
+    if (maximum > 1) {
+      const float normalizer = 1.f / maximum;
+      px[0] *= normalizer;
+      px[1] *= normalizer;
+      px[2] *= normalizer;
+    }
+  }
+  return true;
+}
+
+// Configures `c` as an RGB encoding from the four CICP fields and returns
+// true, or returns false (leaving `c` untouched) when the combination cannot
+// be represented.
+//
+// Accepted are only matrix_coefficients == 0 and full_range == 1, a valid and
+// known transfer function, and primaries that are either a valid non-custom
+// Primaries value or the code point 12. Primaries::kP3 is paired with the DCI
+// white point; code point 12 is mapped to P3 primaries with a D65 white point;
+// all other primaries get a D65 white point.
+bool ApplyCICP(const uint8_t color_primaries,
+               const uint8_t transfer_characteristics,
+               const uint8_t matrix_coefficients, const uint8_t full_range,
+               ColorEncoding* JXL_RESTRICT c) {
+  if (matrix_coefficients != 0 || full_range != 1) return false;
+
+  const auto tf = static_cast<TransferFunction>(transfer_characteristics);
+  if (tf == TransferFunction::kUnknown || !EnumValid(tf)) return false;
+
+  const auto primaries = static_cast<Primaries>(color_primaries);
+  const bool is_code_point_12 = (color_primaries == 12);
+  if (primaries == Primaries::kCustom) return false;
+  if (!is_code_point_12 && !EnumValid(primaries)) return false;
+
+  c->SetColorSpace(ColorSpace::kRGB);
+  c->tf.SetTransferFunction(tf);
+
+  const bool is_p3_dci = (primaries == Primaries::kP3);
+  c->white_point = is_p3_dci ? WhitePoint::kDCI : WhitePoint::kD65;
+  c->primaries =
+      (is_p3_dci || is_code_point_12) ? Primaries::kP3 : primaries;
+  return true;
+}
+
+}  // namespace
+
+// Parses the ICC profile stored in `icc_` and fills in the structured fields
+// of this ColorEncoding: rendering intent, color space, white point,
+// primaries and transfer function.
+//
+// The color space and transfer function are reset to kUnknown first so that
+// the encoding is marked invalid if parsing fails part-way. If the profile
+// carries a CICP tag that ApplyCICP accepts, the fields are taken from it and
+// the remaining analysis is skipped. In both backends the rendering intent is
+// read from the header before the CICP shortcut so that it is always set on
+// success.
+Status ColorEncoding::SetFieldsFromICC() {
+  // In case parsing fails, mark the ColorEncoding as invalid.
+  SetColorSpace(ColorSpace::kUnknown);
+  tf.SetTransferFunction(TransferFunction::kUnknown);
+
+  if (icc_.empty()) return JXL_FAILURE("Empty ICC profile");
+
+#if JPEGXL_ENABLE_SKCMS
+  if (icc_.size() < 128) {
+    return JXL_FAILURE("ICC file too small");
+  }
+
+  skcms_ICCProfile profile;
+  JXL_RETURN_IF_ERROR(skcms_Parse(icc_.data(), icc_.size(), &profile));
+
+  // skcms does not return the rendering intent, so get it from the file. It
+  // is encoded as big-endian 32-bit integer in bytes 64..67. Valid values
+  // fit in the last byte, so the three leading bytes must be zero.
+  uint32_t rendering_intent32 = icc_[67];
+  if (rendering_intent32 > 3 || icc_[64] != 0 || icc_[65] != 0 ||
+      icc_[66] != 0) {
+    return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32);
+  }
+  // ICC and RenderingIntent have the same values (0..3).
+  rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+
+  if (profile.has_CICP && ApplyCICP(profile.CICP.color_primaries,
+                                    profile.CICP.transfer_characteristics,
+                                    profile.CICP.matrix_coefficients,
+                                    profile.CICP.video_full_range_flag, this)) {
+    return true;
+  }
+
+  SetColorSpace(ColorSpaceFromProfile(profile));
+  cmyk_ = (profile.data_color_space == skcms_Signature_CMYK);
+
+  CIExy wp_unadapted;
+  JXL_RETURN_IF_ERROR(UnadaptedWhitePoint(profile, &wp_unadapted));
+  JXL_RETURN_IF_ERROR(SetWhitePoint(wp_unadapted));
+
+  // Relies on color_space.
+  JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, this));
+
+  // Relies on color_space/white point/primaries being set already.
+  DetectTransferFunction(profile, this);
+#else  // JPEGXL_ENABLE_SKCMS
+
+  const cmsContext context = GetContext();
+
+  Profile profile;
+  JXL_RETURN_IF_ERROR(DecodeProfile(context, icc_, &profile));
+
+  // Read the rendering intent before the CICP shortcut below (as the skcms
+  // branch does); otherwise a CICP-tagged profile would leave
+  // `rendering_intent` at whatever value it had before this call.
+  const cmsUInt32Number rendering_intent32 =
+      cmsGetHeaderRenderingIntent(profile.get());
+  if (rendering_intent32 > 3) {
+    return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32);
+  }
+  // ICC and RenderingIntent have the same values (0..3).
+  rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+
+  // Raw 'cicp' tag: the four code points are read from bytes 8..11.
+  static constexpr size_t kCICPSize = 12;
+  static constexpr auto kCICPSignature =
+      static_cast<cmsTagSignature>(0x63696370);
+  uint8_t cicp_buffer[kCICPSize];
+  if (cmsReadRawTag(profile.get(), kCICPSignature, cicp_buffer, kCICPSize) ==
+          kCICPSize &&
+      ApplyCICP(cicp_buffer[8], cicp_buffer[9], cicp_buffer[10],
+                cicp_buffer[11], this)) {
+    return true;
+  }
+
+  SetColorSpace(ColorSpaceFromProfile(profile));
+  if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) {
+    // CMYK profiles are only flagged; white point, primaries and transfer
+    // function are not analyzed.
+    cmyk_ = true;
+    return true;
+  }
+
+  const cmsCIEXYZ wp_unadapted = UnadaptedWhitePoint(context, profile, *this);
+  JXL_RETURN_IF_ERROR(SetWhitePoint(CIExyFromXYZ(wp_unadapted)));
+
+  // Relies on color_space.
+  JXL_RETURN_IF_ERROR(IdentifyPrimaries(context, profile, wp_unadapted, this));
+
+  // Relies on color_space/white point/primaries being set already.
+  DetectTransferFunction(context, profile, this);
+
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  return true;
+}
+
+// Clears `want_icc_` when the stored ICC profile can be decoded and a profile
+// can be generated from the structured fields of this encoding. In the LCMS
+// build, CMYK profiles never clear the flag. Otherwise `want_icc_` is left
+// unchanged. The generated profile itself is discarded.
+void ColorEncoding::DecideIfWantICC() {
+  PaddedBytes generated;
+#if JPEGXL_ENABLE_SKCMS
+  skcms_ICCProfile parsed;
+  const bool can_regenerate =
+      DecodeProfile(ICC().data(), ICC().size(), &parsed) &&
+      MaybeCreateProfile(*this, &generated);
+#else   // JPEGXL_ENABLE_SKCMS
+  Profile parsed;
+  const bool can_regenerate =
+      DecodeProfile(GetContext(), ICC(), &parsed) &&
+      cmsGetColorSpace(parsed.get()) != cmsSigCmykData &&
+      MaybeCreateProfile(*this, &generated);
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  if (can_regenerate) want_icc_ = false;
+}
+
+namespace {
+
+// `destroy` callback of the CMS interface: releases a JxlCms object (and, in
+// the LCMS build, its transform). A null pointer is ignored.
+void JxlCmsDestroy(void* cms_data) {
+  JxlCms* const state = static_cast<JxlCms*>(cms_data);
+  if (state == nullptr) return;
+#if !JPEGXL_ENABLE_SKCMS
+  TransformDeleter()(state->lcms_transform);
+#endif
+  delete state;
+}
+
+// `init` callback of the CMS interface: prepares a color transform from the
+// `input` ICC profile to the `output` ICC profile.
+//
+// init_data:        not used by this implementation.
+// num_threads:      number of per-thread row buffers to allocate.
+// xsize:            number of pixels per row buffer.
+// input, output:    profiles; only their `icc` bytes are read here.
+// intensity_target: stored for the HLG OOTF.
+//
+// Returns a heap-allocated JxlCms (to be released with JxlCmsDestroy), or
+// nullptr on failure. PQ, HLG and (in some cases) sRGB transfer functions are
+// not handed to the CMS library: a linear variant of the profile is used
+// instead and the transfer function is recorded as an extra pre- or
+// postprocessing step.
+void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
+                 const JxlColorProfile* input, const JxlColorProfile* output,
+                 float intensity_target) {
+  auto t = jxl::make_unique<JxlCms>();
+  PaddedBytes icc_src, icc_dst;
+  icc_src.assign(input->icc.data, input->icc.data + input->icc.size);
+  ColorEncoding c_src;
+  if (!c_src.SetICC(std::move(icc_src))) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse input ICC");
+    return nullptr;
+  }
+  icc_dst.assign(output->icc.data, output->icc.data + output->icc.size);
+  ColorEncoding c_dst;
+  if (!c_dst.SetICC(std::move(icc_dst))) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse output ICC");
+    return nullptr;
+  }
+#if JXL_CMS_VERBOSE
+  printf("%s -> %s\n", Description(c_src).c_str(), Description(c_dst).c_str());
+#endif
+
+#if JPEGXL_ENABLE_SKCMS
+  if (!DecodeProfile(input->icc.data, input->icc.size, &t->profile_src)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: skcms failed to parse input ICC");
+    return nullptr;
+  }
+  if (!DecodeProfile(output->icc.data, output->icc.size, &t->profile_dst)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: skcms failed to parse output ICC");
+    return nullptr;
+  }
+#else   // JPEGXL_ENABLE_SKCMS
+  const cmsContext context = GetContext();
+  Profile profile_src, profile_dst;
+  if (!DecodeProfile(context, c_src.ICC(), &profile_src)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse input ICC");
+    return nullptr;
+  }
+  if (!DecodeProfile(context, c_dst.ICC(), &profile_dst)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse output ICC");
+    return nullptr;
+  }
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  t->skip_lcms = false;
+  if (c_src.SameColorEncoding(c_dst)) {
+    t->skip_lcms = true;
+#if JXL_CMS_VERBOSE
+    printf("Skip CMS\n");
+#endif
+  }
+
+  // The OOTF is needed only when exactly one side is HLG.
+  t->apply_hlg_ootf = c_src.tf.IsHLG() != c_dst.tf.IsHLG();
+  if (t->apply_hlg_ootf) {
+    const ColorEncoding* c_hlg = c_src.tf.IsHLG() ? &c_src : &c_dst;
+    t->hlg_ootf_num_channels = c_hlg->Channels();
+    if (t->hlg_ootf_num_channels == 3 &&
+        !GetPrimariesLuminances(*c_hlg, t->hlg_ootf_luminances.data())) {
+      JXL_NOTIFY_ERROR(
+          "JxlCmsInit: failed to compute the luminances of primaries");
+      return nullptr;
+    }
+  }
+
+  // Special-case SRGB <=> linear if the primaries / white point are the same,
+  // or any conversion where PQ or HLG is involved:
+  bool src_linear = c_src.tf.IsLinear();
+  const bool dst_linear = c_dst.tf.IsLinear();
+
+  if (c_src.tf.IsPQ() || c_src.tf.IsHLG() ||
+      (c_src.tf.IsSRGB() && dst_linear && c_src.SameColorSpace(c_dst))) {
+    // Construct new profile as if the data were already/still linear.
+    ColorEncoding c_linear_src = c_src;
+    c_linear_src.tf.SetTransferFunction(TransferFunction::kLinear);
+#if JPEGXL_ENABLE_SKCMS
+    skcms_ICCProfile new_src;
+#else  // JPEGXL_ENABLE_SKCMS
+    Profile new_src;
+#endif  // JPEGXL_ENABLE_SKCMS
+    // Only enable ExtraTF if profile creation succeeded.
+    if (MaybeCreateProfile(c_linear_src, &icc_src) &&
+#if JPEGXL_ENABLE_SKCMS
+        DecodeProfile(icc_src.data(), icc_src.size(), &new_src)) {
+#else   // JPEGXL_ENABLE_SKCMS
+        DecodeProfile(context, icc_src, &new_src)) {
+#endif  // JPEGXL_ENABLE_SKCMS
+#if JXL_CMS_VERBOSE
+      printf("Special HLG/PQ/sRGB -> linear\n");
+#endif
+#if JPEGXL_ENABLE_SKCMS
+      t->icc_src = std::move(icc_src);
+      t->profile_src = new_src;
+#else   // JPEGXL_ENABLE_SKCMS
+      profile_src.swap(new_src);
+#endif  // JPEGXL_ENABLE_SKCMS
+      t->preprocess = c_src.tf.IsSRGB()
+                          ? ExtraTF::kSRGB
+                          : (c_src.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG);
+      c_src = c_linear_src;
+      src_linear = true;
+    } else {
+      if (t->apply_hlg_ootf) {
+        JXL_NOTIFY_ERROR(
+            "Failed to create extra linear source profile, and HLG OOTF "
+            "required");
+        return nullptr;
+      }
+      // This is the source branch, so the warning names the source profile.
+      JXL_WARNING("Failed to create extra linear source profile");
+    }
+  }
+
+  if (c_dst.tf.IsPQ() || c_dst.tf.IsHLG() ||
+      (c_dst.tf.IsSRGB() && src_linear && c_src.SameColorSpace(c_dst))) {
+    ColorEncoding c_linear_dst = c_dst;
+    c_linear_dst.tf.SetTransferFunction(TransferFunction::kLinear);
+#if JPEGXL_ENABLE_SKCMS
+    skcms_ICCProfile new_dst;
+#else   // JPEGXL_ENABLE_SKCMS
+    Profile new_dst;
+#endif  // JPEGXL_ENABLE_SKCMS
+    // Only enable ExtraTF if profile creation succeeded.
+    if (MaybeCreateProfile(c_linear_dst, &icc_dst) &&
+#if JPEGXL_ENABLE_SKCMS
+        DecodeProfile(icc_dst.data(), icc_dst.size(), &new_dst)) {
+#else   // JPEGXL_ENABLE_SKCMS
+        DecodeProfile(context, icc_dst, &new_dst)) {
+#endif  // JPEGXL_ENABLE_SKCMS
+#if JXL_CMS_VERBOSE
+      printf("Special linear -> HLG/PQ/sRGB\n");
+#endif
+#if JPEGXL_ENABLE_SKCMS
+      t->icc_dst = std::move(icc_dst);
+      t->profile_dst = new_dst;
+#else   // JPEGXL_ENABLE_SKCMS
+      profile_dst.swap(new_dst);
+#endif  // JPEGXL_ENABLE_SKCMS
+      t->postprocess = c_dst.tf.IsSRGB()
+                           ? ExtraTF::kSRGB
+                           : (c_dst.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG);
+      c_dst = c_linear_dst;
+    } else {
+      if (t->apply_hlg_ootf) {
+        JXL_NOTIFY_ERROR(
+            "Failed to create extra linear destination profile, and inverse "
+            "HLG OOTF required");
+        return nullptr;
+      }
+      JXL_WARNING("Failed to create extra linear destination profile");
+    }
+  }
+
+  // Re-check after the linear substitutions above.
+  if (c_src.SameColorEncoding(c_dst)) {
+#if JXL_CMS_VERBOSE
+    printf("Same intermediary linear profiles, skipping CMS\n");
+#endif
+    t->skip_lcms = true;
+  }
+
+#if JPEGXL_ENABLE_SKCMS
+  if (!skcms_MakeUsableAsDestination(&t->profile_dst)) {
+    JXL_NOTIFY_ERROR(
+        "Failed to make %s usable as a color transform destination",
+        Description(c_dst).c_str());
+    return nullptr;
+  }
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  // Not including alpha channel (copied separately).
+  const size_t channels_src = (c_src.IsCMYK() ? 4 : c_src.Channels());
+  const size_t channels_dst = c_dst.Channels();
+  JXL_CHECK(channels_src == channels_dst ||
+            (channels_src == 4 && channels_dst == 3));
+#if JXL_CMS_VERBOSE
+  printf("Channels: %" PRIuS "; Threads: %" PRIuS "\n", channels_src,
+         num_threads);
+#endif
+
+#if !JPEGXL_ENABLE_SKCMS
+  // Type includes color space (XYZ vs RGB), so can be different.
+  const uint32_t type_src = Type32(c_src, channels_src == 4);
+  const uint32_t type_dst = Type32(c_dst, false);
+  const uint32_t intent = static_cast<uint32_t>(c_dst.rendering_intent);
+  // Use cmsFLAGS_NOCACHE to disable the 1-pixel cache and make calling
+  // cmsDoTransform() thread-safe.
+  const uint32_t flags = cmsFLAGS_NOCACHE | cmsFLAGS_BLACKPOINTCOMPENSATION |
+                         cmsFLAGS_HIGHRESPRECALC;
+  t->lcms_transform =
+      cmsCreateTransformTHR(context, profile_src.get(), type_src,
+                            profile_dst.get(), type_dst, intent, flags);
+  if (t->lcms_transform == nullptr) {
+    JXL_NOTIFY_ERROR("Failed to create transform");
+    return nullptr;
+  }
+#endif  // !JPEGXL_ENABLE_SKCMS
+
+  // Ideally LCMS would convert directly from External to Image3. However,
+  // cmsDoTransformLineStride only accepts 32-bit BytesPerPlaneIn, whereas our
+  // planes can be more than 4 GiB apart. Hence, transform inputs/outputs must
+  // be interleaved. Calling cmsDoTransform for each pixel is expensive
+  // (indirect call). We therefore transform rows, which requires per-thread
+  // buffers. To avoid separate allocations, we use the rows of an image.
+  // Because LCMS apparently also cannot handle <= 16 bit inputs and 32-bit
+  // outputs (or vice versa), we use floating point input/output.
+  t->channels_src = channels_src;
+  t->channels_dst = channels_dst;
+#if JPEGXL_ENABLE_SKCMS
+  // SkiaCMS doesn't support grayscale float buffers, so we create space for RGB
+  // float buffers anyway.
+  t->buf_src = ImageF(xsize * (channels_src == 4 ? 4 : 3), num_threads);
+  t->buf_dst = ImageF(xsize * 3, num_threads);
+#else
+  t->buf_src = ImageF(xsize * channels_src, num_threads);
+  t->buf_dst = ImageF(xsize * channels_dst, num_threads);
+#endif
+  t->intensity_target = intensity_target;
+  return t.release();
+}
+
+// `get_src_buf` callback: returns the source row buffer for `thread` (row
+// `thread` of the JxlCms source buffer image).
+float* JxlCmsGetSrcBuf(void* cms_data, size_t thread) {
+  return static_cast<JxlCms*>(cms_data)->buf_src.Row(thread);
+}
+
+// `get_dst_buf` callback: returns the destination row buffer for `thread`
+// (row `thread` of the JxlCms destination buffer image).
+float* JxlCmsGetDstBuf(void* cms_data, size_t thread) {
+  return static_cast<JxlCms*>(cms_data)->buf_dst.Row(thread);
+}
+
+}  // namespace
+
+// Returns the built-in color management implementation as a callback table.
+// The table is a function-local static constant, so the returned reference
+// stays valid for the lifetime of the program.
+const JxlCmsInterface& GetJxlCms() {
+  static constexpr JxlCmsInterface kInterface = {
+      nullptr,                 // init_data
+      &JxlCmsInit,             // init
+      &JxlCmsGetSrcBuf,        // get_src_buf
+      &JxlCmsGetDstBuf,        // get_dst_buf
+      &DoColorSpaceTransform,  // run
+      &JxlCmsDestroy,          // destroy
+  };
+  return kInterface;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_color_management.h b/third_party/jpeg-xl/lib/jxl/enc_color_management.h
new file mode 100644
index 0000000000..6f6e9023a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_color_management.h
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_H_
+#define LIB_JXL_ENC_COLOR_MANAGEMENT_H_
+
+// ICC profiles and color space conversions.
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// Internal C++ wrapper for a JxlCmsInterface.
+//
+// Owns the opaque state returned by the interface's `init` callback and
+// releases it through `destroy`, either on destruction or when Init() is
+// called again. Also keeps copies of the source and destination ICC profiles
+// alive, because the interface may retain pointers into them.
+class ColorSpaceTransform {
+ public:
+  explicit ColorSpaceTransform(const JxlCmsInterface& cms) : cms_(cms) {}
+  ~ColorSpaceTransform() { Reset(); }
+
+  // Cannot copy.
+  ColorSpaceTransform(const ColorSpaceTransform&) = delete;
+  ColorSpaceTransform& operator=(const ColorSpaceTransform&) = delete;
+
+  // Prepares a transform from `c_src` to `c_dst` for rows of `xsize` pixels
+  // processed by up to `num_threads` threads. Fails if `c_dst` is CMYK or if
+  // the interface's `init` callback returns null. May be called again; state
+  // from an earlier successful call is released first.
+  Status Init(const ColorEncoding& c_src, const ColorEncoding& c_dst,
+              float intensity_target, size_t xsize, size_t num_threads) {
+    // Reject unsupported destinations before touching any state.
+    if (c_dst.IsCMYK()) {
+      return JXL_FAILURE("Conversion to CMYK is not supported");
+    }
+    // Release state from a previous Init() before replacing the ICC buffers
+    // it may point into; otherwise it would leak.
+    Reset();
+
+    xsize_ = xsize;
+    JxlColorProfile input_profile;
+    icc_src_ = c_src.ICC();
+    input_profile.icc.data = icc_src_.data();
+    input_profile.icc.size = icc_src_.size();
+    ConvertInternalToExternalColorEncoding(c_src,
+                                           &input_profile.color_encoding);
+    input_profile.num_channels = c_src.IsCMYK() ? 4 : c_src.Channels();
+    JxlColorProfile output_profile;
+    icc_dst_ = c_dst.ICC();
+    output_profile.icc.data = icc_dst_.data();
+    output_profile.icc.size = icc_dst_.size();
+    ConvertInternalToExternalColorEncoding(c_dst,
+                                           &output_profile.color_encoding);
+    output_profile.num_channels = c_dst.Channels();
+    cms_data_ = cms_.init(cms_.init_data, num_threads, xsize, &input_profile,
+                          &output_profile, intensity_target);
+    JXL_RETURN_IF_ERROR(cms_data_ != nullptr);
+    return true;
+  }
+
+  // Per-thread source buffer provided by the interface. Only meaningful after
+  // a successful Init().
+  float* BufSrc(const size_t thread) const {
+    return cms_.get_src_buf(cms_data_, thread);
+  }
+
+  // Per-thread destination buffer provided by the interface. Only meaningful
+  // after a successful Init().
+  float* BufDst(const size_t thread) const {
+    return cms_.get_dst_buf(cms_data_, thread);
+  }
+
+  // Transforms one row of `xsize` pixels (as passed to Init()) from `buf_src`
+  // to `buf_dst` on behalf of `thread`.
+  Status Run(const size_t thread, const float* buf_src, float* buf_dst) {
+    return cms_.run(cms_data_, thread, buf_src, buf_dst, xsize_);
+  }
+
+ private:
+  // Destroys the interface state, if any, and marks it as released.
+  void Reset() {
+    if (cms_data_ != nullptr) {
+      cms_.destroy(cms_data_);
+      cms_data_ = nullptr;
+    }
+  }
+
+  JxlCmsInterface cms_;
+  void* cms_data_ = nullptr;
+  // The interface may retain pointers into these.
+  PaddedBytes icc_src_;
+  PaddedBytes icc_dst_;
+  // Row width in pixels; 0 until Init() is called.
+  size_t xsize_ = 0;
+};
+
+const JxlCmsInterface& GetJxlCms();
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COLOR_MANAGEMENT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_comparator.cc b/third_party/jpeg-xl/lib/jxl/enc_comparator.cc
new file mode 100644
index 0000000000..cbdd0f78d9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_comparator.cc
@@ -0,0 +1,130 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_comparator.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_gamma_correct.h"
+#include "lib/jxl/enc_image_bundle.h"
+
+namespace jxl {
+namespace {
+
+// color is linear, but blending happens in gamma-compressed space using
+// (gamma-compressed) grayscale background color, alpha image holds the
+// foreground weights (<= 0 selects the background, >= 1 the input color),
+// output image is in linear space.
+void AlphaBlend(const Image3F& in, const size_t c, float background_linear,
+                const ImageF& alpha, Image3F* out) {
+  const float background = LinearToSrgb8Direct(background_linear);
+
+  for (size_t y = 0; y < out->ysize(); ++y) {
+    const float* JXL_RESTRICT row_a = alpha.ConstRow(y);
+    const float* JXL_RESTRICT row_i = in.ConstPlaneRow(c, y);
+    float* JXL_RESTRICT row_o = out->PlaneRow(c, y);
+    for (size_t x = 0; x < out->xsize(); ++x) {
+      const float a = row_a[x];
+      if (a <= 0.f) {
+        row_o[x] = background_linear;
+      } else if (a >= 1.f) {
+        row_o[x] = row_i[x];
+      } else {
+        const float w_fg = a;
+        const float w_bg = 1.0f - w_fg;
+        const float fg = w_fg * LinearToSrgb8Direct(row_i[x]);
+        const float bg = w_bg * background;
+        row_o[x] = Srgb8ToLinearDirect(fg + bg);
+      }
+    }
+  }
+}
+
+void AlphaBlend(float background_linear, ImageBundle* io_linear_srgb) {
+  // No alpha => all opaque.
+  if (!io_linear_srgb->HasAlpha()) return;
+
+  for (size_t c = 0; c < 3; ++c) {
+    AlphaBlend(*io_linear_srgb->color(), c, background_linear,
+               *io_linear_srgb->alpha(), io_linear_srgb->color());
+  }
+}
+
+float ComputeScoreImpl(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                       Comparator* comparator, ImageF* distmap) {
+  JXL_CHECK(comparator->SetReferenceImage(rgb0));
+  float score;
+  JXL_CHECK(comparator->CompareWith(rgb1, distmap, &score));
+  return score;
+}
+
+}  // namespace
+
+float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                   Comparator* comparator, const JxlCmsInterface& cms,
+                   ImageF* diffmap, ThreadPool* pool) {
+  PROFILER_FUNC;
+  // Convert to linear sRGB (unless already in that space)
+  ImageMetadata metadata0 = *rgb0.metadata();
+  ImageBundle store0(&metadata0);
+  const ImageBundle* linear_srgb0;
+  JXL_CHECK(TransformIfNeeded(rgb0, ColorEncoding::LinearSRGB(rgb0.IsGray()),
+                              cms, pool, &store0, &linear_srgb0));
+  ImageMetadata metadata1 = *rgb1.metadata();
+  ImageBundle store1(&metadata1);
+  const ImageBundle* linear_srgb1;
+  JXL_CHECK(TransformIfNeeded(rgb1, ColorEncoding::LinearSRGB(rgb1.IsGray()),
+                              cms, pool, &store1, &linear_srgb1));
+
+  // No alpha: skip blending, only need a single call to Butteraugli.
+  if (!rgb0.HasAlpha() && !rgb1.HasAlpha()) {
+    return ComputeScoreImpl(*linear_srgb0, *linear_srgb1, comparator, diffmap);
+  }
+
+  // Blend both images on a black (0.0) and on a white (1.0) background; the
+  // reported score/diffmap is the worse (max) of the two comparisons.
+  ImageBundle blended_black0 = linear_srgb0->Copy();
+  ImageBundle blended_black1 = linear_srgb1->Copy();
+  AlphaBlend(/*background_linear=*/0.0f, &blended_black0);
+  AlphaBlend(/*background_linear=*/0.0f, &blended_black1);
+
+  ImageBundle blended_white0 = linear_srgb0->Copy();
+  ImageBundle blended_white1 = linear_srgb1->Copy();
+  AlphaBlend(/*background_linear=*/1.0f, &blended_white0);
+  AlphaBlend(/*background_linear=*/1.0f, &blended_white1);
+
+  // Skip the per-pixel maps unless the caller asked for a diffmap.
+  ImageF diffmap_black, diffmap_white;
+  ImageF* const map_black = diffmap != nullptr ? &diffmap_black : nullptr;
+  ImageF* const map_white = diffmap != nullptr ? &diffmap_white : nullptr;
+  const float dist_black =
+      ComputeScoreImpl(blended_black0, blended_black1, comparator, map_black);
+  const float dist_white =
+      ComputeScoreImpl(blended_white0, blended_white1, comparator, map_white);
+
+  // diffmap and return values are the max of diffmap_black/white.
+  if (diffmap != nullptr) {
+    const size_t xsize = rgb0.xsize();
+    const size_t ysize = rgb0.ysize();
+    *diffmap = ImageF(xsize, ysize);
+    for (size_t y = 0; y < ysize; ++y) {
+      const float* JXL_RESTRICT row_black = diffmap_black.ConstRow(y);
+      const float* JXL_RESTRICT row_white = diffmap_white.ConstRow(y);
+      float* JXL_RESTRICT row_out = diffmap->Row(y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = std::max(row_black[x], row_white[x]);
+      }
+    }
+  }
+  return std::max(dist_black, dist_white);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_comparator.h b/third_party/jpeg-xl/lib/jxl/enc_comparator.h
new file mode 100644
index 0000000000..0ac4df8296
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_comparator.h
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_COMPARATOR_H_
+#define LIB_JXL_ENC_COMPARATOR_H_
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+class Comparator {
+ public:
+  virtual ~Comparator() = default;
+
+  // Sets the reference image, the first to compare
+  // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as
+  // the range from standard black point to standard white point, but values
+  // outside are permitted.
+  virtual Status SetReferenceImage(const ImageBundle& ref) = 0;
+
+  // Sets the actual image (with loss), the second to compare
+  // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as
+  // the range from standard black point to standard white point, but values
+  // outside are permitted.
+  // In diffmap it outputs the local score per pixel, while in score it outputs
+  // a single score. Any one may be set to nullptr to not compute it.
+  virtual Status CompareWith(const ImageBundle& actual, ImageF* diffmap,
+                             float* score) = 0;
+
+  // Quality thresholds for diffmap and score values.
+  // The good score must represent a value where the images are considered to
+  // be perceptually indistinguishable (but not identical)
+  // The bad value must be larger than good to indicate "lower means better"
+  // and smaller than good to indicate "higher means better"
+  virtual float GoodQualityScore() const = 0;
+  virtual float BadQualityScore() const = 0;
+};
+
+// Computes the score given images in any RGB color model, optionally with
+// alpha channel.
+float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                   Comparator* comparator, const JxlCmsInterface& cms,
+                   ImageF* diffmap = nullptr, ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COMPARATOR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_context_map.cc b/third_party/jpeg-xl/lib/jxl/enc_context_map.cc
new file mode 100644
index 0000000000..842dd12423
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_context_map.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Library to encode the context map.
+
+#include "lib/jxl/enc_context_map.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/entropy_coder.h"
+
+namespace jxl {
+
+namespace {
+
+// Returns the position of the first element of `v` equal to `value`, or
+// v.size() when there is no such element.
+size_t IndexOf(const std::vector<uint8_t>& v, uint8_t value) {
+  const auto first = v.begin();
+  const auto match = std::find(first, v.end(), value);
+  return static_cast<size_t>(match - first);
+}
+
+// Moves (*v)[index] to position 0; the elements that preceded it each shift
+// one slot towards the back and everything after `index` is untouched.
+void MoveToFront(std::vector<uint8_t>* v, size_t index) {
+  const auto front = v->begin();
+  const auto picked = front + static_cast<std::ptrdiff_t>(index);
+  std::rotate(front, picked, picked + 1);
+}
+
+std::vector<uint8_t> MoveToFrontTransform(const std::vector<uint8_t>& v) {
+  if (v.empty()) return v;
+  uint8_t max_value = *std::max_element(v.begin(), v.end());
+  std::vector<uint8_t> mtf(max_value + 1);
+  for (size_t i = 0; i <= max_value; ++i) mtf[i] = i;
+  std::vector<uint8_t> result(v.size());
+  for (size_t i = 0; i < v.size(); ++i) {
+    size_t index = IndexOf(mtf, v[i]);
+    JXL_ASSERT(index < mtf.size());
+    result[i] = static_cast<uint8_t>(index);
+    MoveToFront(&mtf, index);
+  }
+  return result;
+}
+
+}  // namespace
+
+void EncodeContextMap(const std::vector<uint8_t>& context_map,
+                      size_t num_histograms, BitWriter* writer, size_t layer,
+                      AuxOut* aux_out) {
+  if (num_histograms == 1) {
+    // Simple code
+    writer->Write(1, 1);
+    // 0 bits per entry.
+    writer->Write(2, 0);
+    return;
+  }
+
+  std::vector<uint8_t> transformed_symbols = MoveToFrontTransform(context_map);
+  std::vector<std::vector<Token>> tokens(1), mtf_tokens(1);
+  EntropyEncodingData codes;
+  std::vector<uint8_t> dummy_context_map;
+  for (size_t i = 0; i < context_map.size(); i++) {
+    tokens[0].emplace_back(0, context_map[i]);
+  }
+  for (size_t i = 0; i < transformed_symbols.size(); i++) {
+    mtf_tokens[0].emplace_back(0, transformed_symbols[i]);
+  }
+  HistogramParams params;
+  params.uint_method = HistogramParams::HybridUintMethod::kContextMap;
+  size_t ans_cost = BuildAndEncodeHistograms(
+      params, 1, tokens, &codes, &dummy_context_map, nullptr, 0, nullptr);
+  size_t mtf_cost = BuildAndEncodeHistograms(
+      params, 1, mtf_tokens, &codes, &dummy_context_map, nullptr, 0, nullptr);
+  bool use_mtf = mtf_cost < ans_cost;
+  // Rebuild token list.
+  tokens[0].clear();
+  for (size_t i = 0; i < transformed_symbols.size(); i++) {
+    tokens[0].emplace_back(0,
+                           use_mtf ? transformed_symbols[i] : context_map[i]);
+  }
+  size_t entry_bits = CeilLog2Nonzero(num_histograms);
+  size_t simple_cost = entry_bits * context_map.size();
+  if (entry_bits < 4 && simple_cost < ans_cost && simple_cost < mtf_cost) {
+    writer->Write(1, 1);
+    writer->Write(2, entry_bits);
+    for (size_t i = 0; i < context_map.size(); i++) {
+      writer->Write(entry_bits, context_map[i]);
+    }
+  } else {
+    writer->Write(1, 0);
+    writer->Write(1, use_mtf);  // Use/don't use MTF.
+    BuildAndEncodeHistograms(params, 1, tokens, &codes, &dummy_context_map,
+                             writer, layer, aux_out);
+    WriteTokens(tokens[0], codes, dummy_context_map, writer);
+  }
+}
+
+void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer,
+                       AuxOut* aux_out) {
+  auto& dct = block_ctx_map.dc_thresholds;
+  auto& qft = block_ctx_map.qf_thresholds;
+  auto& ctx_map = block_ctx_map.ctx_map;
+  BitWriter::Allotment allotment(
+      writer,
+      (dct[0].size() + dct[1].size() + dct[2].size() + qft.size()) * 34 + 1 +
+          4 + 4 + ctx_map.size() * 10 + 1024);
+  if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() &&
+      ctx_map.size() == 21 &&
+      std::equal(ctx_map.begin(), ctx_map.end(), BlockCtxMap::kDefaultCtxMap)) {
+    writer->Write(1, 1);  // default
+    allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
+    return;
+  }
+  writer->Write(1, 0);
+  for (int j : {0, 1, 2}) {
+    writer->Write(4, dct[j].size());
+    for (int i : dct[j]) {
+      JXL_CHECK(U32Coder::Write(kDCThresholdDist, PackSigned(i), writer));
+    }
+  }
+  writer->Write(4, qft.size());
+  for (uint32_t i : qft) {
+    JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, writer));
+  }
+  EncodeContextMap(ctx_map, block_ctx_map.num_ctxs, writer, kLayerAC, aux_out);
+  allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_context_map.h b/third_party/jpeg-xl/lib/jxl/enc_context_map.h
new file mode 100644
index 0000000000..041e71de7a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_context_map.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_CONTEXT_MAP_H_
+#define LIB_JXL_ENC_CONTEXT_MAP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Max limit is 255 because encoding assumes numbers < 255
+// More clusters can help compression, but makes encode/decode somewhat slower
+static const size_t kClustersLimit = 128;
+
+// Encodes the given context map to the bit stream. The number of different
+// histogram ids is given by num_histograms.
+void EncodeContextMap(const std::vector<uint8_t>& context_map,
+                      size_t num_histograms, BitWriter* writer, size_t layer,
+                      AuxOut* aux_out);
+
+void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer,
+                       AuxOut* aux_out);
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CONTEXT_MAP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc
new file mode 100644
index 0000000000..5819036987
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc
@@ -0,0 +1,626 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_detect_dots.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdio>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_detect_dots.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_linalg.h"
+#include "lib/jxl/enc_optimize.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+// Set JXL_DEBUG_DOT_DETECT to 1 to enable debugging.
+#ifndef JXL_DEBUG_DOT_DETECT
+#define JXL_DEBUG_DOT_DETECT 0
+#endif
+
+#if JXL_DEBUG_DOT_DETECT
+#include "lib/jxl/enc_aux_out.h"
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Sub;
+
+ImageF SumOfSquareDifferences(const Image3F& forig, const Image3F& smooth,
+                              ThreadPool* pool) {
+  const HWY_FULL(float) d;
+  const auto color_coef0 = Set(d, 0.0f);
+  const auto color_coef1 = Set(d, 10.0f);
+  const auto color_coef2 = Set(d, 0.0f);
+
+  ImageF sum_of_squares(forig.xsize(), forig.ysize());
+  JXL_CHECK(RunOnPool(
+      pool, 0, forig.ysize(), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t thread) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT orig_row0 = forig.Plane(0).ConstRow(y);
+        const float* JXL_RESTRICT orig_row1 = forig.Plane(1).ConstRow(y);
+        const float* JXL_RESTRICT orig_row2 = forig.Plane(2).ConstRow(y);
+        const float* JXL_RESTRICT smooth_row0 = smooth.Plane(0).ConstRow(y);
+        const float* JXL_RESTRICT smooth_row1 = smooth.Plane(1).ConstRow(y);
+        const float* JXL_RESTRICT smooth_row2 = smooth.Plane(2).ConstRow(y);
+        float* JXL_RESTRICT sos_row = sum_of_squares.Row(y);
+
+        for (size_t x = 0; x < forig.xsize(); x += Lanes(d)) {
+          auto v0 = Sub(Load(d, orig_row0 + x), Load(d, smooth_row0 + x));
+          auto v1 = Sub(Load(d, orig_row1 + x), Load(d, smooth_row1 + x));
+          auto v2 = Sub(Load(d, orig_row2 + x), Load(d, smooth_row2 + x));
+          v0 = Mul(Mul(v0, v0), color_coef0);
+          v1 = Mul(Mul(v1, v1), color_coef1);
+          v2 = Mul(Mul(v2, v2), color_coef2);
+          const auto sos =
+              Add(v0, Add(v1, v2));  // weighted sum of square diffs
+          Store(sos, d, sos_row + x);
+        }
+      },
+      "ComputeEnergyImage"));
+  return sum_of_squares;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(SumOfSquareDifferences);  // Local function
+
+const int kEllipseWindowSize = 5;
+
+namespace {
+struct GaussianEllipse {
+  double x;                         // position in x
+  double y;                         // position in y
+  double sigma_x;                   // scale in x
+  double sigma_y;                   // scale in y
+  double angle;                     // ellipse rotation in radians
+  std::array<double, 3> intensity;  // intensity in each channel
+
+  // The following variables do not need to be encoded
+  double l2_loss;  // error after the Gaussian was fit
+  double l1_loss;
+  double ridge_loss;              // the l2_loss plus regularization term
+  double custom_loss;             // experimental custom loss
+  std::array<double, 3> bgColor;  // best background color
+  size_t neg_pixels;  // number of negative pixels when subtracting dot
+  std::array<double, 3> neg_value;  // debt due to channel truncation
+};
+double DotGaussianModel(double dx, double dy, double ct, double st,
+                        double sigma_x, double sigma_y, double intensity) {
+  double rx = ct * dx + st * dy;   // (dx, dy) expressed in axes rotated by
+  double ry = -st * dx + ct * dy;  // the angle with cosine ct and sine st
+  double md = (rx * rx / sigma_x) + (ry * ry / sigma_y);  // sigmas not squared
+  double value = intensity * exp(-0.5 * md);
+  return value;
+}
+
+constexpr bool kOptimizeBackground = true;
+
+// Gaussian that smooths noise but preserves dots
+const WeightsSeparable5& WeightsSeparable5Gaussian0_65() {
+  constexpr float w0 = 0.558311f;
+  constexpr float w1 = 0.210395f;
+  constexpr float w2 = 0.010449f;
+  static constexpr WeightsSeparable5 weights = {
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+  return weights;
+}
+
+// (Iterated) Gaussian that removes dots.
+const WeightsSeparable5& WeightsSeparable5Gaussian3() {
+  constexpr float w0 = 0.222338f;
+  constexpr float w1 = 0.210431f;
+  constexpr float w2 = 0.1784f;
+  static constexpr WeightsSeparable5 weights = {
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+  return weights;
+}
+
+ImageF ComputeEnergyImage(const Image3F& orig, Image3F* smooth,
+                          ThreadPool* pool) {
+  PROFILER_FUNC;
+
+  // Prepare guidance images for dot selection.
+  Image3F forig(orig.xsize(), orig.ysize());
+  *smooth = Image3F(orig.xsize(), orig.ysize());
+  Rect rect(orig);
+
+  const auto& weights1 = WeightsSeparable5Gaussian0_65();
+  const auto& weights3 = WeightsSeparable5Gaussian3();
+
+  for (size_t c = 0; c < 3; ++c) {
+    // Use forig as temporary storage to reduce memory and keep it warmer.
+    Separable5(orig.Plane(c), rect, weights3, pool, &forig.Plane(c));
+    Separable5(forig.Plane(c), rect, weights3, pool, &smooth->Plane(c));
+    Separable5(orig.Plane(c), rect, weights1, pool, &forig.Plane(c));
+  }
+
+#if JXL_DEBUG_DOT_DETECT
+  AuxOut aux;
+  aux.debug_prefix = "/tmp/sebastian/";
+  aux.DumpImage("filtered", forig);
+  aux.DumpImage("sm", *smooth);
+#endif
+
+  return HWY_DYNAMIC_DISPATCH(SumOfSquareDifferences)(forig, *smooth, pool);
+}
+
+struct Pixel {
+  int x;
+  int y;
+};
+
+Pixel operator+(const Pixel& a, const Pixel& b) {
+  return Pixel{a.x + b.x, a.y + b.y};
+}
+
+// Maximum area in pixels of an ellipse
+const size_t kMaxCCSize = 1000;
+
+// Flood-fills the 8-connected component around `seed` (pixels > threshold,
+// zeroed once queued) into `pixels`; false if it outgrows kMaxCCSize.
+bool ExtractComponent(ImageF* img, std::vector<Pixel>* pixels,
+                      const Pixel& seed, double threshold) {
+  PROFILER_FUNC;
+  static const std::vector<Pixel> neighbors{{1, -1}, {1, 0}, {1, 1}, {0, -1},
+                                            {0, 1}, {-1, -1}, {-1, 1}, {-1, 0}};
+  std::vector<Pixel> q{seed};
+  while (!q.empty()) {
+    Pixel current = q.back();
+    q.pop_back();
+    pixels->push_back(current);
+    if (pixels->size() > kMaxCCSize) return false;
+    for (const Pixel& delta : neighbors) {
+      Pixel child = current + delta;
+      if (child.x >= 0 && static_cast<size_t>(child.x) < img->xsize() &&
+          child.y >= 0 && static_cast<size_t>(child.y) < img->ysize()) {
+        float* value = &img->Row(child.y)[child.x];
+        if (*value > threshold) {
+          *value = 0.0;
+          q.push_back(child);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+inline bool PointInRect(const Rect& r, const Pixel& p) {
+  return (static_cast<size_t>(p.x) >= r.x0() &&
+          static_cast<size_t>(p.x) < (r.x0() + r.xsize()) &&
+          static_cast<size_t>(p.y) >= r.y0() &&
+          static_cast<size_t>(p.y) < (r.y0() + r.ysize()));
+}
+
+struct ConnectedComponent {
+  // Takes ownership of `pixels`; the stats below are unset until CompStats().
+  ConnectedComponent(const Rect& bounds, std::vector<Pixel>&& pixels)
+      : bounds(bounds), pixels(std::move(pixels)) {}
+  Rect bounds;
+  std::vector<Pixel> pixels;
+  float maxEnergy;
+  float meanEnergy;
+  float varEnergy;
+  float meanBg;
+  float varBg;
+  float score;
+  Pixel mode;
+
+  // Scans `bounds` grown by `extra` pixels on each side (clipped to `energy`):
+  // samples inside `bounds` feed mean/varEnergy, the rest feeds mean/varBg,
+  // and `mode` becomes the position of the largest positive energy.
+  void CompStats(const ImageF& energy, int extra) {
+    PROFILER_FUNC;
+    maxEnergy = meanEnergy = varEnergy = meanBg = varBg = 0.0f;
+    int nIn = 0;
+    int nOut = 0;
+    mode = Pixel{0, 0};
+    for (int sy = -extra; sy < (static_cast<int>(bounds.ysize()) + extra);
+         sy++) {
+      int y = sy + static_cast<int>(bounds.y0());
+      if (y < 0 || static_cast<size_t>(y) >= energy.ysize()) continue;
+      const float* JXL_RESTRICT erow = energy.ConstRow(y);
+      for (int sx = -extra; sx < (static_cast<int>(bounds.xsize()) + extra);
+           sx++) {
+        int x = sx + static_cast<int>(bounds.x0());
+        if (x < 0 || static_cast<size_t>(x) >= energy.xsize()) continue;
+        if (erow[x] > maxEnergy) {
+          maxEnergy = erow[x];
+          mode.x = x;
+          mode.y = y;
+        }
+        if (PointInRect(bounds, Pixel{x, y})) {
+          meanEnergy += erow[x];
+          varEnergy += erow[x] * erow[x];
+          nIn++;
+        } else {
+          meanBg += erow[x];
+          varBg += erow[x] * erow[x];
+          nOut++;
+        }
+      }
+    }
+    meanEnergy = meanEnergy / nIn;
+    meanBg = meanBg / nOut;
+    varEnergy = (varEnergy / nIn) - meanEnergy * meanEnergy;
+    varBg = (varBg / nOut) - meanBg * meanBg;
+    // Contrast against the surroundings, in background standard deviations.
+    score = (meanEnergy - meanBg) / std::sqrt(varBg);
+  }
+};
+
+Rect BoundingRectangle(const std::vector<Pixel>& pixels) {
+  PROFILER_FUNC;
+  JXL_ASSERT(!pixels.empty());
+  int low_x, high_x, low_y, high_y;
+  low_x = high_x = pixels[0].x;
+  low_y = high_y = pixels[0].y;
+  for (const Pixel& p : pixels) {
+    low_x = std::min(low_x, p.x);
+    high_x = std::max(high_x, p.x);
+    low_y = std::min(low_y, p.y);
+    high_y = std::max(high_y, p.y);
+  }
+  return Rect(low_x, low_y, high_x - low_x + 1, high_y - low_y + 1);
+}
+
+// Returns the components of `energy` seeded by a pixel > t_high and grown over
+// pixels > t_low that fit in maxWindow and do not score below minScore.
+std::vector<ConnectedComponent> FindCC(const ImageF& energy, double t_low,
+                                       double t_high, uint32_t maxWindow,
+                                       double minScore) {
+  PROFILER_FUNC;
+  const int kExtraRect = 4;
+  ImageF img = CopyImage(energy);
+  std::vector<ConnectedComponent> ans;
+  for (size_t y = 0; y < img.ysize(); y++) {
+    float* JXL_RESTRICT row = img.Row(y);
+    for (size_t x = 0; x < img.xsize(); x++) {
+      if (!(row[x] > t_high)) continue;
+      std::vector<Pixel> pixels;
+      row[x] = 0.0;
+      bool success = ExtractComponent(
+          &img, &pixels, Pixel{static_cast<int>(x), static_cast<int>(y)},
+          t_low);
+      if (!success) continue;
+#if JXL_DEBUG_DOT_DETECT
+      for (size_t i = 0; i < pixels.size(); i++) {
+        fprintf(stderr, "(%d,%d) ", pixels[i].x, pixels[i].y);
+      }
+      fprintf(stderr, "\n");
+#endif  // JXL_DEBUG_DOT_DETECT
+      Rect bounds = BoundingRectangle(pixels);
+      if (bounds.xsize() >= maxWindow || bounds.ysize() >= maxWindow) continue;
+      ConnectedComponent cc{bounds, std::move(pixels)};
+      cc.CompStats(energy, kExtraRect);
+      if (cc.score < minScore) continue;
+      JXL_DEBUG(JXL_DEBUG_DOT_DETECT,
+                "cc mode: (%d,%d), max: %f, mean: %f var: "
+                "%f bound:(%" PRIuS ",%" PRIuS ",%" PRIuS ",%" PRIuS ")\n",
+                cc.mode.x, cc.mode.y, cc.maxEnergy, cc.meanEnergy,
+                cc.varEnergy, cc.bounds.x0(), cc.bounds.y0(),
+                cc.bounds.xsize(), cc.bounds.ysize());
+      ans.push_back(std::move(cc));
+    }
+  }
+  return ans;
+}
+
+// TODO (sggonzalez): Adapt this function for the different color spaces or
+// remove it if the color space with the best performance does not need it
+void ComputeDotLosses(GaussianEllipse* ellipse, const ConnectedComponent& cc,
+                      const Image3F& img, const Image3F& background) {
+  PROFILER_FUNC;
+  const int rectBounds = 2;
+  const double kIntensityR = 0.0;   // 0.015;
+  const double kSigmaR = 0.0;       // 0.01;
+  const double kZeroEpsilon = 0.1;  // Tolerance to consider a value negative
+  double ct = cos(ellipse->angle), st = sin(ellipse->angle);
+  const std::array<double, 3> channelGains{{1.0, 1.0, 1.0}};
+  int N = 0;
+  ellipse->l1_loss = 0.0;
+  ellipse->l2_loss = 0.0;
+  ellipse->neg_pixels = 0;
+  ellipse->neg_value.fill(0.0);
+  double distMeanModeSq = (cc.mode.x - ellipse->x) * (cc.mode.x - ellipse->x) +
+                          (cc.mode.y - ellipse->y) * (cc.mode.y - ellipse->y);
+  ellipse->custom_loss = 0.0;
+  for (int c = 0; c < 3; c++) {
+    for (int sy = -rectBounds;
+         sy < (static_cast<int>(cc.bounds.ysize()) + rectBounds); sy++) {
+      int y = sy + cc.bounds.y0();
+      if (y < 0 || static_cast<size_t>(y) >= img.ysize()) continue;
+      const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y);
+      // bgrow is only used if kOptimizeBackground is false.
+      // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
+      const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y);
+      for (int sx = -rectBounds;
+           sx < (static_cast<int>(cc.bounds.xsize()) + rectBounds); sx++) {
+        int x = sx + cc.bounds.x0();
+        if (x < 0 || static_cast<size_t>(x) >= img.xsize()) continue;
+        double target = row[x];
+        double dotDelta = DotGaussianModel(
+            x - ellipse->x, y - ellipse->y, ct, st, ellipse->sigma_x,
+            ellipse->sigma_y, ellipse->intensity[c]);
+        if (dotDelta > target + kZeroEpsilon) {
+          ellipse->neg_pixels++;
+          ellipse->neg_value[c] += dotDelta - target;
+        }
+        double bkg = kOptimizeBackground ? ellipse->bgColor[c] : bgrow[x];
+        double pred = bkg + dotDelta;
+        double diff = target - pred;
+        double l2 = channelGains[c] * diff * diff;
+        double l1 = channelGains[c] * std::fabs(diff);
+        ellipse->l2_loss += l2;
+        ellipse->l1_loss += l1;
+        double w = DotGaussianModel(x - cc.mode.x, y - cc.mode.y, 1.0, 0.0,
+                                    1.0 + ellipse->sigma_x,
+                                    1.0 + ellipse->sigma_y, 1.0);
+        ellipse->custom_loss += w * l2;
+        N++;
+      }
+    }
+  }
+  ellipse->l2_loss /= N;
+  ellipse->custom_loss /= N;
+  ellipse->custom_loss += 20.0 * distMeanModeSq + ellipse->neg_value[1];
+  ellipse->l1_loss /= N;
+  double ridgeTerm = kSigmaR * ellipse->sigma_x + kSigmaR * ellipse->sigma_y;
+  for (int c = 0; c < 3; c++) {
+    ridgeTerm += kIntensityR * ellipse->intensity[c] * ellipse->intensity[c];
+  }
+  ellipse->ridge_loss = ellipse->l2_loss + ridgeTerm;
+}
+
+GaussianEllipse FitGaussianFast(const ConnectedComponent& cc,
+                                const ImageF& energy, const Image3F& img,
+                                const Image3F& background) {
+  PROFILER_FUNC;
+  constexpr bool leastSqIntensity = true;
+  constexpr double kEpsilon = 1e-6;
+  GaussianEllipse ans;
+  constexpr int kRectBounds = (kEllipseWindowSize >> 1);
+
+  // Compute the 1st and 2nd moments of the CC
+  double sum = 0.0;
+  int N = 0;
+  std::array<double, 3> m1{{0.0, 0.0, 0.0}};
+  std::array<double, 3> m2{{0.0, 0.0, 0.0}};
+  std::array<double, 3> color{{0.0, 0.0, 0.0}};
+  std::array<double, 3> bgColor{{0.0, 0.0, 0.0}};
+
+  JXL_DEBUG(JXL_DEBUG_DOT_DETECT,
+            "%" PRIuS " %" PRIuS " %" PRIuS " %" PRIuS "\n", cc.bounds.x0(),
+            cc.bounds.y0(), cc.bounds.xsize(), cc.bounds.ysize());
+  for (int c = 0; c < 3; c++) {
+    color[c] = img.ConstPlaneRow(c, cc.mode.y)[cc.mode.x] -
+               background.ConstPlaneRow(c, cc.mode.y)[cc.mode.x];
+  }
+  double sign = (color[1] > 0) ? 1 : -1;
+  for (int sy = -kRectBounds; sy <= kRectBounds; sy++) {
+    int y = sy + cc.mode.y;
+    if (y < 0 || static_cast<size_t>(y) >= energy.ysize()) continue;
+    const float* JXL_RESTRICT row = img.ConstPlaneRow(1, y);
+    const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(1, y);
+    for (int sx = -kRectBounds; sx <= kRectBounds; sx++) {
+      int x = sx + cc.mode.x;
+      if (x < 0 || static_cast<size_t>(x) >= energy.xsize()) continue;
+      double w = std::max(kEpsilon, sign * (row[x] - bgrow[x]));
+      sum += w;
+
+      m1[0] += w * x;
+      m1[1] += w * y;
+      m2[0] += w * x * x;
+      m2[1] += w * x * y;
+      m2[2] += w * y * y;
+      for (int c = 0; c < 3; c++) {
+        bgColor[c] += background.ConstPlaneRow(c, y)[x];
+      }
+      N++;
+    }
+  }
+  JXL_CHECK(N > 0);
+
+  for (int i = 0; i < 3; i++) {
+    m1[i] /= sum;
+    m2[i] /= sum;
+    bgColor[i] /= N;
+  }
+
+  // Some magic constants
+  constexpr double kSigmaMult = 1.0;
+  constexpr std::array<double, 3> kScaleMult{{1.1, 1.1, 1.1}};
+
+  // Now set the parameters of the Gaussian
+  ans.x = m1[0];
+  ans.y = m1[1];
+  for (int j = 0; j < 3; j++) {
+    ans.intensity[j] = kScaleMult[j] * color[j];
+  }
+
+  ImageD Sigma(2, 2), D(1, 2), U(2, 2);
+  Sigma.Row(0)[0] = m2[0] - m1[0] * m1[0];
+  Sigma.Row(1)[1] = m2[2] - m1[1] * m1[1];
+  Sigma.Row(0)[1] = Sigma.Row(1)[0] = m2[1] - m1[0] * m1[1];
+  ConvertToDiagonal(Sigma, &D, &U);
+  const double* JXL_RESTRICT d = D.ConstRow(0);
+  const double* JXL_RESTRICT u = U.ConstRow(1);
+  int p1 = 0, p2 = 1;
+  if (d[0] < d[1]) std::swap(p1, p2);
+  ans.sigma_x = kSigmaMult * d[p1];
+  ans.sigma_y = kSigmaMult * d[p2];
+  ans.angle = std::atan2(u[p1], u[p2]);
+  ans.l2_loss = 0.0;
+  ans.bgColor = bgColor;
+  if (leastSqIntensity) {
+    GaussianEllipse* ellipse = &ans;
+    double ct = cos(ans.angle), st = sin(ans.angle);
+    // Estimate intensity with least squares (fixed background)
+    for (int c = 0; c < 3; c++) {
+      double gg = 0.0;
+      double gd = 0.0;
+      int yc = static_cast<int>(cc.mode.y);
+      int xc = static_cast<int>(cc.mode.x);
+      for (int y = yc - kRectBounds; y <= yc + kRectBounds; y++) {
+        if (y < 0 || static_cast<size_t>(y) >= img.ysize()) continue;
+        const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y);
+        const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y);
+        for (int x = xc - kRectBounds; x <= xc + kRectBounds; x++) {
+          if (x < 0 || static_cast<size_t>(x) >= img.xsize()) continue;
+          double target = row[x] - bgrow[x];
+          double gaussian =
+              DotGaussianModel(x - ellipse->x, y - ellipse->y, ct, st,
+                               ellipse->sigma_x, ellipse->sigma_y, 1.0);
+          gg += gaussian * gaussian;
+          gd += gaussian * target;
+        }
+      }
+      ans.intensity[c] = gd / (gg + 1e-6);  // Regularized least squares
+    }
+  }
+  ComputeDotLosses(&ans, cc, img, background);
+  return ans;
+}
+
+GaussianEllipse FitGaussian(const ConnectedComponent& cc, const ImageF& energy,
+                            const Image3F& img, const Image3F& background) {
+  auto ellipse = FitGaussianFast(cc, energy, img, background);  // raw fit
+  if (ellipse.sigma_x < ellipse.sigma_y) {  // make sigma_x the larger sigma
+    std::swap(ellipse.sigma_x, ellipse.sigma_y);
+    ellipse.angle += kPi / 2.0;  // axes were swapped: rotate by 90 degrees
+  }
+  ellipse.angle -= kPi * std::floor(ellipse.angle / kPi);  // reduce modulo pi
+  if (fabs(ellipse.angle - kPi) < 1e-6 || fabs(ellipse.angle) < 1e-6) {
+    ellipse.angle = 0.0;  // snap angles within 1e-6 of 0 or pi to exactly 0
+  }
+  JXL_CHECK(ellipse.angle >= 0 && ellipse.angle <= kPi &&
+            ellipse.sigma_x >= ellipse.sigma_y);
+  JXL_DEBUG(JXL_DEBUG_DOT_DETECT,
+            "Ellipse mu=(%lf,%lf) sigma=(%lf,%lf) angle=%lf "
+            "intensity=(%lf,%lf,%lf) bg=(%lf,%lf,%lf) l2_loss=%lf "
+            "custom_loss=%lf, neg_pix=%" PRIuS ", neg_v=(%lf,%lf,%lf)\n",
+            ellipse.x, ellipse.y, ellipse.sigma_x, ellipse.sigma_y,
+            ellipse.angle, ellipse.intensity[0], ellipse.intensity[1],
+            ellipse.intensity[2], ellipse.bgColor[0], ellipse.bgColor[1],
+            ellipse.bgColor[2], ellipse.l2_loss, ellipse.custom_loss,
+            ellipse.neg_pixels, ellipse.neg_value[0], ellipse.neg_value[1],
+            ellipse.neg_value[2]);
+  return ellipse;
+}
+
+}  // namespace
+
+std::vector<PatchInfo> DetectGaussianEllipses(
+    const Image3F& opsin, const GaussianDetectParams& params,
+    const EllipseQuantParams& qParams, ThreadPool* pool) {
+  PROFILER_FUNC;
+  std::vector<PatchInfo> dots;  // result: one entry per accepted dot
+  Image3F smooth(opsin.xsize(), opsin.ysize());
+  ImageF energy = ComputeEnergyImage(opsin, &smooth, pool);
+#if JXL_DEBUG_DOT_DETECT
+  AuxOut aux;
+  aux.debug_prefix = "/tmp/sebastian/";
+  aux.DumpXybImage("smooth", smooth);
+  aux.DumpPlaneNormalized("energy", energy);
+#endif  // JXL_DEBUG_DOT_DETECT
+  std::vector<ConnectedComponent> components = FindCC(
+      energy, params.t_low, params.t_high, params.maxWinSize, params.minScore);
+  size_t numCC =  // at most maxCC and at most percCC percent of the components
+      std::min(params.maxCC, (components.size() * params.percCC) / 100);
+  if (components.size() > numCC) {  // keep only the numCC highest scores
+    std::sort(
+        components.begin(), components.end(),
+        [](const ConnectedComponent& a, const ConnectedComponent& b) -> bool {
+          return a.score > b.score;
+        });
+    components.erase(components.begin() + numCC, components.end());
+  }
+  for (const auto& cc : components) {
+    GaussianEllipse ellipse = FitGaussian(cc, energy, opsin, smooth);
+    if (ellipse.x < 0.0 ||
+        std::ceil(ellipse.x) >= static_cast<double>(opsin.xsize()) ||
+        ellipse.y < 0.0 ||
+        std::ceil(ellipse.y) >= static_cast<double>(opsin.ysize())) {
+      continue;  // fitted center is not inside the image
+    }
+    if (ellipse.neg_pixels > params.maxNegPixels) continue;
+    double intensity = 0.21 * ellipse.intensity[0] +  // weighted channel sum
+                       0.72 * ellipse.intensity[1] +
+                       0.07 * ellipse.intensity[2];
+    double intensitySq = intensity * intensity;
+    // for (int c = 0; c < 3; c++) {
+    //  intensitySq += ellipse.intensity[c] * ellipse.intensity[c];
+    //}
+    double sqDistMeanMode = (ellipse.x - cc.mode.x) * (ellipse.x - cc.mode.x) +
+                            (ellipse.y - cc.mode.y) * (ellipse.y - cc.mode.y);
+    if (ellipse.l2_loss < params.maxL2Loss &&
+        ellipse.custom_loss < params.maxCustomLoss &&
+        intensitySq > (params.minIntensity * params.minIntensity) &&
+        sqDistMeanMode < params.maxDistMeanMode * params.maxDistMeanMode) {
+      size_t x0 = cc.bounds.x0();
+      size_t y0 = cc.bounds.y0();
+      dots.emplace_back();  // new patch with the single position (x0, y0)
+      dots.back().second.emplace_back(x0, y0);
+      QuantizedPatch& patch = dots.back().first;
+      patch.xsize = cc.bounds.xsize();
+      patch.ysize = cc.bounds.ysize();
+      for (size_t y = 0; y < patch.ysize; y++) {
+        for (size_t x = 0; x < patch.xsize; x++) {
+          for (size_t c = 0; c < 3; c++) {  // patch pixel = opsin - smooth
+            patch.fpixels[c][y * patch.xsize + x] =
+                opsin.ConstPlaneRow(c, y0 + y)[x0 + x] -
+                smooth.ConstPlaneRow(c, y0 + y)[x0 + x];
+          }
+        }
+      }
+    }
+  }
+#if JXL_DEBUG_DOT_DETECT
+  JXL_DEBUG(JXL_DEBUG_DOT_DETECT, "Candidates: %" PRIuS ", Dots: %" PRIuS "\n",
+            components.size(), dots.size());
+  ApplyGaussianEllipses(&smooth, dots, 1.0);
+  aux.DumpXybImage("draw", smooth);
+  ApplyGaussianEllipses(&smooth, dots, -1.0);
+
+  auto qdots = QuantizeGaussianEllipses(dots, qParams);
+  auto deq = DequantizeGaussianEllipses(qdots, qParams);
+  ApplyGaussianEllipses(&smooth, deq, 1.0);
+  aux.DumpXybImage("qdraw", smooth);
+  ApplyGaussianEllipses(&smooth, deq, -1.0);
+#endif  // JXL_DEBUG_DOT_DETECT
+  return dots;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h
new file mode 100644
index 0000000000..c3071d9a2f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h
@@ -0,0 +1,67 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// We attempt to remove dots, or speckle, from images using Gaussian blur.
+#ifndef LIB_JXL_ENC_DETECT_DOTS_H_
+#define LIB_JXL_ENC_DETECT_DOTS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+struct GaussianDetectParams {
+  double t_high = 0;  // at least one pixel must have larger energy than t_high
+  double t_low = 0;   // all pixels must have a larger energy than t_low
+  uint32_t maxWinSize = 0;  // discard dots larger than this containing window
+  double maxL2Loss = 0;      // a dot is kept only if its l2_loss is below this
+  double maxCustomLoss = 0;  // same, for the dot's custom_loss
+  double minIntensity = 0;     // If the intensity is too low, discard it
+  double maxDistMeanMode = 0;  // The mean and the mode must be close
+  size_t maxNegPixels = 0;     // Maximum number of negative pixels
+  size_t minScore = 0;  // passed to FindCC (presumably a minimum score)
+  size_t maxCC = 50;   // Maximum number of connected components (CC) to keep
+  size_t percCC = 15;  // Percentage in [0,100] of CC to keep
+};
+
+// Ellipse quantization parameters. Fields have no default initializers.
+struct EllipseQuantParams {
+  size_t xsize;      // Image size in x
+  size_t ysize;      // Image size in y
+  size_t qPosition;  // Position quantization delta
+  // Quantization for the Gaussian sigma parameters
+  double minSigma;
+  double maxSigma;
+  size_t qSigma;  // number of quantization levels
+  // Quantization for the rotation angle (between -pi and pi -- TODO confirm)
+  size_t qAngle;
+  // Quantization for the intensity (one entry per channel)
+  std::array<double, 3> minIntensity;
+  std::array<double, 3> maxIntensity;
+  std::array<size_t, 3> qIntensity;  // number of quantization levels
+  // Extra parameters for the encoding
+  bool subtractQuantized;  // Should we subtract quantized or detected dots?
+  float ytox;  // FindDotDictionary fills this with cmap.YtoXRatio(0)
+  float ytob;  // FindDotDictionary fills this with cmap.YtoBRatio(0)
+
+  void QuantPositionSize(size_t* xsize, size_t* ysize) const;
+};
+
+// Detects dots in the XYB image `opsin`; returns one PatchInfo per kept dot.
+std::vector<PatchInfo> DetectGaussianEllipses(
+    const Image3F& opsin, const GaussianDetectParams& params,
+    const EllipseQuantParams& qParams, ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_DETECT_DOTS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc
new file mode 100644
index 0000000000..2d22c1edb8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc
@@ -0,0 +1,71 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_dot_dictionary.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include <array>
+#include <utility>
+
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_detect_dots.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// Private implementation of Dictionary Encode/Decode
+namespace {
+
+/* Quantization constants for Ellipse dots */
+const size_t kEllipsePosQ = 2;        // Quantization level for the position
+const double kEllipseMinSigma = 0.1;  // Minimum sigma value
+const double kEllipseMaxSigma = 3.1;  // Maximum Sigma value
+const size_t kEllipseSigmaQ = 16;     // Number of quantization levels for sigma
+const size_t kEllipseAngleQ = 8;      // Quantization level for the angle
+// Per-channel intensity range and level count. TODO: fix these values.
+const std::array<double, 3> kEllipseMinIntensity{{-0.05, 0.0, -0.5}};
+const std::array<double, 3> kEllipseMaxIntensity{{0.05, 1.0, 0.4}};
+const std::array<size_t, 3> kEllipseIntensityQ{{10, 36, 10}};
+}  // namespace
+
+// Returns the dot patches detected in `opsin`, or {} when dots are disabled.
+std::vector<PatchInfo> FindDotDictionary(const CompressParams& cparams,
+                                         const Image3F& opsin,
+                                         const ColorCorrelationMap& cmap,
+                                         ThreadPool* pool) {
+  if (ApplyOverride(cparams.dots,
+                    cparams.butteraugli_distance >= kMinButteraugliForDots)) {
+    GaussianDetectParams ellipse_params;  // fixed detection thresholds
+    ellipse_params.t_high = 0.04;
+    ellipse_params.t_low = 0.02;
+    ellipse_params.maxWinSize = 5;
+    ellipse_params.maxL2Loss = 0.005;
+    ellipse_params.maxCustomLoss = 300;
+    ellipse_params.minIntensity = 0.12;
+    ellipse_params.maxDistMeanMode = 1.0;
+    ellipse_params.maxNegPixels = 0;
+    ellipse_params.minScore = 12;  // size_t field: use an integer literal
+    ellipse_params.maxCC = 100;
+    ellipse_params.percCC = 100;
+    EllipseQuantParams qParams{
+        opsin.xsize(),      opsin.ysize(),        kEllipsePosQ,
+        kEllipseMinSigma,   kEllipseMaxSigma,     kEllipseSigmaQ,
+        kEllipseAngleQ,     kEllipseMinIntensity, kEllipseMaxIntensity,
+        kEllipseIntensityQ, kEllipsePosQ <= 5,    cmap.YtoXRatio(0),
+        cmap.YtoBRatio(0)};  // last three: subtractQuantized, ytox, ytob
+    return DetectGaussianEllipses(opsin, ellipse_params, qParams, pool);
+  }
+  return {};
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h
new file mode 100644
index 0000000000..2ba4393f30
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_DOT_DICTIONARY_H_
+#define LIB_JXL_ENC_DOT_DICTIONARY_H_
+
+// Dots are stored in a dictionary to avoid storing similar dots multiple
+// times.
+
+#include <stddef.h>
+
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+std::vector<PatchInfo> FindDotDictionary(const CompressParams& cparams,
+                                         const Image3F& opsin,  // XYB image
+                                         const ColorCorrelationMap& cmap,
+                                         ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_DOT_DICTIONARY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc
new file mode 100644
index 0000000000..c634445e83
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc
@@ -0,0 +1,274 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_entropy_coder.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::GetLane;
+
+// Returns number of non-zero coefficients (but skip LLF).
+// We cannot rely on block[] being all-zero bits, so first truncate to integer.
+// Also writes nzeros / covered_blocks (rounded up) to every covered 8x8 block
+int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy,
+                            const AcStrategy acs, const size_t covered_blocks,
+                            const size_t log2_covered_blocks,
+                            const int32_t* JXL_RESTRICT block,
+                            const size_t nzeros_stride,
+                            int32_t* JXL_RESTRICT nzeros_pos) {
+  const HWY_CAPPED(int32_t, kBlockDim) di;
+
+  const auto zero = Zero(di);
+  // Add FF..FF for every zero coefficient, negate to get #zeros.
+  auto neg_sum_zero = zero;
+
+  {
+    // Mask sufficient for one row of coefficients; only lanes 0..3 are FF..FF.
+    HWY_ALIGN const int32_t
+        llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = {
+            -1, -1, -1, -1};
+    // First cx elements are FF..FF only if kMaxCoeffBlocks == 4: TODO confirm.
+    const int32_t* llf_mask_pos =
+        llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx;
+
+    // Rows with LLF: mask out the LLF
+    for (size_t y = 0; y < cy; y++) {
+      for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
+        const auto llf_mask = LoadU(di, llf_mask_pos + x);
+
+        // LLF counts as zero so we don't include it in nzeros.
+        const auto coef =
+            AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x]));
+
+        neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+      }
+    }
+  }
+
+  // Remaining rows: no mask
+  for (size_t y = cy; y < cy * kBlockDim; y++) {
+    for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * cx * kBlockDim + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // We want area - sum_zero, add because neg_sum_zero is already negated.
+  const int32_t nzeros =
+      int32_t(cx * cy * kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero));
+
+  const int32_t shifted_nzeros = static_cast<int32_t>(  // rounds up
+      (nzeros + covered_blocks - 1) >> log2_covered_blocks);
+  // Need non-canonicalized dimensions!
+  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+      nzeros_pos[x + y * nzeros_stride] = shifted_nzeros;
+    }
+  }
+
+  return nzeros;
+}
+
+// Specialization for 8x8, where only top-left is LLF/DC. Returns the number
+// of non-zero coefficients except DC and also stores it in *nzeros_pos.
+int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block,
+                              int32_t* JXL_RESTRICT nzeros_pos) {
+  const HWY_CAPPED(int32_t, kBlockDim) di;
+
+  const auto zero = Zero(di);
+  // Add FF..FF for every zero coefficient, negate to get #zeros.
+  auto neg_sum_zero = zero;
+
+  {
+    // First row has DC, so mask
+    const size_t y = 0;
+    HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1};  // lane 0 only
+
+    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
+      const auto dc_mask = Load(di, dc_mask_lanes + x);
+
+      // DC counts as zero so we don't include it in nzeros.
+      const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x]));
+
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // Remaining rows: no mask
+  for (size_t y = 1; y < kBlockDim; y++) {
+    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * kBlockDim + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // We want 64 - sum_zero, add because neg_sum_zero is already negated.
+  const int32_t nzeros =
+      int32_t(kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero));
+
+  *nzeros_pos = nzeros;
+
+  return nzeros;
+}
+
+// The number of nonzeros of each block is predicted from the top and the left
+// blocks, with opportune scaling to take into account the number of blocks of
+// each strategy.  The predicted number of nonzeros divided by two is used as a
+// context; if this number is above 63, a specific context is used.  If the
+// number of nonzeros of a strategy is above 63, it is written directly using a
+// fixed number of bits (that depends on the size of the strategy).
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
+                          const Rect& rect,
+                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
+                          const AcStrategyImage& ac_strategy,
+                          YCbCrChromaSubsampling cs,
+                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
+                          std::vector<Token>* JXL_RESTRICT output,
+                          const ImageB& qdc, const ImageI& qf,
+                          const BlockCtxMap& block_ctx_map) {
+  const size_t xsize_blocks = rect.xsize();
+  const size_t ysize_blocks = rect.ysize();
+
+  // TODO(user): update the estimate: usually fewer coefficients are used.
+  output->reserve(output->size() +
+                  3 * xsize_blocks * ysize_blocks * kDCTBlockSize);
+
+  size_t offset[3] = {};  // per-channel read position within ac_rows[c]
+  const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow();
+  for (size_t by = 0; by < ysize_blocks; ++by) {
+    size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1),
+                     by >> cs.VShift(2)};
+    int32_t* JXL_RESTRICT row_nzeros[3] = {
+        tmp_num_nzeroes->PlaneRow(0, sby[0]),
+        tmp_num_nzeroes->PlaneRow(1, sby[1]),
+        tmp_num_nzeroes->PlaneRow(2, sby[2]),
+    };
+    const int32_t* JXL_RESTRICT row_nzeros_top[3] = {
+        sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1),
+        sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1),
+        sby[2] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1),
+    };
+    const uint8_t* JXL_RESTRICT row_qdc =
+        qdc.ConstRow(rect.y0() + by) + rect.x0();
+    const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by);
+    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
+    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
+      AcStrategy acs = acs_row[bx];
+      if (!acs.IsFirstBlock()) continue;
+      size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1),
+                       bx >> cs.HShift(2)};
+      size_t cx = acs.covered_blocks_x();
+      size_t cy = acs.covered_blocks_y();
+      const size_t covered_blocks = cx * cy;  // = #LLF coefficients
+      const size_t log2_covered_blocks =
+          Num0BitsBelowLS1Bit_Nonzero(covered_blocks);
+      const size_t size = covered_blocks * kDCTBlockSize;
+
+      CoefficientLayout(&cy, &cx);  // swap cx/cy to canonical order
+
+      for (int c : {1, 0, 2}) {
+        if (sbx[c] << cs.HShift(c) != bx) continue;  // off the subsampled grid
+        if (sby[c] << cs.VShift(c) != by) continue;  // off the subsampled grid
+        const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c];
+
+        int32_t nzeros =
+            (covered_blocks == 1)
+                ? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c])
+                : NumNonZeroExceptLLF(cx, cy, acs, covered_blocks,
+                                      log2_covered_blocks, block, nzeros_stride,
+                                      row_nzeros[c] + sbx[c]);
+
+        int ord = kStrategyOrder[acs.RawStrategy()];
+        const coeff_order_t* JXL_RESTRICT order =
+            &orders[CoeffOrderOffset(ord, c)];
+
+        int32_t predicted_nzeros =
+            PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32);
+        size_t block_ctx =
+            block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c);
+        const int32_t nzero_ctx =
+            block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx);
+
+        output->emplace_back(nzero_ctx, nzeros);  // token for the nonzero count
+        const size_t histo_offset =
+            block_ctx_map.ZeroDensityContextsOffset(block_ctx);
+        // Skip LLF.
+        size_t prev = (nzeros > static_cast<ssize_t>(size / 16) ? 0 : 1);
+        for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
+          int32_t coeff = block[order[k]];
+          size_t ctx =
+              histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
+                                                log2_covered_blocks, prev);
+          uint32_t u_coeff = PackSigned(coeff);
+          output->emplace_back(ctx, u_coeff);
+          prev = coeff != 0;
+          nzeros -= prev;  // stop once every nonzero coefficient was emitted
+        }
+        JXL_DASSERT(nzeros == 0);
+        offset[c] += size;
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(TokenizeCoefficients);  // per-target dispatch table
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
+                          const Rect& rect,
+                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
+                          const AcStrategyImage& ac_strategy,
+                          YCbCrChromaSubsampling cs,
+                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
+                          std::vector<Token>* JXL_RESTRICT output,
+                          const ImageB& qdc, const ImageI& qf,
+                          const BlockCtxMap& block_ctx_map) {
+  return HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)(  // forwards all arguments
+      orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf,
+      block_ctx_map);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h
new file mode 100644
index 0000000000..7dfc71c726
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ENTROPY_CODER_H_
+#define LIB_JXL_ENC_ENTROPY_CODER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"  // BlockCtxMap
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/frame_header.h"  // YCbCrChromaSubsampling
+#include "lib/jxl/image.h"
+
+// Entropy coding and context modeling of DC and AC coefficients, as well as AC
+// strategy and quantization field.
+
+namespace jxl {
+
+// Generate DCT NxN quantized AC values tokens and append them to `output`.
+// Only the subset "rect" [in units of blocks] within all images.
+// See also DecodeACVarBlock.
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
+                          const Rect& rect,
+                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
+                          const AcStrategyImage& ac_strategy,
+                          YCbCrChromaSubsampling cs,
+                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
+                          std::vector<Token>* JXL_RESTRICT output,
+                          const ImageB& qdc, const ImageI& qf,
+                          const BlockCtxMap& block_ctx_map);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ENTROPY_CODER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc
new file mode 100644
index 0000000000..1408746476
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc
@@ -0,0 +1,183 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_external_image.h"
+
+#include <jxl/types.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/float.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+namespace {
+
+// Size in bytes of a single sample of `data_type`.
+// Yields 0 for any data type that is not handled here.
+size_t JxlDataTypeBytes(JxlDataType data_type) {
+  size_t num_bytes = 0;  // stays 0 for unrecognized types
+  if (data_type == JXL_TYPE_UINT8) {
+    num_bytes = 1;
+  } else if (data_type == JXL_TYPE_UINT16 ||
+             data_type == JXL_TYPE_FLOAT16) {
+    num_bytes = 2;  // both 16-bit sample types
+  } else if (data_type == JXL_TYPE_FLOAT) {
+    num_bytes = 4;
+  }
+  return num_bytes;
+}
+
+}  // namespace
+
+// Converts channel `c` of the interleaved buffer `bytes` into `channel`, which
+// must already be xsize x ysize. Rows are padded to `format.align` bytes.
+// Fails on an unsupported data type / bit depth or a wrongly sized buffer.
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, size_t bits_per_sample,
+                           JxlPixelFormat format, size_t c, ThreadPool* pool,
+                           ImageF* channel) {
+  if (format.data_type == JXL_TYPE_UINT8) {
+    JXL_RETURN_IF_ERROR(bits_per_sample > 0 && bits_per_sample <= 8);
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    JXL_RETURN_IF_ERROR(bits_per_sample > 8 && bits_per_sample <= 16);
+  } else if (format.data_type == JXL_TYPE_FLOAT16) {
+    JXL_RETURN_IF_ERROR(bits_per_sample == 16);
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    JXL_RETURN_IF_ERROR(bits_per_sample == 32);
+  } else {
+    return JXL_FAILURE("unsupported pixel format data type %d",
+                       format.data_type);
+  }
+  if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
+  size_t bytes_per_channel = JxlDataTypeBytes(format.data_type);
+  size_t bytes_per_pixel = format.num_channels * bytes_per_channel;
+  size_t pixel_offset = c * bytes_per_channel;
+  // Only for uint8/16.
+  float scale = 1. / ((1ull << bits_per_sample) - 1);
+
+  const size_t last_row_size = xsize * bytes_per_pixel;
+  const size_t align = format.align;
+  const size_t row_size =
+      (align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
+  // The last row need not be padded to the full stride.
+  const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
+  if (bytes.size() < bytes_to_read) {
+    return JXL_FAILURE("Buffer size is too small, expected: %" PRIuS
+                       " got: %" PRIuS " (Image: %" PRIuS "x%" PRIuS
+                       "x%u, bytes_per_channel: %" PRIuS ")",
+                       bytes_to_read, bytes.size(), xsize, ysize,
+                       format.num_channels, bytes_per_channel);
+  }
+  JXL_ASSERT(channel->xsize() == xsize);
+  JXL_ASSERT(channel->ysize() == ysize);
+  // Too large buffer is likely an application bug, so also fail for that.
+  // Do allow padding to stride in last row though.
+  if (bytes.size() > row_size * ysize) {
+    return JXL_FAILURE("Buffer size is too large");
+  }
+
+  const bool little_endian =
+      format.endianness == JXL_LITTLE_ENDIAN ||
+      (format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
+  const uint8_t* const in = bytes.data();
+  std::atomic<size_t> error_count = {0};  // rows that failed to convert
+
+  const auto convert_row = [&](const uint32_t task, size_t /*thread*/) {
+    const size_t y = task;
+    size_t offset = row_size * task + pixel_offset;
+    float* JXL_RESTRICT row_out = channel->Row(y);
+    const auto save_value = [&](size_t index, float value) {
+      row_out[index] = value;
+    };
+    if (!LoadFloatRow(in + offset, xsize, bytes_per_pixel, format.data_type,
+                      little_endian, scale, save_value)) {
+      error_count++;
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize),
+                                ThreadPool::NoInit, convert_row,
+                                "ConvertExtraChannel"));
+
+  if (error_count) return JXL_FAILURE("unsupported pixel format data type");
+  return true;
+}
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, const ColorEncoding& c_current,
+                           size_t bits_per_sample, JxlPixelFormat format,
+                           ThreadPool* pool, ImageBundle* ib) {
+  const size_t color_channels = c_current.Channels();
+  bool has_alpha = format.num_channels == 2 || format.num_channels == 4;
+  if (format.num_channels < color_channels) {
+    return JXL_FAILURE("Expected %" PRIuS
+                       " color channels, received only %u channels",
+                       color_channels, format.num_channels);
+  }
+
+  Image3F color(xsize, ysize);
+  for (size_t c = 0; c < color_channels; ++c) {  // one plane per color channel
+    JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize,
+                                            bits_per_sample, format, c, pool,
+                                            &color.Plane(c)));
+  }
+  if (color_channels == 1) {  // single color channel: copy it to planes 1, 2
+    CopyImageTo(color.Plane(0), &color.Plane(1));
+    CopyImageTo(color.Plane(0), &color.Plane(2));
+  }
+  ib->SetFromImage(std::move(color), c_current);
+
+  // Alpha is read from the last interleaved channel. If `ib` has no alpha
+  // channel, an alpha channel passed in the buffer is simply discarded.
+  if (has_alpha && ib->HasAlpha()) {
+    ImageF alpha(xsize, ysize);
+    JXL_RETURN_IF_ERROR(
+        ConvertFromExternal(bytes, xsize, ysize, bits_per_sample, format,
+                            format.num_channels - 1, pool, &alpha));
+    ib->SetAlpha(std::move(alpha));
+  } else if (!has_alpha && ib->HasAlpha()) {
+    // if alpha is not passed, but it is expected, then assume
+    // it is all-opaque
+    ImageF alpha(xsize, ysize);
+    FillImage(1.0f, &alpha);
+    ib->SetAlpha(std::move(alpha));
+  }
+
+  return true;
+}
+
+Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
+                      size_t ysize, const void* buffer, size_t size,
+                      ThreadPool* pool, ImageF* channel) {
+  const auto* data = static_cast<const uint8_t*>(buffer);  // raw samples
+  const size_t bits = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
+  return ConvertFromExternal(jxl::Span<const uint8_t>(data, size), xsize, ysize,
+                             bits, pixel_format, 0, pool, channel);
+}
+
+Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
+                           uint32_t ysize, const void* buffer, size_t size,
+                           jxl::ThreadPool* pool,
+                           const jxl::ColorEncoding& c_current,
+                           jxl::ImageBundle* ib) {
+  const auto* data = static_cast<const uint8_t*>(buffer);  // raw samples
+  const size_t bits = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
+  const jxl::Span<const uint8_t> bytes(data, size);
+  JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize, c_current, bits,
+                                          pixel_format, pool, ib));
+  ib->VerifyMetadata();
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image.h b/third_party/jpeg-xl/lib/jxl/enc_external_image.h
new file mode 100644
index 0000000000..3b2b295076
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_EXTERNAL_IMAGE_H_
+#define LIB_JXL_ENC_EXTERNAL_IMAGE_H_
+
+// Interleaved image for color transforms and Codec.
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, size_t bits_per_sample,
+                           JxlPixelFormat format, size_t c, ThreadPool* pool,
+                           ImageF* channel);  // `c`: index of channel to read
+
+// Convert an interleaved pixel buffer to the internal ImageBundle
+// representation. This is the opposite of ConvertToExternal().
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, const ColorEncoding& c_current,
+                           size_t bits_per_sample, JxlPixelFormat format,
+                           ThreadPool* pool, ImageBundle* ib);
+Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
+                      size_t ysize, const void* buffer, size_t size,
+                      ThreadPool* pool, ImageF* channel);  // channel 0 only
+Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
+                           uint32_t ysize, const void* buffer, size_t size,
+                           jxl::ThreadPool* pool,
+                           const jxl::ColorEncoding& c_current,
+                           jxl::ImageBundle* ib);  // `buffer` holds `size` bytes
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_EXTERNAL_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc
new file mode 100644
index 0000000000..4b7147817a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+// Encoder case: deinterleaves a square 8-bit RGBA buffer into an ImageBundle.
+void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) {
+  constexpr size_t kRepeats = 5;      // conversions per benchmark iteration
+  const size_t side = state.range();  // the image is side x side pixels
+
+  ImageMetadata metadata;
+  metadata.SetAlphaBits(8);
+  ImageBundle bundle(&metadata);
+
+  const JxlPixelFormat rgba8 = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+  const std::vector<uint8_t> pixels(side * side * 4);
+  const Span<const uint8_t> bytes(pixels.data(), pixels.size());
+
+  for (auto _ : state) {
+    for (size_t rep = 0; rep < kRepeats; ++rep) {
+      JXL_CHECK(ConvertFromExternal(bytes, /*xsize=*/side, /*ysize=*/side,
+                                    /*c_current=*/ColorEncoding::SRGB(),
+                                    /*bits_per_sample=*/8, rgba8,
+                                    /*pool=*/nullptr, &bundle));
+    }
+  }
+
+  // Throughput, in pixels and in input bytes.
+  const auto conversions = kRepeats * state.iterations();
+  state.SetItemsProcessed(conversions * side * side);
+  state.SetBytesProcessed(conversions * pixels.size());
+}
+
+BENCHMARK(BM_EncExternalImage_ConvertImageRGBA)
+    ->RangeMultiplier(2)
+    ->Range(256, 2048);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc
new file mode 100644
index 0000000000..7be8d45f2d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc
@@ -0,0 +1,79 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_external_image.h"
+
+#include <array>
+#include <new>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+#if !defined(JXL_CRASH_ON_ERROR)
+TEST(ExternalImageTest, InvalidSize) {
+  ImageMetadata metadata;
+  metadata.SetAlphaBits(8);
+  ImageBundle bundle(&metadata);
+
+  // 10x100 pixels, four 16-bit channels each: exactly 8000 bytes are needed.
+  constexpr size_t kXSize = 10;
+  constexpr size_t kYSize = 100;
+  const uint8_t pixels[kXSize * kYSize * 8] = {};
+  const JxlPixelFormat rgba16 = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  const auto convert = [&](size_t num_bytes) {
+    return ConvertFromExternal(Span<const uint8_t>(pixels, num_bytes), kXSize,
+                               kYSize, /*c_current=*/ColorEncoding::SRGB(),
+                               /*bits_per_sample=*/16, rgba16, nullptr,
+                               &bundle);
+  };
+  EXPECT_FALSE(convert(10));                  // far too small
+  EXPECT_FALSE(convert(sizeof(pixels) - 1));  // one byte short
+  EXPECT_TRUE(convert(sizeof(pixels)));       // exact fit
+}
+#endif
+
+TEST(ExternalImageTest, AlphaMissing) {
+  ImageMetadata metadata;
+  metadata.SetAlphaBits(0);  // The bundle declares no alpha channel.
+  ImageBundle bundle(&metadata);
+
+  constexpr size_t kXSize = 10;
+  constexpr size_t kYSize = 20;
+  const uint8_t pixels[kXSize * kYSize * 4] = {};
+  const Span<const uint8_t> bytes(pixels, sizeof(pixels));
+  // The buffer carries four channels, i.e. it includes alpha, while the bundle
+  // has none: the conversion must succeed and simply drop the extra channel.
+  const JxlPixelFormat rgba8 = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
+  EXPECT_TRUE(ConvertFromExternal(bytes, kXSize, kYSize,
+                                  /*c_current=*/ColorEncoding::SRGB(),
+                                  /*bits_per_sample=*/8, rgba8, nullptr,
+                                  &bundle));
+  EXPECT_FALSE(bundle.HasAlpha());
+}
+
+TEST(ExternalImageTest, AlphaPremultiplied) {
+  ImageMetadata metadata;
+  metadata.SetAlphaBits(8, true);  // premultiplied, going by the test name
+  ImageBundle bundle(&metadata);
+
+  // 10x20 pixels with four 16-bit channels, i.e. 8 bytes per pixel.
+  constexpr size_t kXSize = 10;
+  constexpr size_t kYSize = 20;
+  constexpr size_t kNumBytes = kXSize * kYSize * 8;
+  const uint8_t pixels[kNumBytes] = {};
+  const JxlPixelFormat rgba16 = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  EXPECT_TRUE(BufferToImageBundle(rgba16, kXSize, kYSize, pixels, kNumBytes,
+                                  nullptr, ColorEncoding::SRGB(), &bundle));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
new file mode 100644
index 0000000000..286990ee8a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
@@ -0,0 +1,3860 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef FJXL_SELF_INCLUDE
+
+#include "lib/jxl/enc_fast_lossless.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+#include <vector>
+
+// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
+// support it.
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+
+#ifndef FJXL_ENABLE_NEON
+#define FJXL_ENABLE_NEON 1
+#endif
+
+#elif (defined(__x86_64__) || defined(_M_X64)) && !defined(_MSC_VER)
+#include <immintrin.h>
+
+// manually add _mm512_cvtsi512_si32 definition if missing
+// (e.g. with Xcode on macOS Mojave)
+// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
+#if defined(__clang__) &&                                           \
+    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
+     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
+inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsi512_si32(__m512i __A) {
+  __v16si __B = (__v16si)__A;
+  return __B[0];  // element 0 of the vector
+}
+#endif
+
+// TODO(veluca): MSVC support for dynamic dispatch.
+#if defined(__clang__) || defined(__GNUC__)
+
+#ifndef FJXL_ENABLE_AVX2
+#define FJXL_ENABLE_AVX2 1
+#endif
+
+#ifndef FJXL_ENABLE_AVX512
+// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
+#if (defined(__clang__) &&                                             \
+         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
+     (defined(__apple_build_version__) &&                              \
+      __apple_build_version__ > 10010046)) ||                          \
+    (defined(__GNUC__) && __GNUC__ > 10)
+#define FJXL_ENABLE_AVX512 1
+#endif
+#endif
+
+#endif
+
+#endif
+
+#ifndef FJXL_ENABLE_NEON
+#define FJXL_ENABLE_NEON 0
+#endif
+
+#ifndef FJXL_ENABLE_AVX2
+#define FJXL_ENABLE_AVX2 0
+#endif
+
+#ifndef FJXL_ENABLE_AVX512
+#define FJXL_ENABLE_AVX512 0
+#endif
+
+namespace {
+#if defined(_MSC_VER) && !defined(__clang__)
+#define FJXL_INLINE __forceinline
+FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
+  unsigned long index;
+  return _BitScanReverse(&index, v) ? index : 0;  // 0 for v == 0, as below
+}
+FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
+  unsigned long index;  // _BitScanForward scans 32 bits: do each half in turn
+  if (_BitScanForward(&index, static_cast<unsigned long>(v))) return index;
+  _BitScanForward(&index, static_cast<unsigned long>(v >> 32));
+  return index + 32;
+}
+#else  // GCC/Clang builtins; CtzNonZero requires v != 0
+#define FJXL_INLINE inline __attribute__((always_inline))
+FJXL_INLINE uint32_t FloorLog2(uint32_t v) {  // floor(log2(v)); 0 if v == 0
+  return v ? 31 - __builtin_clz(v) : 0;
+}
+FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { return __builtin_ctzll(v); }
+#endif
+
+// Stores `data` into tgt[0..7], least significant byte first.
+FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+  memcpy(tgt, &data, sizeof(data));  // memory layout is already little-endian
+#else
+  for (size_t byte = 0; byte < sizeof(data); byte++, data >>= 8) {
+    tgt[byte] = static_cast<uint8_t>(data & 0xFF);
+  }
+#endif
+}
+
+FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
+                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
+  // Queue `bits` above the pending ones, store 8 bytes, keep the partial byte.
+  bit_buffer |= bits << bits_in_buffer;
+  StoreLE64(data_buf, bit_buffer);
+  const size_t total_bits = bits_in_buffer + count;
+  bits_in_buffer = total_bits % 8;
+  bit_buffer >>= total_bits - bits_in_buffer;
+  return total_bits / 8;  // whole bytes the caller may advance by
+}
+
+// Bit packer: fields are appended least-significant bit first.
+struct BitWriter {
+  void Allocate(size_t maximum_bit_size) {
+    assert(data == nullptr);
+    // Leave some padding: every write stores a full 8-byte word.
+    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
+  }
+
+  void Write(uint32_t count, uint64_t bits) {
+    bytes_written += AddBits(count, bits, data.get() + bytes_written,
+                             bits_in_buffer, buffer);
+  }
+
+  void ZeroPadToByte() {
+    if (bits_in_buffer != 0) {
+      Write(8 - bits_in_buffer, 0);
+    }
+  }
+
+  // Appends n values, bits[i] being nbits[i] bits wide.
+  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
+                                 size_t n) {
+    // Necessary because Write() is only guaranteed to work with <=56 bits.
+    // Trying to SIMD-fy this code results in lower speed (and definitely less
+    // clarity).
+    for (size_t i = 0; i < n; i++) {
+      buffer |= bits[i] << bits_in_buffer;
+      // StoreLE64, not memcpy, so that the bytes match Write() on any host.
+      StoreLE64(data.get() + bytes_written, buffer);
+      uint64_t shift = 64 - bits_in_buffer;
+      bits_in_buffer += nbits[i];
+      // This `if` seems to be faster than using ternaries.
+      if (bits_in_buffer >= 64) {
+        buffer = bits[i] >> shift;  // the part that did not fit
+        bits_in_buffer -= 64;
+        bytes_written += 8;
+      }
+    }
+    StoreLE64(data.get() + bytes_written, buffer);
+    size_t bytes_in_buffer = bits_in_buffer / 8;
+    bits_in_buffer -= bytes_in_buffer * 8;
+    buffer >>= bytes_in_buffer * 8;
+    bytes_written += bytes_in_buffer;
+  }
+
+  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
+  size_t bytes_written = 0;   // complete bytes already stored in `data`
+  size_t bits_in_buffer = 0;  // number of pending bits held in `buffer`
+  uint64_t buffer = 0;
+};
+
+}  // namespace
+
+extern "C" {
+
+struct JxlFastLosslessFrameState {
+  size_t width;
+  size_t height;
+  size_t nb_chans;   // 2 or 4 means an alpha channel is present
+  size_t bitdepth;   // bits per sample written to the image header
+  BitWriter header;  // filled in by JxlFastLosslessPrepareHeader
+  std::vector<std::array<BitWriter, 4>> group_data;  // [group][channel]
+  size_t current_bit_writer = 0;   // 0: header; k > 0: group writer k - 1
+  size_t bit_writer_byte_pos = 0;  // bytes of that writer already output
+  size_t bits_in_buffer = 0;       // bits pending in bit_buffer, below
+  uint64_t bit_buffer = 0;         // output bits not yet written as bytes
+};
+
+size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
+  // Header bytes, plus for each group the bits of its channel writers rounded
+  // up to a whole number of bytes.
+  size_t total_bytes = frame->header.bytes_written;
+  for (const auto& group : frame->group_data) {
+    size_t group_bits = 0;
+    for (size_t c = 0; c < frame->nb_chans; c++) {
+      group_bits += group[c].bytes_written * 8 + group[c].bits_in_buffer;
+    }
+    total_bytes += (group_bits + 7) / 8;
+  }
+  return total_bytes;
+}
+
+size_t JxlFastLosslessMaxRequiredOutput(
+    const JxlFastLosslessFrameState* frame) {
+  return 32 + JxlFastLosslessOutputSize(frame);  // exact size + 32 spare bytes
+}
+
+void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
+                                  int add_image_header, int is_last) {
+  BitWriter* output = &frame->header;  // every bit below lands in the header
+  output->Allocate(1000 + frame->group_data.size() * 32);  // size in bits
+
+  std::vector<size_t> group_sizes(frame->group_data.size());  // in bytes
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = 0;
+    for (size_t j = 0; j < frame->nb_chans; j++) {
+      const auto& writer = frame->group_data[i][j];
+      sz += writer.bytes_written * 8 + writer.bits_in_buffer;
+    }
+    sz = (sz + 7) / 8;
+    group_sizes[i] = sz;
+  }
+
+  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);
+
+  if (add_image_header) {
+    // Signature
+    output->Write(16, 0x0AFF);
+
+    // Size header, hand-crafted.
+    // Not small
+    output->Write(1, 0);
+
+    auto wsz = [output](size_t size) {  // 2-bit selector, then size - 1
+      if (size - 1 < (1 << 9)) {
+        output->Write(2, 0b00);
+        output->Write(9, size - 1);
+      } else if (size - 1 < (1 << 13)) {
+        output->Write(2, 0b01);
+        output->Write(13, size - 1);
+      } else if (size - 1 < (1 << 18)) {
+        output->Write(2, 0b10);
+        output->Write(18, size - 1);
+      } else {
+        output->Write(2, 0b11);
+        output->Write(30, size - 1);
+      }
+    };
+
+    wsz(frame->height);
+
+    // No special ratio.
+    output->Write(3, 0);
+
+    wsz(frame->width);
+
+    // Hand-crafted ImageMetadata.
+    output->Write(1, 0);  // all_default
+    output->Write(1, 0);  // extra_fields
+    output->Write(1, 0);  // bit_depth.floating_point_sample
+    if (frame->bitdepth == 8) {
+      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
+    } else if (frame->bitdepth == 10) {
+      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
+    } else if (frame->bitdepth == 12) {
+      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
+    } else {
+      output->Write(2, 0b11);  // 1 + u(6)
+      output->Write(6, frame->bitdepth - 1);
+    }
+    if (frame->bitdepth <= 14) {
+      output->Write(1, 1);  // 16-bit-buffer sufficient
+    } else {
+      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
+    }
+    if (have_alpha) {
+      output->Write(2, 0b01);  // One extra channel
+      output->Write(1, 1);  // all_default: 8-bit alpha (TODO: bitdepth != 8?)
+    } else {
+      output->Write(2, 0b00);  // No extra channel
+    }
+    output->Write(1, 0);  // Not XYB
+    if (frame->nb_chans > 2) {
+      output->Write(1, 1);  // color_encoding.all_default (sRGB)
+    } else {
+      output->Write(1, 0);     // color_encoding.all_default false
+      output->Write(1, 0);     // color_encoding.want_icc false
+      output->Write(2, 1);     // grayscale
+      output->Write(2, 1);     // D65
+      output->Write(1, 0);     // no gamma transfer function
+      output->Write(2, 0b10);  // tf: 2 + u(4)
+      output->Write(4, 11);    // tf of sRGB
+      output->Write(2, 1);     // relative rendering intent
+    }
+    output->Write(2, 0b00);  // No extensions.
+
+    output->Write(1, 1);  // all_default transform data
+
+    // No ICC, no preview. Frame should start at byte boundary.
+    output->ZeroPadToByte();
+  }
+
+  // Handcrafted frame header.
+  output->Write(1, 0);     // all_default
+  output->Write(2, 0b00);  // regular frame
+  output->Write(1, 1);     // modular
+  output->Write(2, 0b00);  // default flags
+  output->Write(1, 0);     // not YCbCr
+  output->Write(2, 0b00);  // no upsampling
+  if (have_alpha) {
+    output->Write(2, 0b00);  // no alpha upsampling
+  }
+  output->Write(2, 0b01);  // default group size
+  output->Write(2, 0b00);  // exactly one pass
+  output->Write(1, 0);     // no custom size or origin
+  output->Write(2, 0b00);  // kReplace blending mode
+  if (have_alpha) {
+    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
+  }
+  output->Write(1, is_last);  // is_last
+  output->Write(2, 0b00);     // a frame has no name
+  output->Write(1, 0);        // loop filter is not all_default
+  output->Write(1, 0);        // no gaborish
+  output->Write(2, 0);        // 0 EPF iters
+  output->Write(2, 0b00);     // No LF extensions
+  output->Write(2, 0b00);     // No FH extensions
+
+  output->Write(1, 0);      // No TOC permutation
+  output->ZeroPadToByte();  // TOC is byte-aligned.
+  for (size_t i = 0; i < frame->group_data.size(); i++) {  // TOC entries
+    size_t sz = group_sizes[i];
+    if (sz < (1 << 10)) {
+      output->Write(2, 0b00);
+      output->Write(10, sz);
+    } else if (sz - 1024 < (1 << 14)) {
+      output->Write(2, 0b01);
+      output->Write(14, sz - 1024);
+    } else if (sz - 17408 < (1 << 22)) {
+      output->Write(2, 0b10);
+      output->Write(22, sz - 17408);
+    } else {
+      output->Write(2, 0b11);
+      output->Write(30, sz - 4211712);
+    }
+  }
+  output->ZeroPadToByte();  // Groups are byte-aligned.
+}
+
+#if FJXL_ENABLE_AVX512
+__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
+    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
+    unsigned char* output, uint64_t& bit_buffer) {
+  if (n < 128) {
+    return 0;  // nothing consumed; the caller's scalar loops handle the input
+  }
+
+  size_t i = 0;
+  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
+  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));
+
+  for (; i + 64 <= n; i += 64) {  // 64 input bytes per iteration
+    __m512i current = _mm512_loadu_si512(data + i);
+    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
+    carry = current;
+    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
+    _mm512_storeu_si512(output + i, out);
+  }
+
+  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);  // bits still pending
+
+  return i;  // bytes consumed, a multiple of 64
+}
+#endif
+
+size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
+                                  unsigned char* output, size_t output_size) {
+  assert(output_size >= 32);
+  unsigned char* initial_output = output;  // used for the returned byte count
+  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
+                                         unsigned char*, uint64_t&) = nullptr;
+
+#if FJXL_ENABLE_AVX512
+  if (__builtin_cpu_supports("avx512vbmi2")) {
+    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
+  }
+#endif
+
+  while (true) {
+    size_t& cur = frame->current_bit_writer;
+    size_t& bw_pos = frame->bit_writer_byte_pos;
+    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {  // all done
+      return output - initial_output;
+    }
+    if (output_size <= 8) {  // keep room for the 8-byte stores below
+      return output - initial_output;
+    }
+    size_t nbc = frame->nb_chans;
+    const BitWriter& writer =
+        cur == 0 ? frame->header
+                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
+    size_t full_byte_count =
+        std::min(output_size - 8, writer.bytes_written - bw_pos);
+    if (frame->bits_in_buffer == 0) {  // byte-aligned: plain copy
+      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
+    } else {
+      size_t i = 0;
+      if (append_bytes_with_bit_offset) {
+        i += append_bytes_with_bit_offset(
+            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
+            output, frame->bit_buffer);
+      }
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      // Copy 8 bytes at a time until we reach the border.
+      for (; i + 8 < full_byte_count; i += 8) {
+        uint64_t chunk;
+        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
+        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
+        memcpy(output + i, &out, 8);
+        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
+      }
+#endif
+      for (; i < full_byte_count; i++) {  // remaining bytes, one at a time
+        AddBits(8, writer.data.get()[bw_pos + i], output + i,
+                frame->bits_in_buffer, frame->bit_buffer);
+      }
+    }
+    output += full_byte_count;
+    output_size -= full_byte_count;
+    bw_pos += full_byte_count;
+    if (bw_pos == writer.bytes_written) {  // writer exhausted: flush its tail
+      auto write = [&](size_t num, uint64_t bits) {
+        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
+                           frame->bit_buffer);
+        output += n;
+        output_size -= n;
+      };
+      if (writer.bits_in_buffer) {
+        write(writer.bits_in_buffer, writer.buffer);
+      }
+      bw_pos = 0;
+      cur++;
+      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
+        write(8 - frame->bits_in_buffer, 0);  // groups start byte-aligned
+      }
+    }
+  }
+}
+
+void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
+  delete frame;  // releases the state and, through it, its BitWriter buffers
+}
+
+}  // extern "C"
+
+#endif
+
+#ifdef FJXL_SELF_INCLUDE
+
+namespace {
+
+constexpr size_t kNumRawSymbols = 19;
+constexpr size_t kNumLZ77 = 33;
+constexpr size_t kLZ77CacheSize = 32;
+
+constexpr size_t kLZ77Offset = 224;
+constexpr size_t kLZ77MinLength = 7;
+
+void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
+                          uint32_t* bits) {
+  // 400 config: values below 16 are their own token and carry no extra bits.
+  const bool direct = value < 16;
+  const uint32_t floor_log2 = FloorLog2(value);
+  *token = direct ? value : floor_log2 + 12;
+  *nbits = direct ? 0 : floor_log2;
+  *bits = direct ? 0 : value - (1 << *nbits);
+}
+
+struct PrefixCode {
+  uint8_t raw_nbits[kNumRawSymbols] = {};
+  uint8_t raw_bits[kNumRawSymbols] = {};
+
+  alignas(64) uint8_t raw_nbits_simd[16] = {};
+  alignas(64) uint8_t raw_bits_simd[16] = {};
+
+  uint8_t lz77_nbits[kNumLZ77] = {};
+  uint16_t lz77_bits[kNumLZ77] = {};
+
+  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
+  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
+
+  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
+    // Mirror all 16 bits first: bit i moves to position 15 - i ...
+    uint16_t mirrored = 0;
+    for (size_t i = 0; i < 16; i++) {
+      if ((bits >> i) & 1) {
+        mirrored |= static_cast<uint16_t>(0x8000 >> i);
+      }
+    }
+    // ... then drop the 16 - nbits low positions, which came from high bits.
+    return mirrored >> (16 - nbits);
+  }
+
+  // Create the prefix codes given the code lengths.
+  // Supports the code lengths being split into two halves: lengths must be in
+  // 1..8 in the first (8-bit codes) and at most 15 in the second.
+  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
+                                   uint8_t* first_chunk_bits,
+                                   size_t first_chunk_size,
+                                   const uint8_t* second_chunk_nbits,
+                                   uint16_t* second_chunk_bits,
+                                   size_t second_chunk_size) {
+    constexpr size_t kMaxCodeLength = 15;
+    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
+    for (size_t i = 0; i < first_chunk_size; i++) {
+      assert(first_chunk_nbits[i] > 0);  // checked before use as an index
+      assert(first_chunk_nbits[i] <= 8);
+      code_length_counts[first_chunk_nbits[i]]++;
+    }
+    for (size_t i = 0; i < second_chunk_size; i++) {
+      assert(second_chunk_nbits[i] <= kMaxCodeLength);
+      code_length_counts[second_chunk_nbits[i]]++;
+    }
+
+    // next_code[i]: first code of length i (bits >= i are cut by BitReverse).
+    uint16_t next_code[kMaxCodeLength + 1] = {};
+    uint16_t code = 0;
+    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
+      code = (code + code_length_counts[i - 1]) << 1;
+      next_code[i] = code;
+    }
+
+    for (size_t i = 0; i < first_chunk_size; i++) {
+      first_chunk_bits[i] =
+          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
+    }
+    for (size_t i = 0; i < second_chunk_size; i++) {
+      second_chunk_bits[i] =
+          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
+    }
+  }
+
+  template <typename T>
+  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
+                                            size_t precision, T infty,
+                                            uint8_t* min_limit,
+                                            uint8_t* max_limit,
+                                            uint8_t* nbits) {
+    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
+    auto d = [&](size_t sym, size_t off) -> T& {  // cell (sym, off) of dynp
+      return dynp[sym * ((1 << precision) + 1) + off];
+    };
+    d(0, 0) = 0;  // d(s, w): least cost of s symbols with total weight w
+    for (size_t sym = 0; sym < n; sym++) {
+      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
+        size_t off_delta = 1U << (precision - bits);  // weight of this length
+        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
+          d(sym + 1, off + off_delta) =
+              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
+                       d(sym + 1, off + off_delta));
+        }
+      }
+    }
+
+    size_t sym = n;
+    size_t off = 1U << precision;
+
+    assert(d(sym, off) != infty);  // fails if no assignment fills the budget
+
+    while (sym-- > 0) {  // walk back, recovering one length per symbol
+      assert(off > 0);
+      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
+        size_t off_delta = 1U << (precision - bits);
+        if (off_delta <= off &&
+            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
+          off -= off_delta;
+          nbits[sym] = bits;
+          break;
+        }
+      }
+    }
+  }
+
+  // Finds code lengths nbits[i], for i < n, that minimize the total cost
+  // sum(nbits[i] * freqs[i]) while respecting min_limit[i] <= nbits[i] <=
+  // max_limit[i] and the equality sum(2**-nbits[i]) == 1.
+  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
+                                        uint8_t* min_limit, uint8_t* max_limit,
+                                        uint8_t* nbits) {
+    uint64_t total_freq = 0;
+    size_t longest_allowed = 0;
+    size_t shortest_allowed = 255;
+    for (size_t sym = 0; sym < n; sym++) {
+      assert(freqs[sym] != 0);
+      total_freq += freqs[sym];
+      min_limit[sym] = std::max<uint8_t>(min_limit[sym], 1);
+      assert(min_limit[sym] <= max_limit[sym]);
+      longest_allowed = std::max<size_t>(longest_allowed, max_limit[sym]);
+      shortest_allowed = std::min<size_t>(shortest_allowed, min_limit[sym]);
+    }
+    // If all the minimum limits are greater than 1, shift precision so that we
+    // behave as if the shortest was 1.
+    const size_t precision = longest_allowed - (shortest_allowed - 1);
+    const uint64_t infty = total_freq * precision;
+    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
+      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
+                                    static_cast<uint32_t>(infty), min_limit,
+                                    max_limit, nbits);
+    } else {
+      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
+                                    max_limit, nbits);
+    }
+  }
+
+  static constexpr size_t kMaxNumSymbols =
+      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
+  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
+                                 const uint8_t* min_limit_in,
+                                 const uint8_t* max_limit_in, uint8_t* nbits) {
+    // Symbols that never occur get length 0; the remaining ones are compacted,
+    // solved by ComputeCodeLengthsNonZero and scattered back.
+    assert(n <= kMaxNumSymbols);
+    uint64_t compact_freqs[kMaxNumSymbols];
+    uint8_t min_limit[kMaxNumSymbols];
+    uint8_t max_limit[kMaxNumSymbols];
+    size_t ni = 0;
+    for (size_t i = 0; i < n; i++) {
+      if (freqs[i] == 0) continue;
+      compact_freqs[ni] = freqs[i];
+      min_limit[ni] = min_limit_in[i];
+      max_limit[ni] = max_limit_in[i];
+      ni++;
+    }
+    uint8_t num_bits[kMaxNumSymbols] = {};
+    // With no used symbol there is nothing to solve, and the solver's
+    // precision computation would underflow; all lengths simply stay 0.
+    if (ni != 0) {
+      ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
+                                num_bits);
+    }
+    ni = 0;
+    for (size_t i = 0; i < n; i++) nbits[i] = freqs[i] ? num_bits[ni++] : 0;
+  }
+
+  // Invalid code, used to construct arrays.
+  PrefixCode() {}
+
+  template <typename BitDepth>
+  PrefixCode(BitDepth, uint64_t* raw_counts, uint64_t* lz77_counts) {
+    // "merge" together all the lz77 counts in a single symbol for the level 1
+    // table (containing just the raw symbols, up to length 7).
+    uint64_t level1_counts[kNumRawSymbols + 1];
+    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
+    size_t numraw = kNumRawSymbols;
+    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;  // trim
+
+    level1_counts[numraw] = 0;
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      level1_counts[numraw] += lz77_counts[i];
+    }
+    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
+    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
+                       BitDepth::kMaxRawLength, level1_nbits);
+
+    uint8_t level2_nbits[kNumLZ77] = {};
+    uint8_t min_lengths[kNumLZ77] = {};
+    uint8_t l = 15 - level1_nbits[numraw];  // keeps combined lengths <= 15
+    uint8_t max_lengths[kNumLZ77];
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      max_lengths[i] = l;
+    }
+    size_t num_lz77 = kNumLZ77;
+    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
+    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
+                       level2_nbits);
+    for (size_t i = 0; i < numraw; i++) {
+      raw_nbits[i] = level1_nbits[i];
+    }
+    for (size_t i = 0; i < num_lz77; i++) {
+      lz77_nbits[i] =
+          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
+    }
+
+    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
+                         kNumLZ77);
+    BitDepth::PrepareForSimd(raw_nbits, raw_bits, numraw, raw_nbits_simd,
+                             raw_bits_simd);
+
+    // Prepare lz77 cache: raw symbol 0, then the LZ77 token and its extra bits.
+    for (size_t count = 0; count < kLZ77CacheSize; count++) {
+      unsigned token, nbits, bits;
+      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
+      lz77_cache_bits[count] =
+          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
+          raw_bits[0];
+    }
+  }
+
+  void WriteTo(BitWriter* writer) const {
+    // Emits this code's lengths as a Brotli-style complex prefix code header.
+    uint64_t code_length_counts[18] = {};
+    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      code_length_counts[raw_nbits[i]]++;
+    }
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      code_length_counts[lz77_nbits[i]]++;
+    }
+    uint8_t code_length_nbits[18] = {};
+    uint8_t code_length_nbits_min[18] = {};
+    uint8_t code_length_nbits_max[18] = {
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    };
+    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
+                       code_length_nbits_max, code_length_nbits);
+    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
+
+    // As per Brotli RFC.
+    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
+                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
+    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
+    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
+
+    // Encode lengths of code lengths.
+    size_t num_code_lengths = 18;
+    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
+      num_code_lengths--;
+    }
+    for (size_t i = 0; i < num_code_lengths; i++) {
+      int symbol = code_length_nbits[code_length_order[i]];
+      writer->Write(code_length_length_nbits[symbol],
+                    code_length_length_bits[symbol]);
+    }
+
+    // Compute the canonical codes for the codes that represent the lengths of
+    // the actual codes for data.
+    uint16_t code_length_bits[18] = {};
+    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
+                         code_length_bits, 18);
+    // Encode raw bit code lengths.
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      writer->Write(code_length_nbits[raw_nbits[i]],
+                    code_length_bits[raw_nbits[i]]);
+    }
+    size_t num_lz77 = kNumLZ77;
+    while (num_lz77 > 0 && lz77_nbits[num_lz77 - 1] == 0) {
+      num_lz77--;
+    }
+    // Encode 0s until 224 (start of LZ77 symbols): 224 - 19 = 205 in total.
+    static_assert(kLZ77Offset == 224, "");
+    static_assert(kNumRawSymbols == 19, "");
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b010);  // 5
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
+    // Encode LZ77 symbols, with values 224+i.
+    for (size_t i = 0; i < num_lz77; i++) {
+      writer->Write(code_length_nbits[lz77_nbits[i]],
+                    code_length_bits[lz77_nbits[i]]);
+    }
+  }
+};
+
+template <typename T>  // Two vectors of the same type T, returned together by value.
+struct VecPair {
+  T low;  // First vector of the pair.
+  T hi;  // Second vector of the pair.
+};
+
+#ifdef FJXL_GENERIC_SIMD
+#undef FJXL_GENERIC_SIMD
+#endif
+
+#ifdef FJXL_AVX512
+#define FJXL_GENERIC_SIMD
+struct SIMDVec32;
+struct Mask32 {  // Predicate for SIMDVec32 (AVX-512): one mask bit per 32-bit lane.
+  __mmask16 mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);  // Defined out of line, once SIMDVec32 is complete.
+  size_t CountPrefix() const {  // Presumably the number of consecutive set lanes starting at lane 0 (CtzNonZero is defined elsewhere).
+    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});  // Widened to 64 bits so the inverted value always has a set bit.
+  }
+};
+
+struct SIMDVec32 {  // Vector of 16 x 32-bit lanes held in one AVX-512 register.
+  __m512i vec;
+
+  static constexpr size_t kLanes = 16;  // 512 bits / 32 bits per lane.
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {  // Unaligned load of kLanes values.
+    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) {  // Unaligned store of kLanes values.
+    _mm512_storeu_si512((__m512i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {  // Broadcasts v to every lane.
+    return SIMDVec32{_mm512_set1_epi32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {  // Per lane: 32 - lzcnt(x), i.e. the bit width of x (0 for x == 0).
+    return SIMDVec32{
+        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {  // Unsigned saturating subtract: max(a, b) - b.
+    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
+                                      to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {  // Wrapping 32-bit subtract.
+    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {  // Wrapping 32-bit add.
+    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {  // Bitwise XOR.
+    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {  // Per-lane equality.
+    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {  // Per-lane *signed* this > oth.
+    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {  // Per lane: 1 << x.
+    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {  // Arithmetic (sign-propagating) right shift by i bits.
+    return SIMDVec32{_mm512_srai_epi32(vec, i)};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {  // Predicate for SIMDVec16 (AVX-512): one mask bit per 16-bit lane.
+  __mmask32 mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);  // Defined out of line, once SIMDVec16 is complete.
+  Mask16 And(const Mask16& oth) const {  // Lane-wise logical AND of two masks.
+    return Mask16{_kand_mask32(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {  // Presumably the number of consecutive set lanes starting at lane 0 (CtzNonZero is defined elsewhere).
+    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});  // Widened to 64 bits so the inverted value always has a set bit.
+  }
+};
+
+struct SIMDVec16 {  // Vector of 32 x 16-bit lanes held in one AVX-512 register, plus pixel loaders.
+  __m512i vec;
+
+  static constexpr size_t kLanes = 32;  // 512 bits / 16 bits per lane.
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {  // Unaligned load of kLanes values.
+    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) {  // Unaligned store of kLanes values.
+    _mm512_storeu_si512((__m512i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {  // Broadcasts v to every lane.
+    return SIMDVec16{_mm512_set1_epi16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,  // Narrows two 32-bit vectors into one 16-bit vector, lo's lanes first.
+                                         const SIMDVec32& hi) {
+    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);  // Unsigned-saturating pack; works per 128-bit block, so lo/hi are interleaved.
+    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // Gathers lo's 64-bit chunks first, then hi's.
+    return SIMDVec16{
+        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {  // Per 16-bit lane: bit width of the value (0 for 0), built from 32-bit lzcnt.
+    auto c16 = _mm512_set1_epi32(16);
+    auto c32 = _mm512_set1_epi32(32);
+    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
+    auto lzhi =  // Bit width of the upper 16-bit half of each 32-bit lane.
+        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
+    auto lzlo = _mm512_sub_epi32(  // Bit width of the lower 16-bit half of each 32-bit lane.
+        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
+    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};  // Recombine both halves into 16-bit lanes.
+  }
+
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {  // Unsigned saturating subtract.
+    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {  // Wrapping 16-bit subtract.
+    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {  // Wrapping 16-bit add.
+    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {  // Per-lane *unsigned* minimum.
+    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {  // Per-lane equality.
+    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {  // Per-lane *signed* this > oth.
+    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {  // Per lane: 1 << x.
+    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {  // Bitwise OR.
+    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {  // Bitwise XOR.
+    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {  // Bitwise AND.
+    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {  // Per lane: (this + oth) >> 1, using a wrapping add and an arithmetic shift.
+    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {  // Sets the high byte of each lane to 0xFF so the byte shuffle in U8Lookup zeroes it.
+    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {  // Byte-wise lookup into a 16-byte table (reads 16 bytes from `table`).
+    return SIMDVec16{_mm512_shuffle_epi8(
+        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};  // Table is replicated into every 128-bit block.
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {  // Zips lanes as low[0], this[0], low[1], this[1], ...
+    auto lo = _mm512_unpacklo_epi16(low.vec, vec);  // Unpacks operate within each 128-bit block...
+    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
+    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};  // ...so these permutes restore sequential order: first half of the result,
+    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};  // and second half of the result.
+    return {SIMDVec16{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm1), hi)},
+            SIMDVec16{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {  // Zero-extends the 32 lanes to 32 bits: {lanes 0..15, lanes 16..31}.
+    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());  // Interleaving with zero is the zero-extension.
+    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
+    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};  // Same block reordering as in Interleave().
+    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+    return {SIMDVec32{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm1), hi)},
+            SIMDVec32{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {  // Arithmetic (sign-propagating) right shift by i bits.
+    return SIMDVec16{_mm512_srai_epi16(vec, i)};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {  // 32 one-byte samples (reads 32 bytes), zero-extended to 16 bits.
+    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {  // 32 two-byte samples (reads 64 bytes), no byte swapping.
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {  // 32 interleaved 2 x 8-bit pixels (reads 64 bytes) -> {first, second} channel.
+    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
+    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));  // Low byte of each 16-bit pair.
+    __m512i alpha = _mm512_srli_epi16(bytes, 8);  // High byte of each 16-bit pair.
+    return {SIMDVec16{gray}, SIMDVec16{alpha}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {  // 32 interleaved 2 x 16-bit pixels (reads 128 bytes) -> {first, second} channel.
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
+    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);  // Undoes the per-128-bit-block interleaving of packus.
+    __m512i g = _mm512_permutexvar_epi64(  // Low 16 bits of every 32-bit pixel.
+        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
+                                        _mm512_and_si512(bytes2, g_mask)));
+    __m512i a = _mm512_permutexvar_epi64(  // High 16 bits of every 32-bit pixel.
+        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
+                                        _mm512_srli_epi32(bytes2, 16)));
+    return {SIMDVec16{g}, SIMDVec16{a}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {  // 32 interleaved 3 x 8-bit pixels (reads 96 bytes) -> {R, G, B}.
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 =  // Only 32 more bytes are read; the upper half of this register is zero.
+        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
+
+    // 0x7A = element of upper half of second vector = 0 after lookup; still in
+    // the upper half once we add 1 or 2.
+    uint8_t z = 0x7A;
+    __m512i ridx =  // Per 16-bit lane i: low byte picks source byte 3*i, high byte picks a zero byte.
+        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
+                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
+                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
+                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
+    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));  // Source byte 3*i + 1.
+    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));  // Source byte 3*i + 2.
+    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
+    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
+    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {  // 32 interleaved 3 x 16-bit pixels (reads 192 bytes) -> {R, G, B}.
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
+
+    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,  // Lanes 0..21 index into bytes0:bytes1 (64 elements).
+                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
+                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
+    // -1 is such that when adding 1 or 2, we get the correct index for
+    // green/blue.
+    __m512i ridx_hi =  // Upper lanes index into bytes2.
+        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
+    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
+    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
+    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
+
+    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);  // Lanes 22..31 of R come from bytes2.
+    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);  // Lanes 21..31 of G and B come from bytes2.
+
+    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
+    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
+    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
+    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);  // Masked lanes are overwritten from bytes2; the rest keep the *lo value.
+    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
+    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {  // 32 interleaved 4 x 8-bit pixels (reads 128 bytes) -> {R, G, B, A}.
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
+    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);  // Undoes the per-128-bit-block interleaving of packus.
+    __m512i rg = _mm512_permutexvar_epi64(  // Low 16 bits of each pixel: bytes 0 and 1.
+        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
+                                        _mm512_and_si512(bytes2, rg_mask)));
+    __m512i ba = _mm512_permutexvar_epi64(  // High 16 bits of each pixel: bytes 2 and 3.
+        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
+                                        _mm512_srli_epi32(bytes2, 16)));
+    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
+    __m512i g = _mm512_srli_epi16(rg, 8);
+    __m512i b = _mm512_and_si512(ba, _mm512_set1_epi16(0xFF));
+    __m512i a = _mm512_srli_epi16(ba, 8);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {  // 32 interleaved 4 x 16-bit pixels (reads 256 bytes) -> {R, G, B, A}.
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
+    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
+
+    auto pack32 = [](__m512i a, __m512i b) {  // Narrows 32 -> 16 bits, keeping a's lanes before b's.
+      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
+    };
+    auto packlow32 = [&pack32](__m512i a, __m512i b) {  // Keeps the low 16 bits of every 32-bit lane.
+      __m512i mask = _mm512_set1_epi32(0xFFFF);
+      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
+    };
+    auto packhi32 = [&pack32](__m512i a, __m512i b) {  // Keeps the high 16 bits of every 32-bit lane.
+      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
+    };
+
+    __m512i rb0 = packlow32(bytes0, bytes1);  // First pass splits samples by parity: R,B,R,B,... and G,A,G,A,...
+    __m512i rb1 = packlow32(bytes2, bytes3);
+    __m512i ga0 = packhi32(bytes0, bytes1);
+    __m512i ga1 = packhi32(bytes2, bytes3);
+
+    __m512i r = packlow32(rb0, rb1);  // Second pass separates each pair into single channels.
+    __m512i g = packlow32(ga0, ga1);
+    __m512i b = packhi32(rb0, rb1);
+    __m512i a = packhi32(ga0, ga1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+
+  void SwapEndian() {  // Swaps the two bytes of every 16-bit lane in place.
+    auto indices = _mm512_broadcast_i32x4(
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+    vec = _mm512_shuffle_epi8(vec, indices);
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,  // Per-lane select: if_true where the mask bit is set,
+                             const SIMDVec16& if_false) {  // if_false everywhere else.
+  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};  // The blend takes its second vector operand where the bit is set.
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,  // Per-lane select: if_true where the mask bit is set,
+                             const SIMDVec32& if_false) {  // if_false everywhere else.
+  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};  // The blend takes its second vector operand where the bit is set.
+}
+
+struct Bits64 {  // Per-lane (bit count, bit pattern) pairs in 64-bit lanes.
+  static constexpr size_t kLanes = 8;  // 512 bits / 64 bits per lane.
+
+  __m512i nbits;  // Number of bits per lane.
+  __m512i bits;  // The bits themselves, per lane.
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {  // Unaligned store of kLanes values to each output array.
+    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
+    _mm512_storeu_si512((__m512i*)bits_out, bits);
+  }
+};
+
+struct Bits32 {  // Per-lane (bit count, bit pattern) pairs in 16 x 32-bit lanes.
+  __m512i nbits;  // Number of bits per lane.
+  __m512i bits;  // The bits themselves, per lane.
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {  // Wraps two existing vectors without modifying them.
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {  // Fuses each pair of adjacent 32-bit lanes into one 64-bit lane.
+    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);  // Odd lanes.
+    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));  // Even lanes.
+    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
+    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
+
+    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);  // Combined length.
+    auto bits64 =  // Odd lane's bits are placed above the even lane's bits.
+        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {  // Per lane: prepends `low`'s bits below this lane's bits and sums the lengths.
+    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
+    nbits = _mm512_add_epi32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {  // Keeps the first n lanes and zeroes the rest (n is clamped to 16).
+    n = std::min<size_t>(n, 16);
+    constexpr uint32_t kMask[32] = {  // 16 all-ones entries followed by 16 zero entries.
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));  // Sliding window: the first n loaded lanes are all-ones.
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+  void Skip(size_t n) {  // Zeroes the first n lanes and keeps the rest (n is clamped to 16).
+    n = std::min<size_t>(n, 16);
+    constexpr uint32_t kMask[32] = {  // 16 zero entries followed by 16 all-ones entries.
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));  // Sliding window: the first n loaded lanes are zero.
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+};
+
+struct Bits16 {  // Per-lane (bit count, bit pattern) pairs in 32 x 16-bit lanes.
+  __m512i nbits;  // Number of bits per lane.
+  __m512i bits;  // The bits themselves, per lane.
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {  // Wraps two existing vectors without modifying them.
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {  // Fuses each pair of adjacent 16-bit lanes into one 32-bit lane.
+    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);  // Odd lanes.
+    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));  // Even lanes.
+    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
+    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
+
+    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);  // Combined length.
+    auto bits32 =  // Odd lane's bits are placed above the even lane's bits.
+        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {  // Per lane: prepends `low`'s bits below this lane's bits and sums the lengths.
+    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
+    nbits = _mm512_add_epi16(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {  // Keeps the first n lanes and zeroes the rest (n is clamped to 32).
+    n = std::min<size_t>(n, 32);
+    constexpr uint16_t kMask[64] = {  // 32 all-ones entries followed by 32 zero entries.
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));  // Sliding window: the first n loaded lanes are all-ones.
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+  void Skip(size_t n) {  // Zeroes the first n lanes and keeps the rest (n is clamped to 32).
+    n = std::min<size_t>(n, 32);
+    constexpr uint16_t kMask[64] = {  // 32 zero entries followed by 32 all-ones entries.
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));  // Sliding window: the first n loaded lanes are zero.
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_AVX2
+#define FJXL_GENERIC_SIMD
+
+struct SIMDVec32;
+
+struct Mask32 {  // Predicate for SIMDVec32 (AVX2): a full vector whose 32-bit lanes act as per-lane flags.
+  __m256i mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);  // Declared here; the definition is not in this part of the file.
+  size_t CountPrefix() const {  // Presumably the number of consecutive set lanes starting at lane 0 (CtzNonZero is defined elsewhere).
+    return CtzNonZero(~static_cast<uint64_t>(  // Inverted in 64 bits, so a set bit always exists above the 8 lane bits.
+        (uint8_t)_mm256_movemask_ps(_mm256_castsi256_ps(mask))));  // movemask_ps gathers the top bit of each of the 8 lanes.
+  }
+};
+
+struct SIMDVec32 {  // Vector of 8 x 32-bit lanes held in one AVX2 register.
+  __m256i vec;
+
+  static constexpr size_t kLanes = 8;  // 256 bits / 32 bits per lane.
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {  // Unaligned load of kLanes values.
+    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) {  // Unaligned store of kLanes values.
+    _mm256_storeu_si256((__m256i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {  // Broadcasts v to every lane.
+    return SIMDVec32{_mm256_set1_epi32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {  // Per lane: bit width of the value (0 for 0), via nibble table lookups.
+    // we know that each value has at most 20 bits, so we just need 5 nibbles
+    // and don't need to mask the fifth. However we do need to set the higher
+    // bytes to 0xFF, which will make table lookups return 0.
+    auto nibble0 =
+        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi32(0xF)),
+                        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble1 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 4), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble2 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 8), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble3 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 12), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble4 = _mm256_or_si256(_mm256_srli_epi32(vec, 16),
+                                   _mm256_set1_epi32(0xFFFFFF00));
+
+    auto lut0 = _mm256_broadcastsi128_si256(  // lutK[x]: bit width of nibble x plus 4*K, or 0 when x == 0.
+        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
+    auto lut1 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
+    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
+    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
+    auto lut4 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20));
+
+    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
+    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
+    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
+    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
+    auto token4 = _mm256_shuffle_epi8(lut4, nibble4);
+
+    auto token =  // The highest nonzero nibble yields the largest candidate, so max() selects it.
+        _mm256_max_epi32(_mm256_max_epi32(_mm256_max_epi32(token0, token1),
+                                          _mm256_max_epi32(token2, token3)),
+                         token4);
+    return SIMDVec32{token};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {  // Unsigned saturating subtract: max(a, b) - b.
+    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
+                                      to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {  // Wrapping 32-bit subtract.
+    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {  // Wrapping 32-bit add.
+    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {  // Bitwise XOR.
+    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {  // Per lane: 1 << x.
+    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {  // Per-lane equality; true lanes become all-ones.
+    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {  // Per-lane *signed* this > oth; true lanes become all-ones.
+    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {  // Arithmetic (sign-propagating) right shift by i bits.
+    return SIMDVec32{_mm256_srai_epi32(vec, i)};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {  // Predicate for SIMDVec16 (AVX2): a full vector whose 16-bit lanes act as per-lane flags.
+  __m256i mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);  // Declared here; the definition is not in this part of the file.
+  Mask16 And(const Mask16& oth) const {  // Lane-wise AND of two masks.
+    return Mask16{_mm256_and_si256(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {  // Presumably the number of consecutive set lanes starting at lane 0 (CtzNonZero is defined elsewhere).
+    return CtzNonZero(
+               ~static_cast<uint64_t>((uint32_t)_mm256_movemask_epi8(mask))) /  // movemask_epi8 yields one bit per byte, i.e. two per lane...
+           2;  // ...hence the division by 2 to convert bytes to 16-bit lanes.
+  }
+};
+
+struct SIMDVec16 {
+  __m256i vec;
+
+  static constexpr size_t kLanes = 16;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) {
+    _mm256_storeu_si256((__m256i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{_mm256_set1_epi16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
+    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    auto nibble0 =
+        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
+                        _mm256_set1_epi16(0xFF00));
+    auto nibble1 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
+        _mm256_set1_epi16(0xFF00));
+    auto nibble2 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
+        _mm256_set1_epi16(0xFF00));
+    auto nibble3 =
+        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
+
+    auto lut0 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
+    auto lut1 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
+    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
+    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
+
+    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
+    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
+    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
+    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
+
+    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
+                                  _mm256_max_epi16(token2, token3));
+    return SIMDVec16{token};
+  }
+
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
+                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
+    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+                      1 << 4, 1 << 5, 1 << 6, 1u << 7));
+
+    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
+
+    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
+    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
+
+    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
+    return SIMDVec16{pow2};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    return SIMDVec16{_mm256_shuffle_epi8(
+        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
+    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
+    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
+            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
+    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
+    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
+            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{_mm256_srai_epi16(vec, i)};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    __m128i bytes = _mm_loadu_si128((__m128i*)data);
+    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
+    __m256i alpha = _mm256_srli_epi16(bytes, 8);
+    return {SIMDVec16{gray}, SIMDVec16{alpha}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
+    __m256i g = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
+                            _mm256_and_si256(bytes2, g_mask)),
+        0b11011000);
+    __m256i a = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
+                            _mm256_srli_epi32(bytes2, 16)),
+        0b11011000);
+    return {SIMDVec16{g}, SIMDVec16{a}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
+    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
+    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
+    // De-interleaves the 48 bytes at `data` (16 RGB8 pixels) into {R, G, B}.
+    __m128i idx =
+        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+    // idx groups a chunk's bytes by (offset % 3): 6 bytes, then 5, then 5.
+    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
+    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
+    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
+    // Byte-select masks: mask010 covers bytes 6..10, mask001 bytes 11..15.
+    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
+                                    0xFF, 0, 0, 0, 0, 0);
+    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
+                                    0xFF, 0xFF, 0xFF);
+    // Blue: 6 bytes from chunk 2, then 5 from chunk 0, then 5 from chunk 1.
+    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
+    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
+    // Red comes out already in pixel order: chunk 0, chunk 1, chunk 2.
+    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
+    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
+    // Green: 6 bytes from chunk 1, then 5 from chunk 2, then 5 from chunk 0.
+    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
+    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
+    // Rotate the byte vectors so that chunk 0's samples come first.
+    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
+    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
+    // Zero-extend every 8-bit sample into its own 16-bit lane.
+    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
+            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
+            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    auto load_and_split_lohi = [](const unsigned char* data) {
+      // LHLHLH... (L/H: byte at the lower/higher address of each 16-bit lane)
+      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+      // L0L0L0... (the L bytes, one per 16-bit lane)
+      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
+      // H0H0H0... (the H bytes, one per 16-bit lane)
+      __m256i hi = _mm256_srli_epi16(bytes, 8);
+      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH (packus works per 128-bit half)
+      __m256i packed = _mm256_packus_epi16(lo, hi);
+      return _mm256_permute4x64_epi64(packed, 0b11011000);  // 16 x L, 16 x H
+    };
+    __m256i bytes0 = load_and_split_lohi(data);
+    __m256i bytes1 = load_and_split_lohi(data + 32);
+    __m256i bytes2 = load_and_split_lohi(data + 64);
+    // De-interleaves 96 bytes (16 RGB pixels, 16 bits per sample) to {R, G, B}.
+    __m256i idx = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
+    // The shuffle acts per 128-bit half: on the L bytes and H bytes separately.
+    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
+    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
+    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
+    // Byte-select masks: mask010 covers bytes 6..10, mask001 bytes 11..15.
+    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
+    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
+    // Blue: 6 bytes from chunk 2, then 5 from chunk 0, then 5 from chunk 1.
+    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
+    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
+    // Red comes out already in pixel order: chunk 0, chunk 1, chunk 2.
+    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
+    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
+    // Green: 6 bytes from chunk 1, then 5 from chunk 2, then 5 from chunk 0.
+    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
+    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
+    // Rotate within each 128-bit half so chunk 0's samples come first.
+    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
+    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
+
+    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
+    // lower half, and the high bytes in their upper half.
+    // combine_low_hi rebuilds the 16-bit samples as (high << 8) | low.
+    auto combine_low_hi = [](__m256i v) {
+      __m128i low = _mm256_extracti128_si256(v, 0);
+      __m128i hi = _mm256_extracti128_si256(v, 1);
+      __m256i low16 = _mm256_cvtepu8_epi16(low);
+      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
+      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
+    };
+
+    return {SIMDVec16{combine_low_hi(r0r1r2)},
+            SIMDVec16{combine_low_hi(g0g1g2)},
+            SIMDVec16{combine_low_hi(b0b1b2)}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {  // 64 bytes (16 RGBA8 pixels) -> {R, G, B, A}
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
+    __m256i rg = _mm256_permute4x64_epi64(  // low 16 bits of each 32-bit pixel
+        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
+                            _mm256_and_si256(bytes2, rg_mask)),
+        0b11011000);  // packus works per 128-bit half; restore pixel order
+    __m256i ba = _mm256_permute4x64_epi64(  // high 16 bits of each 32-bit pixel
+        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
+                            _mm256_srli_epi32(bytes2, 16)),
+        0b11011000);
+    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));  // low byte of rg
+    __m256i g = _mm256_srli_epi16(rg, 8);                       // high byte of rg
+    __m256i b = _mm256_and_si256(ba, _mm256_set1_epi16(0xFF));  // low byte of ba
+    __m256i a = _mm256_srli_epi16(ba, 8);                       // high byte of ba
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
+    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
+    // De-interleaves 128 bytes (16 RGBA pixels, 16 bits/sample) to {R, G, B, A}.
+    auto pack32 = [](__m256i a, __m256i b) {  // u32 -> u16 pack, in lane order
+      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
+    };
+    auto packlow32 = [&pack32](__m256i a, __m256i b) {  // low 16 bits of each u32
+      __m256i mask = _mm256_set1_epi32(0xFFFF);
+      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
+    };
+    auto packhi32 = [&pack32](__m256i a, __m256i b) {  // high 16 bits of each u32
+      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
+    };
+    // Pass 1: each 32-bit word holds (R, G) or (B, A); split by 16-bit half.
+    __m256i rb0 = packlow32(bytes0, bytes1);
+    __m256i rb1 = packlow32(bytes2, bytes3);
+    __m256i ga0 = packhi32(bytes0, bytes1);
+    __m256i ga1 = packhi32(bytes2, bytes3);
+    // Pass 2: the same split separates R from B and G from A.
+    __m256i r = packlow32(rb0, rb1);
+    __m256i g = packlow32(ga0, ga1);
+    __m256i b = packhi32(rb0, rb1);
+    __m256i a = packhi32(ga0, ga1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+
+  void SwapEndian() {  // exchanges the two bytes of every 16-bit lane
+    const __m128i pair_swap =
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    vec = _mm256_shuffle_epi8(vec, _mm256_broadcastsi128_si256(pair_swap));
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {  // lane-wise select
+  return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};  // if_true where mask is set
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {  // lane-wise select
+  return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};  // if_true where mask is set
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 4;
+  // Four 64-bit lanes; a lane pairs a bit count (nbits) with its bits.
+  __m256i nbits;
+  __m256i bits;
+  // Unaligned stores of kLanes uint64_t values into each output array.
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
+    _mm256_storeu_si256((__m256i*)bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  __m256i nbits;
+  __m256i bits;
+  // Eight 32-bit lanes; a lane pairs a bit count (nbits) with its bits.
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+  // Fuses every pair of adjacent 32-bit lanes into one 64-bit lane.
+  Bits64 Merge() const {
+    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
+    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
+    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
+    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
+    // The upper lane's bits are placed directly above the lower lane's bits.
+    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
+    auto bits64 =
+        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+  // Per lane: bits = (bits << low.nbits) | low.bits; nbits += low.nbits.
+  void Interleave(const Bits32& low) {
+    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
+    nbits = _mm256_add_epi32(nbits, low.nbits);
+  }
+  // Zeroes every lane except the first min(n, 8).
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint32_t kMask[16] = {
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));  // n ones, then zeros
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+  void Skip(size_t n) {  // zeroes the first min(n, 8) lanes
+    n = std::min<size_t>(n, 8);
+    constexpr uint32_t kMask[16] = {
+        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));  // n zeros, then ones
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+};
+
+struct Bits16 {
+  __m256i nbits;
+  __m256i bits;
+  // Sixteen 16-bit lanes; a lane pairs a bit count (nbits) with its bits.
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+  // Fuses every pair of adjacent 16-bit lanes into one 32-bit lane.
+  Bits32 Merge() const {
+    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
+    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
+    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
+    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
+    // The upper lane's bits are placed directly above the lower lane's bits.
+    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
+    auto bits32 =
+        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+  // Per lane: bits = (bits << low.nbits) | low.bits; nbits += low.nbits.
+  void Interleave(const Bits16& low) {
+    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
+                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
+    auto low_nbits_masked =
+        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));
+    // The shift is a multiply by 2^low.nbits read from the byte LUT above.
+    auto bits_shifted = _mm256_mullo_epi16(
+        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));
+    // NOTE(review): the LUT only covers shifts 0..7 - confirm low.nbits < 8.
+    nbits = _mm256_add_epi16(nbits, low.nbits);
+    bits = _mm256_or_si256(bits_shifted, low.bits);
+  }
+  // Zeroes every lane except the first min(n, 16).
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint16_t kMask[32] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));  // n ones, then zeros
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+  // Zeroes the first min(n, 16) lanes.
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint16_t kMask[32] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));  // n zeros, then ones
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_NEON
+#define FJXL_GENERIC_SIMD
+
+struct SIMDVec32;
+
+struct Mask32 {
+  uint32x4_t mask;  // one u32 per lane; a set lane is expected to be all-ones
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  Mask32 And(const Mask32& oth) const {
+    return {vandq_u32(mask, oth.mask)};
+  }
+  // Returns how many lanes are set before the first clear one (4 if none).
+  size_t CountPrefix() const {
+    const uint32_t lane_index[4] = {0, 1, 2, 3};
+    const uint32x4_t past_end = vdupq_n_u32(4);
+    return vminvq_u32(vbslq_u32(mask, past_end, vld1q_u32(lane_index)));
+  }
+};
+
+struct SIMDVec32 {
+  uint32x4_t vec;
+  // Thin wrapper around four unsigned 32-bit NEON lanes.
+  static constexpr size_t kLanes = 4;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{vld1q_u32(data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {  // broadcasts v to all lanes
+    return SIMDVec32{vdupq_n_u32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {  // 32 - clz(x), i.e. the bit width
+    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {  // unsigned, saturates at 0
+    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {  // wrapping
+    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{vaddq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{veorq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {  // 1 << lane value
+    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{vceqq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {  // lanes compared as *signed*
+    return Mask32{
+        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {  // arithmetic shift by i
+    return SIMDVec32{
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  uint16x8_t mask;  // one u16 per lane; a set lane is expected to be all-ones
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return {vandq_u16(mask, oth.mask)};
+  }
+  // Returns how many lanes are set before the first clear one (8 if none).
+  size_t CountPrefix() const {
+    const uint16_t lane_index[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    const uint16x8_t past_end = vdupq_n_u16(8);
+    return vminvq_u16(vbslq_u16(mask, past_end, vld1q_u16(lane_index)));
+  }
+};
+
+struct SIMDVec16 {
+  uint16x8_t vec;
+  // Thin wrapper around eight unsigned 16-bit NEON lanes.
+  static constexpr size_t kLanes = 8;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{vld1q_u16(data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {  // broadcasts v to all lanes
+    return SIMDVec16{vdupq_n_u16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};  // keeps the low 16 bits of each u32
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {  // 16 - clz(x), i.e. the bit width
+    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
+  }
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {  // unsigned, saturates at 0
+    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {  // wrapping
+    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{vaddq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {  // unsigned minimum
+    return SIMDVec16{vminq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{vceqq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {  // lanes compared as *signed*
+    return Mask16{
+        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {  // 1 << lane value
+    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{vorrq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{veorq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{vandq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {  // halving add: (a + b) >> 1
+    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {  // high byte := 0xFF, an out-of-range table index
+    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {  // byte-wise lookup in a 16-byte table
+    uint8x16_t tbl = vld1q_u8(table);
+    uint8x16_t indices = vreinterpretq_u8_u16(vec);
+    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};  // out-of-range index bytes give 0
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {  // zips lanes as low[i], this[i]
+    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
+            SIMDVec16{vzip2q_u16(low.vec, vec)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {  // zero-extends to two u32 vectors
+    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
+    uint32x4_t hi = vmovl_high_u16(vec);
+    return {SIMDVec32{lo}, SIMDVec32{hi}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {  // arithmetic shift by i
+    return SIMDVec16{
+        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
+  }
+  // Pixel loaders: each reads kLanes pixels and returns one vector per channel.
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    uint8x8_t v = vld1_u8(data);
+    return {SIMDVec16{vmovl_u8(v)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+  // The vld2/vld3/vld4 loads de-interleave the channels while loading.
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    uint8x8x2_t v = vld2_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    uint8x8x3_t v = vld3_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
+            SIMDVec16{vmovl_u8(v.val[2])}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    uint8x8x4_t v = vld4_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
+            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
+            SIMDVec16{v.val[3]}};
+  }
+  // Exchanges the two bytes of every 16-bit lane.
+  void SwapEndian() {
+    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {  // lane-wise select
+  return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)};  // bitwise: if_true where mask bits are 1
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {  // lane-wise select
+  return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)};  // bitwise: if_true where mask bits are 1
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 2;
+  // Two 64-bit lanes; a lane pairs a bit count (nbits) with its bits.
+  uint64x2_t nbits;
+  uint64x2_t bits;
+  // Stores kLanes uint64_t values into each output array.
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    vst1q_u64(nbits_out, nbits);
+    vst1q_u64(bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  uint32x4_t nbits;
+  uint32x4_t bits;
+  // Four 32-bit lanes; a lane pairs a bit count (nbits) with its bits.
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+  // Fuses every pair of adjacent 32-bit lanes into one 64-bit lane.
+  Bits64 Merge() const {
+    // TODO(veluca): can probably be optimized.
+    uint64x2_t nbits_lo32 =
+        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
+    uint64x2_t bits_hi32 =  // upper lane's bits, moved above the lower lane's
+        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
+                  vreinterpretq_s64_u64(nbits_lo32));
+    uint64x2_t bits_lo32 =
+        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
+    uint64x2_t nbits64 =  // lower count + (upper count, via shift-right-accumulate)
+        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
+    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+  // Per lane: bits = (bits << low.nbits) | low.bits; nbits += low.nbits.
+  void Interleave(const Bits32& low) {
+    bits =
+        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
+    nbits = vaddq_u32(nbits, low.nbits);
+  }
+  // Zeroes every lane except the first min(n, 4).
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 4);
+    constexpr uint32_t kMask[8] = {
+        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
+    };
+    uint32x4_t mask = vld1q_u32(kMask + 4 - n);  // n ones, then zeros
+    nbits = vandq_u32(mask, nbits);
+    bits = vandq_u32(mask, bits);
+  }
+  void Skip(size_t n) {  // zeroes the first min(n, 4) lanes
+    n = std::min<size_t>(n, 4);
+    constexpr uint32_t kMask[8] = {
+        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
+    };
+    uint32x4_t mask = vld1q_u32(kMask + 4 - n);  // n zeros, then ones
+    nbits = vandq_u32(mask, nbits);
+    bits = vandq_u32(mask, bits);
+  }
+};
+
+struct Bits16 {
+  uint16x8_t nbits;
+  uint16x8_t bits;
+  // Eight 16-bit lanes; a lane pairs a bit count (nbits) with its bits.
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+  // Fuses every pair of adjacent 16-bit lanes into one 32-bit lane.
+  Bits32 Merge() const {
+    // TODO(veluca): can probably be optimized.
+    uint32x4_t nbits_lo16 =
+        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
+    uint32x4_t bits_hi16 =  // upper lane's bits, moved above the lower lane's
+        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
+                  vreinterpretq_s32_u32(nbits_lo16));
+    uint32x4_t bits_lo16 =
+        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
+    uint32x4_t nbits32 =  // lower count + (upper count, via shift-right-accumulate)
+        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
+    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+  // Per lane: bits = (bits << low.nbits) | low.bits; nbits += low.nbits.
+  void Interleave(const Bits16& low) {
+    bits =
+        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
+    nbits = vaddq_u16(nbits, low.nbits);
+  }
+  // Zeroes every lane except the first min(n, 8).
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint16_t kMask[16] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    uint16x8_t mask = vld1q_u16(kMask + 8 - n);  // n ones, then zeros
+    nbits = vandq_u16(mask, nbits);
+    bits = vandq_u16(mask, bits);
+  }
+  void Skip(size_t n) {  // zeroes the first min(n, 8) lanes
+    n = std::min<size_t>(n, 8);
+    constexpr uint16_t kMask[16] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    uint16x8_t mask = vld1q_u16(kMask + 8 - n);  // n zeros, then ones
+    nbits = vandq_u16(mask, nbits);
+    bits = vandq_u16(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_GENERIC_SIMD
+constexpr size_t SIMDVec32::kLanes;  // out-of-class definitions: kLanes is odr-used below (passed
+constexpr size_t SIMDVec16::kLanes;  // to std::max by reference), which needs them before C++17
+
+// Each of the functions below processes SIMDVec16::kLanes values per call.
+
+FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
+                              uint16_t* nbits_out, uint16_t* bits_out) {
+  // token = ValToToken(v); nbits = token -sat 1; bits = v -sat 2^nbits.
+  SIMDVec16 values = SIMDVec16::Load(residuals);
+  SIMDVec16 tokens = values.ValToToken();
+  SIMDVec16 extra_len = tokens.SatSubU(SIMDVec16::Val(1));
+  tokens.Store(token_out);
+  extra_len.Store(nbits_out);
+  values.SatSubU(extra_len.Pow2()).Store(bits_out);
+}
+
+FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
+                              uint32_t* nbits_out, uint32_t* bits_out) {
+  // One call covers SIMDVec16::kLanes residuals, handled as two u32 vectors.
+  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, "");
+  constexpr size_t kHalf = SIMDVec32::kLanes;
+  const SIMDVec32 one = SIMDVec32::Val(1);
+  SIMDVec32 first = SIMDVec32::Load(residuals);
+  SIMDVec32 second = SIMDVec32::Load(residuals + kHalf);
+  SIMDVec32 first_tok = first.ValToToken();
+  SIMDVec32 second_tok = second.ValToToken();
+  SIMDVec32 first_len = first_tok.SatSubU(one);
+  SIMDVec32 second_len = second_tok.SatSubU(one);
+  SIMDVec16::FromTwo32(first_tok, second_tok).Store(token_out);  // u16 tokens
+  first_len.Store(nbits_out);
+  second_len.Store(nbits_out + kHalf);
+  first.SatSubU(first_len.Pow2()).Store(bits_out);
+  second.SatSubU(second_len.Pow2()).Store(bits_out + kHalf);
+}
+
+FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
+                                   const PrefixCode& code, uint16_t* nbits_out,
+                                   uint16_t* bits_out) {
+  SIMDVec16 slot = SIMDVec16::Load(tokens).PrepareForU8Lookup();
+  slot.U8Lookup(code.raw_nbits_simd).Store(nbits_out);  // code lengths
+  slot.U8Lookup(code.raw_bits_simd).Store(bits_out);    // code bits
+}
+
+FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens, const PrefixCode& code,
+                               uint16_t* nbits_out, uint16_t* bits_out) {
+  SIMDVec16 raw_tokens = SIMDVec16::Load(tokens);
+  // Tokens are capped at 15 before the lookup, so token 16 reads slot 15.
+  SIMDVec16 slot = raw_tokens.Min(SIMDVec16::Val(15)).PrepareForU8Lookup();
+  SIMDVec16 base_code = slot.U8Lookup(code.raw_bits_simd);
+  // The code for token 16 equals the code for token 15 with the highest bit
+  // (value 128) additionally set, so patch that bit in where token == 16.
+  SIMDVec16 with_top_bit = base_code.Or(SIMDVec16::Val(128));
+  Mask16 is_sixteen = raw_tokens.Eq(SIMDVec16::Val(16));
+  SIMDVec16 final_code = is_sixteen.IfThenElse(with_top_bit, base_code);
+  // Emit the code bits first, then the code lengths.
+  final_code.Store(bits_out);
+  slot.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+}
+
+FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
+                                    const PrefixCode& code, uint16_t* nbits_out,
+                                    uint16_t* bits_out) {
+  SIMDVec16 raw_tokens = SIMDVec16::Load(tokens);
+  // `raw_tokens` is assumed to fit in a *signed* 16-bit integer.
+  Mask16 is_large = raw_tokens.Gt(SIMDVec16::Val(12));
+  // Tokens above 12 share table slots pairwise via a halving add with 13:
+  //   13, 14 -> 13;  15, 16 -> 14;  17, 18 -> 15.
+  SIMDVec16 halved = raw_tokens.HAdd(SIMDVec16::Val(13));
+  SIMDVec16 slot = is_large.IfThenElse(halved, raw_tokens).PrepareForU8Lookup();
+  SIMDVec16 base_code = slot.U8Lookup(code.raw_bits_simd);
+  // The even member of each pair (14, 16, 18) gets the highest bit set.
+  Mask16 is_even = raw_tokens.Eq(raw_tokens.And(SIMDVec16::Val(0xFFFE)));
+  Mask16 needs_top_bit = is_large.And(is_even);
+  SIMDVec16 with_top_bit = base_code.Or(SIMDVec16::Val(128));
+  SIMDVec16 final_code = needs_top_bit.IfThenElse(with_top_bit, base_code);
+  final_code.Store(bits_out);
+  slot.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+}
+
+FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
+                                const uint16_t* bits_tok,
+                                const uint16_t* nbits_huff,
+                                const uint16_t* bits_huff, size_t n,
+                                size_t skip, Bits32* bits_out) {
+  Bits16 huffman =
+      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
+  Bits16 merged =
+      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
+  merged.Interleave(huffman);  // raw token bits land above the Huffman bits
+  merged.ClipTo(n);            // keep only the first n lanes
+  merged.Skip(skip);           // ...and blank the first `skip` of those
+  bits_out[0] = merged.Merge();
+}
+
+// Here a Huffman code plus its raw bits may not fit in one u16 lane.
+FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
+                                 const uint16_t* bits_tok,
+                                 const uint16_t* nbits_huff,
+                                 const uint16_t* bits_huff, size_t n,
+                                 size_t skip, Bits32* bits_out) {
+  constexpr size_t kLanes = SIMDVec16::kLanes;
+  VecPair<SIMDVec16> lens =
+      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
+  VecPair<SIMDVec16> vals =
+      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
+  Bits16 first_half = Bits16::FromRaw(lens.low, vals.low);  // two lanes per value
+  Bits16 second_half = Bits16::FromRaw(lens.hi, vals.hi);
+  first_half.ClipTo(2 * n);
+  first_half.Skip(2 * skip);
+  second_half.ClipTo(std::max(2 * n, kLanes) - kLanes);  // rebased, floored at 0
+  second_half.Skip(std::max(2 * skip, kLanes) - kLanes);
+  bits_out[0] = first_half.Merge();
+  bits_out[1] = second_half.Merge();
+}
+
+FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
+                                  const uint32_t* bits_tok,
+                                  const uint16_t* nbits_huff,
+                                  const uint16_t* bits_huff, size_t n,
+                                  size_t skip, Bits32* bits_out) {
+  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, "");
+  constexpr size_t kHalf = SIMDVec32::kLanes;
+  // Widen the u16 Huffman lengths/codes to match the 32-bit raw-bit lanes.
+  VecPair<SIMDVec32> code_len = SIMDVec16::Load(nbits_huff).Upcast();
+  VecPair<SIMDVec32> code_val = SIMDVec16::Load(bits_huff).Upcast();
+  Bits32 first =
+      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
+  Bits32 second = Bits32::FromRaw(SIMDVec32::Load(nbits_tok + kHalf),
+                                  SIMDVec32::Load(bits_tok + kHalf));
+
+  // First kHalf values: raw bits go above the Huffman code in each lane.
+  first.Interleave(Bits32::FromRaw(code_len.low, code_val.low));
+  first.ClipTo(n);
+  first.Skip(skip);
+  bits_out[0] = first;
+
+  // Remaining values; n and skip are rebased to this half, floored at 0.
+  second.Interleave(Bits32::FromRaw(code_len.hi, code_val.hi));
+  second.ClipTo(std::max(n, kHalf) - kHalf);
+  second.Skip(std::max(skip, kHalf) - kHalf);
+  bits_out[1] = second;
+}
+
+#ifdef FJXL_AVX512
+FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
+  __m512i bits = bits32.bits;
+  __m512i nbits = bits32.nbits;
+  // Appends the (nbits, bits) pairs of all 16 lanes of `bits32` to `output`.
+  // Insert the leftover bits from the bit buffer at the bottom of the vector
+  // and extract the top of the vector.
+  uint64_t trail_bits =  // NOTE(review): the intrinsic returns int; confirm bit 31 is never set (it would sign-extend)
+      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
+  uint64_t trail_nbits =
+      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
+  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
+  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
+  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
+  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
+
+  // Merge 32 -> 64 bits.
+  Bits32 b{nbits, bits};
+  Bits64 b64 = b.Merge();
+  bits = b64.bits;
+  nbits = b64.nbits;
+
+  __m512i zero = _mm512_setzero_si512();
+  // shN moves every 64-bit lane up by N positions, filling with zeros.
+  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
+  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
+  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
+
+  // Compute first-past-end-bit-position.
+  __m512i end_interm0 = _mm512_add_epi64(nbits, sh1(nbits));
+  __m512i end_interm1 = _mm512_add_epi64(end_interm0, sh2(end_interm0));
+  __m512i end = _mm512_add_epi64(end_interm1, sh4(end_interm1));
+  // The top lane of the prefix sum is the total bit count of the vector.
+  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
+
+  // Compute begin-bit-position.
+  __m512i begin = _mm512_sub_epi64(end, nbits);
+
+  // Index of the last bit in the chunk, or the end bit if nbits==0.
+  __m512i last = _mm512_mask_sub_epi64(
+      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
+
+  __m512i lane_offset_mask = _mm512_set1_epi64(63);
+
+  // Starting position of the chunk that each lane will ultimately belong to.
+  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
+
+  // For all lanes that contain bits belonging to two different 64-bit chunks,
+  // compute the number of bits that belong to the first chunk.
+  // total # of bits fit in a u16, so we can satsub_u16 here.
+  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
+
+  // Move all the previous-chunk-bits to the previous lane.
+  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
+  __m512i first_chunk_bits =
+      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
+  __m512i first_chunk_bits_down =
+      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
+  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
+  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
+  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
+  begin = _mm512_add_epi64(begin, first_chunk_nbits);
+
+  // We now know that every lane should give bits to only one chunk. We can
+  // shift the bits and then horizontally-or-reduce them within the same chunk.
+  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
+  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
+  // h-or-reduce within same chunk
+  __m512i red0 = _mm512_mask_or_epi64(
+      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
+      sh1(aligned_bits), aligned_bits);
+  __m512i red1 = _mm512_mask_or_epi64(
+      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
+      red0);
+  __m512i reduced = _mm512_mask_or_epi64(
+      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
+      red1);
+  // Extract the highest lane that belongs to each chunk (the lane that ends up
+  // with the OR-ed value of all the other lanes of that chunk).
+  __m512i next_chunk_start =
+      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
+  __m512i result = _mm512_maskz_compress_epi64(
+      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
+  // Always stores a full 64 bytes; only simd_nbits / 8 of them are committed.
+  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
+                      result);
+
+  // Update the bit writer and add the last 32-bit lane.
+  // Note that since trail_nbits was at most 32 to begin with, operating on
+  // trail_bits does not risk overflowing.
+  output.bytes_written += simd_nbits / 8;
+  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
+  // that the byte of bit writer data we access is initialized. This is
+  // guaranteed because the remaining bits in the bit writer buffer are at most
+  // 7, so simd_nbits <= 505 always.
+  trail_bits = (trail_bits << (simd_nbits % 8)) +
+               output.data.get()[output.bytes_written];
+  trail_nbits += simd_nbits % 8;
+  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
+  size_t trail_bytes = trail_nbits / 8;
+  output.bits_in_buffer = trail_nbits % 8;
+  output.buffer = trail_bits >> (trail_bytes * 8);
+  output.bytes_written += trail_bytes;
+}
+
+#endif
+
+template <size_t n>
+FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
+  // Flushes the n Bits32 vectors found at `bits`, in order, into `output`.
+#ifdef FJXL_AVX512
+  // The AVX-512 path writes each vector directly; at most two are supported.
+  static_assert(n <= 2, "");
+  StoreToWriterAVX512(bits[0], output);
+  if (n > 1) {
+    StoreToWriterAVX512(bits[1], output);
+  }
+#else
+  static_assert(n <= 4, "");
+  constexpr size_t kStride = Bits64::kLanes;
+  // Scratch arrays that receive every vector's 64-bit (nbits, bits) lanes.
+  alignas(64) uint64_t lengths[kStride * n];
+  alignas(64) uint64_t payload[kStride * n];
+
+  // bits[0] is always merged; the loop covers any remaining vectors.
+  bits[0].Merge().Store(lengths, payload);
+  for (size_t i = 1; i < n; ++i) {
+    bits[i].Merge().Store(lengths + i * kStride, payload + i * kStride);
+  }
+
+  // Hand all lanes to the bit writer in a single call.
+  output.WriteMultiple(lengths, payload, kStride * n);
+#endif
+}
+
+namespace detail {
+template <typename T>
+struct IntegerTypes;
+// IntegerTypes<V>: signed_ / unsigned_ scalar types paired with vector type V.
+template <>
+struct IntegerTypes<SIMDVec16> {
+  using signed_ = int16_t;
+  using unsigned_ = uint16_t;
+};
+
+template <>
+struct IntegerTypes<SIMDVec32> {
+  using signed_ = int32_t;
+  using unsigned_ = uint32_t;
+};
+// SIMDType<P>: vector type paired with signed scalar type P (reverse mapping).
+template <typename T>
+struct SIMDType;
+
+template <>
+struct SIMDType<int16_t> {
+  using type = SIMDVec16;
+};
+
+template <>
+struct SIMDType<int32_t> {
+  using type = SIMDVec32;
+};
+
+}  // namespace detail
+// Shorthands: signed_t<V> / unsigned_t<V> look up IntegerTypes<V>.
+template <typename T>
+using signed_t = typename detail::IntegerTypes<T>::signed_;
+
+template <typename T>
+using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
+// simd_t<P> looks up SIMDType<P>, e.g. simd_t<int16_t> is SIMDVec16.
+template <typename T>
+using simd_t = typename detail::SIMDType<T>::type;
+
+// Processes exactly one SIMD vector worth of pixels: predicts them from the
+// left/top/topleft neighbours like the scalar loop in ChannelRowProcessor,
+// stores the folded residuals and returns CountPrefix() of the zero mask.
+template <typename T>
+size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
+                     const signed_t<T>* pixels_top,
+                     const signed_t<T>* pixels_topleft,
+                     unsigned_t<T>* residuals) {
+  T zero = T::Val(0);
+  T w = T::Load((unsigned_t<T>*)pixels_left);
+  T n = T::Load((unsigned_t<T>*)pixels_top);
+  T nw = T::Load((unsigned_t<T>*)pixels_topleft);
+  T w_nw = w.Sub(nw);
+  T n_nw = n.Sub(nw);
+  T bend = w.Sub(n).Xor(n_nw);
+  T slope = w_nw.Xor(n_nw);
+  // bend < 0 picks top over left; slope < 0 picks the gradient w - nw + n.
+  T fallback = zero.Gt(bend).IfThenElse(n, w);
+  T pred = zero.Gt(slope).IfThenElse(w_nw.Add(n), fallback);
+  T diff = T::Load((unsigned_t<T>*)pixels).Sub(pred);
+  T twice = diff.Add(diff);
+  // Sign folding: diff < 0 -> -1 - 2 * diff, otherwise 2 * diff.
+  T packed = zero.Gt(diff).IfThenElse(T::Val(-1).Sub(twice), twice);
+  packed.Store(residuals);
+  return packed.Eq(T::Val(0)).CountPrefix();
+}
+
+#endif
+
+void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
+                         uint32_t* bits) {  // hybrid-uint config "000"
+  const uint32_t n = FloorLog2(value);  // helper defined elsewhere in the file
+  *token = value ? n + 1 : 0;           // 0 for value 0, otherwise n + 1
+  *nbits = value ? n : 0;               // number of extra bits that follow
+  *bits = value ? value - (uint32_t{1} << n) : 0;  // unsigned: n can be 31
+}
+
+#ifdef FJXL_AVX512
+constexpr static size_t kLogChunkSize = 5;  // 32 pixels per chunk
+#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
+// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster
+// to process two vectors at a time.
+constexpr static size_t kLogChunkSize = 4;  // 16 pixels per chunk
+#else
+constexpr static size_t kLogChunkSize = 3;  // 8 pixels per chunk
+#endif
+// Number of pixels (residuals) in one chunk; see ChannelRowProcessor.
+constexpr static size_t kChunkSize = 1 << kLogChunkSize;
+
+template <typename Residual>  // Scalar path of the EncodeChunk methods.
+void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
+                        const PrefixCode& code, BitWriter& output) {
+  for (size_t pos = skip; pos < n; ++pos) {  // only residuals[skip, n) are coded
+    unsigned tok, extra_len, extra;
+    EncodeHybridUint000(residuals[pos], &tok, &extra_len, &extra);
+    const auto code_len = code.raw_nbits[tok];  // prefix code goes in low bits
+    output.Write(code_len + extra_len, code.raw_bits[tok] | extra << code_len);
+  }
+}
+
+struct UpTo8Bits {
+  // Input sample bit depth; the constructor asserts it is at most 8.
+  size_t bitdepth;
+  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth <= 8);
+  }
+  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
+  // symbols, we could actually go up to 8 Huffman bits as we have at most 8
+  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
+  // Huffman length is 8 or more, so we cap at 7 anyway. Last symbol is used for
+  // LZ77 lengths and has no limitations except allowing to represent 32 symbols
+  // in total.
+  static constexpr uint8_t kMinRawLength[12] = {};
+  static constexpr uint8_t kMaxRawLength[12] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 16; }
+  static constexpr size_t kInputBytes = 1;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    // Always copies 16 table entries, independent of `n`.
+    constexpr size_t kSimdEntries = 16;
+    assert(n <= kSimdEntries);
+    memcpy(nbits_simd, nbits, kSimdEntries);
+    memcpy(bits_simd, bits, kSimdEntries);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    constexpr size_t kVecs = kChunkSize / SIMDVec16::kLanes;
+    Bits32 bits32[kVecs];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t at = 0; at < kChunkSize; at += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + at, token, nbits, bits);
+      HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, at) - at,
+                     std::max(skip, at) - at, bits32 + at / SIMDVec16::kLanes);
+    }
+    StoreToWriter<kVecs>(bits32, output);
+#else
+    GenericEncodeChunk(residuals, n, skip, code, output);
+#endif
+  }
+
+  size_t NumSymbols(bool doing_ycocg) const {
+    // values gain 1 bit for YCoCg, 1 bit for prediction.
+    // Maximum symbol is 1 + effective bit depth of residuals.
+    return bitdepth + (doing_ycocg ? 3 : 2);
+  }
+};
+constexpr uint8_t UpTo8Bits::kMinRawLength[];
+constexpr uint8_t UpTo8Bits::kMaxRawLength[];
+
+struct From9To13Bits {
+  // Input sample bit depth; the constructor asserts it lies in [9, 13].
+  size_t bitdepth;
+  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth <= 13 && bitdepth >= 9);
+  }
+  // Last symbol is used for LZ77 lengths and has no limitations except allowing
+  // to represent 32 symbols in total.
+  // We cannot fit all the bits in a u16, so do not even try and use up to 8
+  // bits per raw symbol.
+  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
+  // any special tricks.
+  static constexpr uint8_t kMinRawLength[17] = {};
+  static constexpr uint8_t kMaxRawLength[17] = {
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 21; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    // Always copies 16 table entries, independent of `n`.
+    constexpr size_t kSimdEntries = 16;
+    assert(n <= kSimdEntries);
+    memcpy(nbits_simd, nbits, kSimdEntries);
+    memcpy(bits_simd, bits, kSimdEntries);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    constexpr size_t kVecs = 2 * kChunkSize / SIMDVec16::kLanes;
+    Bits32 bits32[kVecs];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t at = 0; at < kChunkSize; at += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + at, token, nbits, bits);
+      HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, at) - at,
+                      std::max(skip, at) - at,
+                      bits32 + 2 * at / SIMDVec16::kLanes);
+    }
+    StoreToWriter<kVecs>(bits32, output);
+#else
+    GenericEncodeChunk(residuals, n, skip, code, output);
+#endif
+  }
+
+  size_t NumSymbols(bool doing_ycocg) const {
+    // values gain 1 bit for YCoCg, 1 bit for prediction.
+    // Maximum symbol is 1 + effective bit depth of residuals.
+    return bitdepth + (doing_ycocg ? 3 : 2);
+  }
+};
+constexpr uint8_t From9To13Bits::kMinRawLength[];
+constexpr uint8_t From9To13Bits::kMaxRawLength[];
+
+// Debug check: both codes are 8 bits long and bits2 is bits1 with bit 7 set.
+void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
+  assert(nbits1 == 8 && nbits2 == 8);
+  assert((bits1 | 128) == bits2);
+}
+
+struct Exactly14Bits {
+  explicit Exactly14Bits(size_t bitdepth) { assert(bitdepth == 14); }
+  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
+  // have exactly 8, and no other symbol to have 8 or more. This ensures that
+  // the representation for 15 and 16 is identical up to one bit.
+  static constexpr uint8_t kMinRawLength[18] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
+  };
+  static constexpr uint8_t kMaxRawLength[18] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
+  };
+  static constexpr size_t bitdepth = 14;
+  static size_t MaxEncodedBitsPerSample() { return 22; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n == 17);
+    // Symbol 16 is not copied: its code must be symbol 15's with bit 7 set.
+    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t at = 0; at < kChunkSize; at += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + at, token, nbits, bits);
+      HuffmanSIMD14(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, at) - at,
+                      std::max(skip, at) - at,
+                      bits32 + 2 * at / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+#else
+    GenericEncodeChunk(residuals, n, skip, code, output);
+#endif
+  }
+  size_t NumSymbols(bool) const { return 17; }
+};
+constexpr uint8_t Exactly14Bits::kMinRawLength[];
+constexpr uint8_t Exactly14Bits::kMaxRawLength[];
+
+struct MoreThan14Bits {
+  size_t bitdepth;
+  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    // Only bit depths 15 and 16 are handled by this configuration.
+    assert(bitdepth > 14 && bitdepth <= 16);
+  }
+  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
+  // have exactly 8, and no other symbol to have 8 or more. This ensures that
+  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
+  // bit.
+  static constexpr uint8_t kMinRawLength[20] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
+  };
+  static constexpr uint8_t kMaxRawLength[20] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 24; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int32_t;
+  using upixel_t = uint32_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n == 19);
+    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
+    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
+    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
+    // Slots 0..13 take symbols 0..13; slots 14 and 15 take symbols 15 and 17.
+    // Symbols 14, 16 and 18 are not copied (see the checks above).
+    memcpy(nbits_simd, nbits, 14);
+    memcpy(bits_simd, bits, 14);
+    nbits_simd[14] = nbits[15];
+    bits_simd[14] = bits[15];
+    nbits_simd[15] = nbits[17];
+    bits_simd[15] = bits[17];
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    constexpr size_t kVecs = 2 * kChunkSize / SIMDVec16::kLanes;
+    Bits32 bits32[kVecs];
+    alignas(64) uint32_t bits[SIMDVec16::kLanes];
+    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t at = 0; at < kChunkSize; at += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + at, token, nbits, bits);
+      HuffmanSIMDAbove14(token, code, nbits_huff, bits_huff);
+      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, at) - at,
+                       std::max(skip, at) - at,
+                       bits32 + 2 * at / SIMDVec16::kLanes);
+    }
+    StoreToWriter<kVecs>(bits32, output);
+#else
+    GenericEncodeChunk(residuals, n, skip, code, output);
+#endif
+  }
+  size_t NumSymbols(bool) const { return 19; }
+};
+constexpr uint8_t MoreThan14Bits::kMinRawLength[];
+constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
+
+// Writes the fixed start of the DC-global section: modular tree, LZ77 and
+// entropy-coder setup, the four prefix codes and the global group header.
+void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
+                           const PrefixCode code[4], BitWriter* output) {
+  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
+  // No patches, spline or noise.
+  output->Write(1, 1);  // default DC dequantization factors (?)
+  output->Write(1, 1);  // use global tree / histograms
+  output->Write(1, 0);  // no lz77 for the tree
+
+  output->Write(1, 1);         // simple code for the tree's context map
+  output->Write(2, 0);         // all contexts clustered together
+  output->Write(1, 1);         // use prefix code for tree
+  output->Write(4, 0);         // 000 hybrid uint
+  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
+  output->Write(2, 1);         // simple prefix code
+  output->Write(2, 3);         // with 4 symbols
+  for (int sym = 0; sym < 4; sym++) output->Write(2, sym);  // symbols 0..3
+  output->Write(1, 0);  // First tree encoding option
+  // Huffman table + extra bits for the tree.
+  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
+  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
+  // Write a tree with a leaf per channel, and gradient predictor for every
+  // leaf.
+  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
+                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
+    output->Write(symbol_nbits[v], symbol_bits[v]);
+  }
+
+  output->Write(1, 1);     // Enable lz77 for the main bitstream
+  output->Write(2, 0b00);  // lz77 offset 224
+  static_assert(kLZ77Offset == 224, "");
+  output->Write(4, 0b1010);  // lz77 min length 7
+  // 400 hybrid uint config for lz77
+  output->Write(4, 4);
+  output->Write(3, 0);
+  output->Write(3, 0);
+
+  output->Write(1, 1);  // simple code for the context map
+  output->Write(2, 3);  // 3 bits per entry
+  output->Write(3, 4);  // channel 3
+  output->Write(3, 3);  // channel 2
+  output->Write(3, 2);  // channel 1
+  output->Write(3, 1);  // channel 0
+  output->Write(3, 0);  // distance histogram first
+
+  output->Write(1, 1);  // use prefix codes
+  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
+  for (size_t i = 0; i < 4; i++) {
+    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
+  }
+
+  // Distance alphabet size:
+  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
+  // Symbol + LZ77 alphabet size, same var16 layout as above: 1 + 2^8 + value.
+  // The 8-bit value must be 0 (256 does not fit in 8 bits), giving 257.
+  for (size_t i = 0; i < 4; i++) {
+    output->Write(1, 1);  // > 1
+    output->Write(4, 8);  // 8 value bits follow
+    output->Write(8, 0);  // value 0 -> 257 symbols
+  }
+
+  // Distance histogram:
+  output->Write(2, 1);  // simple prefix code
+  output->Write(2, 0);  // with one symbol
+  output->Write(1, 1);  // 1
+
+  // Symbol + lz77 histogram:
+  for (size_t i = 0; i < 4; i++) {
+    code[i].WriteTo(output);
+  }
+
+  // Group header for global modular image.
+  output->Write(1, 1);  // Global tree
+  output->Write(1, 1);  // All default wp
+}
+
+// Writes the DC-global section: the common part, then the transform list
+// (one YCoCg RCT when there are more than 2 channels, none otherwise).
+void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
+                     size_t nb_chans, const PrefixCode code[4],
+                     BitWriter* output) {
+  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
+  const bool has_color = nb_chans > 2;
+  output->Write(2, has_color ? 0b01 : 0b00);  // number of transforms: 1 or 0
+  if (has_color) {
+    output->Write(2, 0b00);     // RCT
+    output->Write(5, 0b00000);  // Starting from ch 0
+    output->Write(2, 0b00);     // YCoCg
+  }
+  // Only multi-group output is padded to a byte boundary here.
+  if (!is_single_group) output->ZeroPadToByte();
+}
+
+template <typename BitDepth>
+struct ChunkEncoder {  // Writes runs and residual chunks to `output` using `code`.
+  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
+                                    BitWriter& output) {
+    if (count == 0) return;
+    const size_t len = count - (kLZ77MinLength + 1);
+    if (len < kLZ77CacheSize) {  // served from the code's lz77 cache tables
+      output.Write(code.lz77_cache_nbits[len], code.lz77_cache_bits[len]);
+      return;
+    }
+    unsigned token, nbits, bits;
+    EncodeHybridUintLZ77(len, &token, &nbits, &bits);
+    // Low to high: code of raw symbol 0, LZ77 token code, extra bits.
+    const uint64_t tail =
+        (uint64_t{bits} << code.lz77_nbits[token]) | code.lz77_bits[token];
+    const uint64_t wbits = (tail << code.raw_nbits[0]) | code.raw_bits[0];
+    output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
+  }
+  // Writes the pending run (if any), then residuals[skip, n).
+  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
+                         size_t skip, size_t n) {
+    EncodeRle(run, *code, *output);
+    BitDepth::EncodeChunk(residuals, n, skip, *code, *output);
+  }
+  // Writes the run still pending at the end of the data.
+  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
+  const PrefixCode* code;
+  BitWriter* output;
+};
+
+template <typename BitDepth>
+struct ChunkSampleCollector {
+  // Counts the two symbols that signal a run: raw symbol 0 plus the LZ77 token
+  // of the (offset) run length. Nothing is counted for an empty run.
+  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts) {
+    if (count == 0) return;
+    ++raw_counts[0];
+    unsigned token, nbits, bits;
+    EncodeHybridUintLZ77(count - (kLZ77MinLength + 1), &token, &nbits, &bits);
+    ++lz77_counts[token];
+  }
+
+  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
+                         size_t skip, size_t n) {
+    // Run is broken. Count the run, then the tokens of residuals[skip, n).
+    Rle(run, lz77_counts);
+    for (size_t pos = skip; pos < n; ++pos) {
+      unsigned tok, extra_len, extra;
+      EncodeHybridUint000(residuals[pos], &tok, &extra_len, &extra);
+      ++raw_counts[tok];
+    }
+  }
+
+  // don't count final run since we don't know how long it really is
+  void Finalize(size_t run) {}
+  uint64_t* raw_counts;
+  uint64_t* lz77_counts;
+};
+
+constexpr uint32_t PackSigned(int32_t value) {  // v >= 0 -> 2v, v < 0 -> -2v - 1
+  return value < 0 ? ~(static_cast<uint32_t>(value) << 1)
+                   : static_cast<uint32_t>(value) << 1;
+}
+
+template <typename T, typename BitDepth>
+struct ChannelRowProcessor {
+  using upixel_t = typename BitDepth::upixel_t;
+  using pixel_t = typename BitDepth::pixel_t;
+  T* t;
+  // Computes the residuals of one chunk (only the first `n` are significant) and
+  // either extends the pending zero run or forwards run + residuals to `t`.
+  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
+                    const pixel_t* row_top, const pixel_t* row_topleft,
+                    size_t n) {
+    alignas(64) upixel_t residuals[kChunkSize] = {};
+    // Length of the leading stretch of zero residuals.
+    size_t prefix_size = 0;
+#ifdef FJXL_GENERIC_SIMD
+    constexpr size_t kNum =
+        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
+    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
+      const size_t zeros =
+          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
+                                         row_topleft + ix, residuals + ix);
+      // The prefix only grows while all earlier vectors were entirely zero.
+      if (prefix_size == ix) prefix_size += zeros;
+    }
+#else
+    for (size_t ix = 0; ix < kChunkSize; ix++) {
+      const pixel_t left = row_left[ix];
+      const pixel_t top = row_top[ix];
+      const pixel_t topleft = row_topleft[ix];
+      const pixel_t ac = left - topleft;
+      const pixel_t ab = left - top;
+      const pixel_t bc = top - topleft;
+      // Gradient left + top - topleft, added with unsigned wrap-around.
+      const pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
+                                                static_cast<upixel_t>(top));
+      const pixel_t clamp = (ab ^ bc) < 0 ? top : left;
+      const pixel_t pred = (ac ^ bc) < 0 ? grad : clamp;
+      residuals[ix] = PackSigned(row[ix] - pred);
+      if (prefix_size == ix && residuals[ix] == 0) prefix_size++;
+    }
+#endif
+    prefix_size = std::min(n, prefix_size);
+    const bool all_zero = prefix_size == n;
+    if (all_zero && (run > 0 || prefix_size > kLZ77MinLength)) {
+      // Run continues (or starts); nothing to emit yet.
+      run += prefix_size;
+      return;
+    }
+    if (prefix_size + run > kLZ77MinLength) {
+      // Run is broken. Encode the run and encode the individual vector.
+      t->Chunk(run + prefix_size, residuals, prefix_size, n);
+      run = 0;
+    } else {
+      // There was no run to begin with.
+      t->Chunk(0, residuals, 0, n);
+    }
+  }
+
+  // Feeds a row of `xs` pixels to ProcessChunk, kChunkSize pixels at a time.
+  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
+                  const pixel_t* row_top, const pixel_t* row_topleft,
+                  size_t xs) {
+    for (size_t x = 0; x < xs; x += kChunkSize) {
+      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
+                   std::min(kChunkSize, xs - x));
+    }
+  }
+
+  void Finalize() { t->Finalize(run); }
+  // Invariant: run == 0 or run > kLZ77MinLength.
+  size_t run = 0;
+};
+
+// 16-bit helpers: little-endian load from two bytes, and byte swap.
+uint16_t LoadLE16(const unsigned char* ptr) {
+  return static_cast<uint16_t>((ptr[1] << 8) | ptr[0]);
+}
+uint16_t SwapEndian(uint16_t in) { return ((in & 0xFF) << 8) | (in >> 8); }
+
+#ifdef FJXL_GENERIC_SIMD
+// Stores a SIMDVec16 of pixels: as-is for 16-bit rows, Upcast() for 32-bit rows.
+void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); }
+void StorePixels(SIMDVec16 p, int32_t* dest) {
+  VecPair<SIMDVec32> wide = p.Upcast();
+  wide.low.Store(reinterpret_cast<uint32_t*>(dest));
+  wide.hi.Store(reinterpret_cast<uint32_t*>(dest) + SIMDVec32::kLanes);
+}
+#endif
+
+// Copies `oxs` 8-bit gray samples from `rgba` into `luma`: whole SIMDVec16
+// vectors first (SIMD builds only), then the remaining pixels one at a time.
+template <typename pixel_t>
+void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    StorePixels(SIMDVec16::LoadG8(rgba + x)[0], luma + x);
+  }
+#endif
+  // Scalar tail; this is the whole row when SIMD is unavailable.
+  for (; x < oxs; x++) luma[x] = rgba[x];
+}
+
+// Reads `oxs` 16-bit gray samples into `luma`. Bytes are loaded as
+// little-endian and swapped afterwards when `big_endian` is set.
+template <bool big_endian, typename pixel_t>
+void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto gray = SIMDVec16::LoadG16(rgba + 2 * x);
+    if (big_endian) {
+      gray[0].SwapEndian();
+    }
+    StorePixels(gray[0], luma + x);
+  }
+#endif
+  // Scalar tail; this is the whole row when SIMD is unavailable.
+  for (; x < oxs; x++) {
+    const uint16_t sample = LoadLE16(rgba + 2 * x);
+    luma[x] = big_endian ? SwapEndian(sample) : sample;
+  }
+}
+
+template <typename pixel_t>  // De-interleaves 8-bit gray + alpha samples.
+void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
+                pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto ga = SIMDVec16::LoadGA8(rgba + 2 * x);
+    StorePixels(ga[0], luma + x);
+    StorePixels(ga[1], alpha + x);
+  }
+#endif
+  for (const unsigned char* px = rgba + 2 * x; x < oxs; x++, px += 2) {
+    luma[x] = px[0];
+    alpha[x] = px[1];
+  }
+}
+
+// De-interleaves `oxs` 16-bit gray + alpha samples into `luma` / `alpha`,
+// byte-swapping every sample when `big_endian` is set.
+template <bool big_endian, typename pixel_t>
+void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
+                 pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto ga = SIMDVec16::LoadGA16(rgba + 4 * x);
+    if (big_endian) {
+      ga[0].SwapEndian();
+      ga[1].SwapEndian();
+    }
+    StorePixels(ga[0], luma + x);
+    StorePixels(ga[1], alpha + x);
+  }
+#endif
+  // Scalar tail: each pixel is 4 bytes, gray first, then alpha.
+  for (; x < oxs; x++) {
+    const unsigned char* px = rgba + 4 * x;
+    const uint16_t l = LoadLE16(px);
+    const uint16_t a = LoadLE16(px + 2);
+    luma[x] = big_endian ? SwapEndian(l) : l;
+    alpha[x] = big_endian ? SwapEndian(a) : a;
+  }
+}
+
+template <typename pixel_t>  // co = r - b; t = b + (co >> 1); cg = g - t.
+void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
+                pixel_t* cg) {
+  const pixel_t orange = *co = r - b;
+  const pixel_t base = b + (orange >> 1);
+  const pixel_t green = *cg = g - base;
+  *y = base + (green >> 1);  // y = t + (cg >> 1)
+}
+
+#ifdef FJXL_GENERIC_SIMD
+// SIMD YCoCg for 16-bit rows; same steps as the scalar StoreYCoCg template.
+void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
+                int16_t* cg) {
+  SIMDVec16 orange = r.Sub(b);
+  SIMDVec16 base = b.Add(orange.SignedShiftRight<1>());
+  SIMDVec16 green = g.Sub(base);
+  base.Add(green.SignedShiftRight<1>()).Store((uint16_t*)y);
+  orange.Store((uint16_t*)co);
+  green.Store((uint16_t*)cg);
+}
+
+// SIMD YCoCg for 32-bit rows: the inputs are widened with Upcast() and each
+// half (low first, then high) goes through the steps of the scalar version.
+void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
+                int32_t* cg) {
+  VecPair<SIMDVec32> r_up = r.Upcast();
+  VecPair<SIMDVec32> g_up = g.Upcast();
+  VecPair<SIMDVec32> b_up = b.Upcast();
+  // Converts one half and stores it `offset` lanes into the output rows.
+  auto convert_half = [=](SIMDVec32 rv, SIMDVec32 gv, SIMDVec32 bv,
+                          size_t offset) {
+    SIMDVec32 orange = rv.Sub(bv);
+    SIMDVec32 base = bv.Add(orange.SignedShiftRight<1>());
+    SIMDVec32 green = gv.Sub(base);
+    base.Add(green.SignedShiftRight<1>()).Store((uint32_t*)y + offset);
+    orange.Store((uint32_t*)co + offset);
+    green.Store((uint32_t*)cg + offset);
+  };
+  convert_half(r_up.low, g_up.low, b_up.low, 0);
+  convert_half(r_up.hi, g_up.hi, b_up.hi, SIMDVec32::kLanes);
+}
+#endif
+
+// Converts `oxs` interleaved 8-bit RGB pixels to YCoCg planes via StoreYCoCg:
+// SIMD vectors first when available, then the remaining pixels one by one.
+template <typename pixel_t>
+void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
+                 pixel_t* cg) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto vec = SIMDVec16::LoadRGB8(rgba + 3 * x);
+    StoreYCoCg(vec[0], vec[1], vec[2], y + x, co + x, cg + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    const unsigned char* px = rgba + 3 * x;
+    StoreYCoCg<pixel_t>(px[0], px[1], px[2], y + x, co + x, cg + x);
+  }
+}
+
+// Converts `oxs` interleaved 16-bit RGB pixels to YCoCg planes; every sample
+// is byte-swapped first when `big_endian` is set.
+template <bool big_endian, typename pixel_t>
+void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                  pixel_t* co, pixel_t* cg) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto vec = SIMDVec16::LoadRGB16(rgba + 6 * x);
+    if (big_endian) {
+      vec[0].SwapEndian();
+      vec[1].SwapEndian();
+      vec[2].SwapEndian();
+    }
+    StoreYCoCg(vec[0], vec[1], vec[2], y + x, co + x, cg + x);
+  }
+#endif
+  const auto sample = [](const unsigned char* p) -> uint16_t {
+    const uint16_t le = LoadLE16(p);
+    return big_endian ? SwapEndian(le) : le;
+  };
+  for (; x < oxs; x++) {
+    const unsigned char* px = rgba + 6 * x;
+    StoreYCoCg<pixel_t>(sample(px), sample(px + 2), sample(px + 4), y + x,
+                        co + x, cg + x);
+  }
+}
+
+// Converts `oxs` interleaved 8-bit RGBA pixels: RGB goes through StoreYCoCg
+// into `y` / `co` / `cg`, alpha is copied unchanged into `alpha`.
+template <typename pixel_t>
+void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto vec = SIMDVec16::LoadRGBA8(rgba + 4 * x);
+    StoreYCoCg(vec[0], vec[1], vec[2], y + x, co + x, cg + x);
+    StorePixels(vec[3], alpha + x);
+  }
+#endif
+  // Scalar tail; this is the whole row when SIMD is unavailable.
+  for (; x < oxs; x++) {
+    const unsigned char* px = rgba + 4 * x;
+    StoreYCoCg<pixel_t>(px[0], px[1], px[2], y + x, co + x, cg + x);
+    alpha[x] = px[3];
+  }
+}
+
+// Converts `oxs` interleaved 16-bit RGBA pixels: RGB goes through StoreYCoCg
+// into `y` / `co` / `cg`, alpha is copied into `alpha`. Every sample is
+// byte-swapped first when `big_endian` is set.
+template <bool big_endian, typename pixel_t>
+void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto vec = SIMDVec16::LoadRGBA16(rgba + 8 * x);
+    if (big_endian) {
+      vec[0].SwapEndian();
+      vec[1].SwapEndian();
+      vec[2].SwapEndian();
+      vec[3].SwapEndian();
+    }
+    StoreYCoCg(vec[0], vec[1], vec[2], y + x, co + x, cg + x);
+    StorePixels(vec[3], alpha + x);
+  }
+#endif
+  // Loads one sample in the byte order selected by `big_endian`.
+  const auto sample = [](const unsigned char* p) -> uint16_t {
+    const uint16_t le = LoadLE16(p);
+    return big_endian ? SwapEndian(le) : le;
+  };
+  for (; x < oxs; x++) {
+    const unsigned char* px = rgba + 8 * x;
+    StoreYCoCg<pixel_t>(sample(px), sample(px + 2), sample(px + 4), y + x,
+                        co + x, cg + x);
+    alpha[x] = sample(px + 6);
+  }
+}
+
+// Fills planar channel rows (YCoCg for 3 or 4 channels) for the given image
+// area, one row at a time, and runs processors[c] on each row from `yskip` on.
+template <typename Processor, typename BitDepth>
+void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
+                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
+                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
+                      Processor* processors) {
+  constexpr size_t kPadding = 32;
+
+  using pixel_t = typename BitDepth::pixel_t;
+
+  constexpr size_t kAlign = 64;
+  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
+
+  // Rounds `ptr` up to the next multiple of kAlign bytes.
+  auto align = [=](pixel_t* ptr) {
+    size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign;
+    if (offset) ptr += (kAlign - offset) / sizeof(pixel_t);
+    return ptr;
+  };
+
+  constexpr size_t kNumPx =
+      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
+      kAlignPixels;
+  // Two rows (current / previous, selected by row parity) per channel.
+  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
+
+  for (size_t y = 0; y < ys; y++) {
+    const auto rgba_row =
+        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
+    pixel_t* crow[4] = {};
+    pixel_t* prow[4] = {};
+    for (size_t i = 0; i < nb_chans; i++) {
+      crow[i] = align(&group_data[i][y & 1][kPadding]);
+      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
+    }
+
+    // Pre-fill rows with YCoCg converted pixels.
+    if (nb_chans == 1) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowG8(rgba_row, xs, crow[0]);
+      } else if (big_endian) {
+        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
+      } else {
+        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
+      }
+    } else if (nb_chans == 2) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
+      } else if (big_endian) {
+        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
+      } else {
+        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
+      }
+    } else if (nb_chans == 3) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
+      } else if (big_endian) {
+        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
+                                          crow[2]);
+      } else {
+        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
+                                           crow[2]);
+      }
+    } else {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
+      } else if (big_endian) {
+        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
+                                           crow[2], crow[3]);
+      } else {
+        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
+                                            crow[2], crow[3]);
+      }
+    }
+    // Deal with x == 0.
+    for (size_t c = 0; c < nb_chans; c++) {
+      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
+      // Fix topleft.
+      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
+    }
+    if (y < yskip) continue;
+    for (size_t c = 0; c < nb_chans; c++) {
+      // Get pointers to px/left/top/topleft data to speedup loop.
+      const pixel_t* row = crow[c];
+      const pixel_t* row_left = crow[c] - 1;
+      const pixel_t* row_top = y == 0 ? row_left : prow[c];
+      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
+      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
+    }
+  }
+  for (size_t c = 0; c < nb_chans; c++) {
+    processors[c].Finalize();
+  }
+}
+
+template <typename BitDepth>  // Encodes one xs-by-ys area at (x0, y0).
+void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
+                    size_t ys, size_t row_stride, bool is_single_group,
+                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
+                    const PrefixCode code[4],
+                    std::array<BitWriter, 4>& output) {
+  for (size_t i = 0; i < nb_chans; i++) {
+    if (is_single_group && i == 0) continue;  // output[0] is not allocated here
+    output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
+  }
+  if (!is_single_group) {
+    // Group header for modular image.
+    // When the image is single-group, the global modular image is the one
+    // that contains the pixel data, and there is no group header.
+    output[0].Write(1, 1);     // Global tree
+    output[0].Write(1, 1);     // All default wp
+    output[0].Write(2, 0b00);  // 0 transforms
+  }
+
+  ChunkEncoder<BitDepth> encoders[4];  // only the first nb_chans are wired up
+  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
+  for (size_t c = 0; c < nb_chans; c++) {
+    row_encoders[c].t = &encoders[c];
+    encoders[c].output = &output[c];  // channel c writes into its own BitWriter
+    encoders[c].code = &code[c];      // ... using its own prefix code
+  }
+  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
+      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
+      row_encoders);  // the literal 0 is presumably yskip (no rows skipped)
+}
+
+constexpr int kHashExp = 16;
+constexpr uint32_t kHashSize = 1 << kHashExp;
+constexpr uint32_t kHashMultiplier = 2654435761;
+constexpr int kMaxColors = 512;
+
+// Any function with values in 0 .. kHashSize-1 works here, provided that it
+// maps 0 to 0. This one keeps the top kHashExp bits of a 32-bit product.
+inline uint32_t pixel_hash(uint32_t p) {
+  constexpr int kDroppedBits = 32 - kHashExp; return (kHashMultiplier * p) >> kDroppedBits;
+}
+
+template <size_t nb_chans>  // Maps xs packed pixels to palette indices.
+void FillRowPalette(const unsigned char* inrow, size_t xs,
+                    const int16_t* lookup, int16_t* out) {
+  for (const int16_t* end = out + xs; out != end; ++out, inrow += nb_chans) {
+    uint32_t packed = 0;  // bytes beyond nb_chans stay zero for the hash
+    memcpy(&packed, inrow, nb_chans);
+    *out = lookup[pixel_hash(packed)];
+  }
+}
+
+template <typename Processor>  // Row driver for palette (index) images.
+void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
+                             size_t xs, size_t yskip, size_t ys,  // NOTE(review): yskip is never read here
+                             size_t row_stride, const int16_t* lookup,
+                             size_t nb_chans, Processor* processors) {
+  constexpr size_t kPadding = 32;  // spare entries on each side of a row
+
+  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
+  Processor& row_encoder = processors[0];  // only the first processor is used
+
+  for (size_t y = 0; y < ys; y++) {
+    // Pre-fill rows with palette converted pixels.
+    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
+    int16_t* outrow = &group_data[y & 1][kPadding];  // rows alternate buffers
+    if (nb_chans == 1) {
+      FillRowPalette<1>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 2) {
+      FillRowPalette<2>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 3) {
+      FillRowPalette<3>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 4) {
+      FillRowPalette<4>(inrow, xs, lookup, outrow);
+    }
+    // Deal with x == 0: its left neighbour is the pixel above (0 on row 0).
+    group_data[y & 1][kPadding - 1] =
+        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
+    // Fix topleft: the other buffer's left slot gets that row's first pixel.
+    group_data[(y - 1) & 1][kPadding - 1] =  // y == 0: (y - 1) & 1 wraps to 1
+        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
+    // Get pointers to px/left/top/topleft data to speedup loop.
+    const int16_t* row = &group_data[y & 1][kPadding];
+    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
+    const int16_t* row_top =  // on the first row, top aliases left
+        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
+    const int16_t* row_topleft =  // ... and so does topleft
+        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
+
+    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
+  }
+  row_encoder.Finalize();  // called once, after the last row
+}
+
+void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
+                           size_t xs, size_t ys, size_t row_stride,
+                           bool is_single_group, const PrefixCode code[4],
+                           const int16_t* lookup, size_t nb_chans,
+                           BitWriter& output) {  // palette variant: one writer
+  if (!is_single_group) {
+    output.Allocate(16 * xs * ys + 4);  // not allocated in single-group mode
+    // Group header for modular image.
+    // When the image is single-group, the global modular image is the one
+    // that contains the pixel data, and there is no group header.
+    output.Write(1, 1);     // Global tree
+    output.Write(1, 1);     // All default wp
+    output.Write(2, 0b00);  // 0 transforms
+  }
+
+  ChunkEncoder<UpTo8Bits> encoder;  // palette indices always use UpTo8Bits
+  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
+
+  row_encoder.t = &encoder;
+  encoder.output = &output;
+  encoder.code = &code[is_single_group ? 1 : 0];  // code[1] when single-group
+  ProcessImageAreaPalette<
+      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
+      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
+}
+
+template <typename BitDepth>  // Runs sample collectors over rows of one area.
+void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
+                    size_t row_stride, size_t row_count,
+                    uint64_t raw_counts[4][kNumRawSymbols],
+                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
+                    bool palette, BitDepth bitdepth, size_t nb_chans,
+                    bool big_endian, const int16_t* lookup) {
+  if (palette) {  // palette mode always uses the UpTo8Bits collectors
+    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
+    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
+        row_sample_collectors[4];
+    for (size_t c = 0; c < nb_chans; c++) {
+      row_sample_collectors[c].t = &sample_collectors[c];
+      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
+      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
+    }  // every c shares one slot: [1] when single-group, otherwise [0]
+    ProcessImageAreaPalette<
+        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
+        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
+        row_sample_collectors);  // passes yskip = 1 and ys = 1 + row_count
+  } else {
+    ChunkSampleCollector<BitDepth> sample_collectors[4];
+    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
+        row_sample_collectors[4];
+    for (size_t c = 0; c < nb_chans; c++) {
+      row_sample_collectors[c].t = &sample_collectors[c];
+      sample_collectors[c].raw_counts = raw_counts[c];  // per-channel slots
+      sample_collectors[c].lz77_counts = lz77_counts[c];
+    }
+    ProcessImageArea<
+        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
+        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
+        big_endian, row_sample_collectors);  // same 1 / 1 + row_count as above
+  }
+}
+
+void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
+                            const PrefixCode code[4],
+                            const std::vector<uint32_t>& palette,
+                            size_t pcolors, BitWriter* output) {
+  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
+  output->Write(2, 0b01);     // 1 transform
+  output->Write(2, 0b01);     // Palette
+  output->Write(5, 0b00000);  // Starting from ch 0
+  output->Write(2, 0b10);     // 4-channel palette (RGBA)
+  // pcolors <= kMaxColors + kChunkSize - 1
+  static_assert(kMaxColors + kChunkSize < 1281,
+                "add code to signal larger palette sizes");
+  if (pcolors < 256) {  // palette size: 2-bit selector, then 8 or 10 bits
+    output->Write(2, 0b00);
+    output->Write(8, pcolors);
+  } else {
+    output->Write(2, 0b01);
+    output->Write(10, pcolors - 256);
+  }
+
+  output->Write(2, 0b00);  // nb_deltas == 0
+  output->Write(4, 0);     // Zero predictor for delta palette
+  // Encode palette
+  ChunkEncoder<UpTo8Bits> encoder;
+  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
+  row_encoder.t = &encoder;
+  encoder.output = output;
+  encoder.code = &code[0];
+  int16_t p[4][32 + 1024] = {};  // one zero-initialised row per color byte
+  uint8_t prgba[4];
+  size_t i = 0;
+  size_t have_zero = 0;
+  if (palette[pcolors - 1] == 0) have_zero = 1;  // last used entry is 0
+  for (; i < pcolors; i++) {
+    memcpy(prgba, &palette[i], 4);  // split the packed 32-bit entry into bytes
+    p[0][16 + i + have_zero] = prgba[0];  // have_zero shifts entries up by one
+    p[1][16 + i + have_zero] = prgba[1];
+    p[2][16 + i + have_zero] = prgba[2];
+    p[3][16 + i + have_zero] = prgba[3];
+  }
+  p[0][15] = 0;  // row 0 has nothing above: left doubles as top and topleft
+  row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors);
+  p[1][15] = p[0][16];  // left of row 1's first sample = sample above it
+  p[0][15] = p[0][16];  // topleft for row 1
+  row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors);
+  p[2][15] = p[1][16];  // same bookkeeping for rows 2 and 3
+  p[1][15] = p[1][16];
+  row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors);
+  p[3][15] = p[2][16];
+  p[2][15] = p[2][16];
+  row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors);
+  row_encoder.Finalize();
+
+  if (!is_single_group) {
+    output->ZeroPadToByte();  // skipped when the image is a single group
+  }
+}
+
+template <typename BitDepth>  // Builds the frame state for one whole image.
+JxlFastLosslessFrameState* LLEnc(const unsigned char* rgba, size_t width,
+                                 size_t stride, size_t height,
+                                 BitDepth bitdepth, size_t nb_chans,
+                                 bool big_endian, int effort,
+                                 void* runner_opaque,
+                                 FJxlParallelRunner runner) {
+  assert(width != 0);
+  assert(height != 0);
+  assert(stride >= nb_chans * BitDepth::kInputBytes * width);
+
+  // Count colors to try palette
+  std::vector<uint32_t> palette(kHashSize);
+  palette[0] = 1;  // sentinel: slot 0 counts as empty while it holds 1
+  std::vector<int16_t> lookup(kHashSize);
+  lookup[0] = 0;
+  int pcolors = 0;
+  bool collided = effort < 2 || bitdepth.bitdepth != 8 ||
+                  nb_chans < 4;  // todo: also do rgb palette
+  for (size_t y = 0; y < height && !collided; y++) {
+    const unsigned char* r = rgba + stride * y;
+    size_t x = 0;
+    if (nb_chans == 4) {
+      // this is just an unrolling of the next loop
+      for (; x + 7 < width; x += 8) {
+        uint32_t p[8], index[8];
+        memcpy(p, r + x * 4, 32);
+        for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
+        for (int i = 0; i < 8; i++) {
+          uint32_t init_entry = index[i] ? 0 : 1;
+          if (init_entry != palette[index[i]] && p[i] != palette[index[i]]) {
+            collided = true;
+          }
+        }
+        for (int i = 0; i < 8; i++) palette[index[i]] = p[i];
+      }
+      for (; x < width; x++) {
+        uint32_t p;
+        memcpy(&p, r + x * 4, 4);
+        uint32_t index = pixel_hash(p);
+        uint32_t init_entry = index ? 0 : 1;  // the "empty" value of this slot
+        if (init_entry != palette[index] && p != palette[index]) {
+          collided = true;  // slot already holds a different color
+        }
+        palette[index] = p;
+      }
+    } else {
+      for (; x < width; x++) {
+        uint32_t p = 0;
+        memcpy(&p, r + x * nb_chans, nb_chans);
+        uint32_t index = pixel_hash(p);
+        uint32_t init_entry = index ? 0 : 1;
+        if (init_entry != palette[index] && p != palette[index]) {
+          collided = true;
+        }
+        palette[index] = p;
+      }
+    }
+  }
+
+  int nb_entries = 0;
+  if (!collided) {
+    if (palette[0] == 0) pcolors = 1;     // the all-zero pixel was seen
+    if (palette[0] == 1) palette[0] = 0;  // sentinel untouched: slot is empty
+    bool have_color = false;
+    uint8_t minG = 255, maxG = 0;
+    for (uint32_t k = 0; k < kHashSize; k++) {
+      if (palette[k] == 0) continue;
+      uint8_t p[4];
+      memcpy(p, &palette[k], 4);
+      // move entries to front so sort has less work
+      palette[nb_entries] = palette[k];
+      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
+      if (p[1] < minG) minG = p[1];
+      if (p[1] > maxG) maxG = p[1];
+      nb_entries++;
+      // don't do palette if too many colors are needed
+      if (nb_entries + pcolors > kMaxColors) {
+        collided = true;
+        break;
+      }
+    }
+    // don't do palette if it's just grayscale without many holes
+    if (!have_color && maxG - minG < nb_entries * 1.4f) collided = true;
+    // Zero the slot after the last entry; PrepareDCGlobalPalette inspects it.
+    palette[nb_entries] = 0;  // it could still hold a stale, un-moved color
+  }
+  if (!collided) {
+    std::sort(
+        palette.begin(), palette.begin() + nb_entries,
+        [](uint32_t ap, uint32_t bp) {
+          if (ap == 0) return false;
+          if (bp == 0) return true;
+          uint8_t a[4], b[4];
+          memcpy(a, &ap, 4);
+          memcpy(b, &bp, 4);
+          float ay, by;
+          ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
+          by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
+          return ay < by;  // sort on alpha*luma
+        });
+    for (int k = 0; k < nb_entries; k++) {
+      if (palette[k] == 0) break;
+      lookup[pixel_hash(palette[k])] = pcolors++;  // index in sorted order
+    }
+  }
+
+  size_t num_groups_x = (width + 255) / 256;  // groups are 256x256 pixels
+  size_t num_groups_y = (height + 255) / 256;
+  size_t num_dc_groups_x = (width + 2047) / 2048;  // DC groups are 2048x2048
+  size_t num_dc_groups_y = (height + 2047) / 2048;
+
+  uint64_t raw_counts[4][kNumRawSymbols] = {};
+  uint64_t lz77_counts[4][kNumLZ77] = {};
+
+  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
+
+  // sample the middle (effort * 2) rows of every group
+  for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
+    size_t xg = g % num_groups_x;
+    size_t yg = g / num_groups_x;
+    int y_offset = yg * 256;
+    int y_max = std::min<size_t>(height - yg * 256, 256);
+    int y_begin = y_offset + std::max<int>(0, y_max - 2 * effort) / 2;
+    int y_count =  // capped so sampling stays inside this group's rows
+        std::min<int>(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1);
+    int x_max =  // group width rounded down to a multiple of kChunkSize
+        std::min<size_t>(width - xg * 256, 256) / kChunkSize * kChunkSize;
+    CollectSamples(rgba, xg * 256, y_begin, x_max, stride, y_count, raw_counts,
+                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
+                   big_endian, lookup.data());
+  }
+
+  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
+  uint64_t base_raw_counts[kNumRawSymbols] = {
+      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
+      5,    1,   1,    1,    1,    1,   1,   1,   1};
+
+  bool doing_ycocg = nb_chans > 2 && collided;
+  for (size_t i = bitdepth.NumSymbols(doing_ycocg); i < kNumRawSymbols; i++) {
+    base_raw_counts[i] = 0;
+  }
+
+  for (size_t c = 0; c < 4; c++) {
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
+    }  // sampled counts are weighted 256x, then the base counts are added
+  }
+
+  if (!collided) {
+    unsigned token, nbits, bits;
+    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
+    // ensure all palette indices can actually be encoded
+    for (size_t i = 0; i < token + 1; i++)
+      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
+    // these tokens are only used for the palette itself so they can get a bad
+    // code
+    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
+  }
+
+  uint64_t base_lz77_counts[kNumLZ77] = {
+      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
+      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
+  };
+
+  for (size_t c = 0; c < 4; c++) {
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
+    }
+  }
+
+  alignas(64) PrefixCode hcode[4];
+  for (size_t i = 0; i < 4; i++) {
+    hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
+  }
+
+  size_t num_groups = onegroup ? 1
+                               : (2 + num_dc_groups_x * num_dc_groups_y +
+                                  num_groups_x * num_groups_y);
+
+  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
+
+  frame_state->width = width;
+  frame_state->height = height;
+  frame_state->nb_chans = nb_chans;
+  frame_state->bitdepth = bitdepth.bitdepth;
+
+  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
+  if (collided) {  // collided also covers "palette was never attempted"
+    PrepareDCGlobal(onegroup, width, height, nb_chans, hcode,
+                    &frame_state->group_data[0][0]);
+  } else {
+    PrepareDCGlobalPalette(onegroup, width, height, hcode, palette, pcolors,
+                           &frame_state->group_data[0][0]);
+  }
+
+  auto run_one = [&](size_t g) {  // encodes group g into its own group_data
+    size_t xg = g % num_groups_x;
+    size_t yg = g / num_groups_x;
+    size_t group_id =
+        onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g);
+    size_t xs = std::min<size_t>(width - xg * 256, 256);
+    size_t ys = std::min<size_t>(height - yg * 256, 256);
+    size_t x0 = xg * 256;
+    size_t y0 = yg * 256;
+    auto& gd = frame_state->group_data[group_id];
+    if (collided) {
+      WriteACSection(rgba, x0, y0, xs, ys, stride, onegroup, bitdepth, nb_chans,
+                     big_endian, hcode, gd);
+
+    } else {
+      WriteACSectionPalette(rgba, x0, y0, xs, ys, stride, onegroup, hcode,
+                            lookup.data(), nb_chans, gd[0]);
+    }
+  };
+
+  runner(  // the captureless lambda below recovers run_one from the void*
+      runner_opaque, &run_one,
+      +[](void* r, size_t i) { (*reinterpret_cast<decltype(&run_one)>(r))(i); },
+      num_groups_x * num_groups_y);
+
+  return frame_state;  // allocated with new above; the caller takes it over
+}
+
+JxlFastLosslessFrameState* JxlFastLosslessEncodeImpl(
+    const unsigned char* rgba, size_t width, size_t stride, size_t height,
+    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner) {
+  assert(bitdepth > 0);  // picks the LLEnc instantiation matching bitdepth
+  assert(nb_chans <= 4);
+  assert(nb_chans != 0);
+  if (bitdepth <= 8) {  // 1..8
+    return LLEnc(rgba, width, stride, height, UpTo8Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  if (bitdepth <= 13) {  // 9..13
+    return LLEnc(rgba, width, stride, height, From9To13Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  if (bitdepth == 14) {
+    return LLEnc(rgba, width, stride, height, Exactly14Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  return LLEnc(rgba, width, stride, height, MoreThan14Bits(bitdepth), nb_chans,
+               big_endian, effort, runner_opaque, runner);  // 15 and above
+}
+
+}  // namespace
+
+#endif  // FJXL_SELF_INCLUDE
+
+#ifndef FJXL_SELF_INCLUDE
+
+#define FJXL_SELF_INCLUDE
+
+// If we have NEON enabled, it is the default target.
+#if FJXL_ENABLE_NEON
+
+namespace default_implementation {
+#define FJXL_NEON
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_NEON
+}  // namespace default_implementation
+
+#else  // FJXL_ENABLE_NEON
+
+namespace default_implementation {
+#include "lib/jxl/enc_fast_lossless.cc"
+}  // namespace default_implementation
+
+#if FJXL_ENABLE_AVX2
+#ifdef __clang__
+#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
+                             apply_to = function)
+// Causes spurious warnings on clang5.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#elif defined(__GNUC__)
+#pragma GCC push_options
+// Seems to cause spurious errors on GCC8.
+#pragma GCC diagnostic ignored "-Wpsabi"
+#pragma GCC target "avx,avx2"
+#endif
+
+namespace AVX2 {
+#define FJXL_AVX2
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_AVX2
+}  // namespace AVX2
+
+#ifdef __clang__
+#pragma clang attribute pop
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif  // FJXL_ENABLE_AVX2
+
+#if FJXL_ENABLE_AVX512
+#ifdef __clang__
+#pragma clang attribute push(                                                 \
+    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
+    apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
+#endif
+
+namespace AVX512 {
+#define FJXL_AVX512
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_AVX512
+}  // namespace AVX512
+
+#ifdef __clang__
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif  // FJXL_ENABLE_AVX512
+
+#endif
+
+extern "C" {
+
+size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
+                             size_t row_stride, size_t height, size_t nb_chans,
+                             size_t bitdepth, int big_endian, int effort,
+                             unsigned char** output, void* runner_opaque,
+                             FJxlParallelRunner runner) {  // one-shot encode
+  auto frame_state = JxlFastLosslessPrepareFrame(
+      rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
+      runner_opaque, runner);
+  JxlFastLosslessPrepareHeader(frame_state, 1, 1);  // add_image_header, is_last
+  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
+  *output = (unsigned char*)malloc(output_size);  // handed to the caller
+  size_t written = 0, total = 0;
+  while (*output != nullptr &&  // failed malloc: write nothing and return 0
+         (written = JxlFastLosslessWriteOutput(frame_state, *output + total,
+                                               output_size - total)) != 0) {
+    total += written;  // WriteOutput returns 0 once everything is written
+  }
+  JxlFastLosslessFreeFrameState(frame_state);  // was leaked on every call
+  return total;
+}
+
+JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
+    const unsigned char* rgba, size_t width, size_t row_stride, size_t height,
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner) {
+  auto trivial_runner =  // sequential fallback used when runner is null
+      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
+        for (size_t i = 0; i < count; i++) {
+          fun(opaque, i);
+        }
+      };
+
+  if (runner == nullptr) {
+    runner = trivial_runner;
+  }
+
+#if FJXL_ENABLE_AVX512
+  if (__builtin_cpu_supports("avx512cd") &&  // runtime CPU check, AVX512 first
+      __builtin_cpu_supports("avx512vbmi") &&
+      __builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512f") &&
+      __builtin_cpu_supports("avx512vl")) {
+    return AVX512::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height,
+                                             nb_chans, bitdepth, big_endian,
+                                             effort, runner_opaque, runner);
+  }
+#endif
+#if FJXL_ENABLE_AVX2
+  if (__builtin_cpu_supports("avx2")) {  // next choice: AVX2
+    return AVX2::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height,
+                                           nb_chans, bitdepth, big_endian,
+                                           effort, runner_opaque, runner);
+  }
+#endif
+
+  return default_implementation::JxlFastLosslessEncodeImpl(  // fallback
+      rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
+      runner_opaque, runner);
+}
+
+}  // extern "C"
+
+#endif  // FJXL_SELF_INCLUDE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
new file mode 100644
index 0000000000..4ea1d4f69b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
@@ -0,0 +1,72 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FAST_LOSSLESS_H_
+#define LIB_JXL_ENC_FAST_LOSSLESS_H_
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Simple encoding API.
+
+// A FJxlParallelRunner must call fun(opaque, i) for every i with
+// 0 <= i < count (count itself is excluded). It may do so in parallel.
+typedef void(FJxlParallelRunner)(void* runner_opaque, void* opaque,
+                                 void fun(void*, size_t), size_t count);
+
+// You may pass `nullptr` as a runner: encoding will be sequential.
+size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
+                             size_t row_stride, size_t height, size_t nb_chans,
+                             size_t bitdepth, int big_endian, int effort,
+                             unsigned char** output, void* runner_opaque,
+                             FJxlParallelRunner runner);
+
+// More complex API for cases in which you may want to allocate your own buffer
+// and other advanced use cases.
+
+// Opaque struct that represents an intermediate state of the computation.
+struct JxlFastLosslessFrameState;
+
+// Returned JxlFastLosslessFrameState must be freed by calling
+// JxlFastLosslessFreeFrameState.
+JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
+    const unsigned char* rgba, size_t width, size_t row_stride, size_t height,
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner);
+
+// Prepare the (image/frame) header. You may encode animations by concatenating
+// the output of multiple frames, of which the first one has add_image_header =
+// 1 and subsequent ones have add_image_header = 0, and all frames but the last
+// one have is_last = 0.
+void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
+                                  int add_image_header, int is_last);
+
+// Upper bound on the required output size, including any padding that may be
+// required by JxlFastLosslessWriteOutput. Cannot be called before
+// JxlFastLosslessPrepareHeader.
+size_t JxlFastLosslessMaxRequiredOutput(const JxlFastLosslessFrameState* frame);
+
+// Actual size of the frame once it is encoded. This is not identical to
+// JxlFastLosslessMaxRequiredOutput because JxlFastLosslessWriteOutput may
+// require extra padding.
+size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame);
+
+// Writes the frame to the given output buffer. Returns the number of bytes that
+// were written, which is at least 1 unless the entire output has been written
+// already. It is required that `output_size >= 32` when calling this function.
+// This function must be called repeatedly until it returns 0.
+size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
+                                  unsigned char* output, size_t output_size);
+
+// Frees the provided frame state.
+void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // LIB_JXL_ENC_FAST_LOSSLESS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fields.cc b/third_party/jpeg-xl/lib/jxl/enc_fields.cc
new file mode 100644
index 0000000000..22c763e13f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fields.cc
@@ -0,0 +1,239 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_fields.h"
+
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+namespace {
+using ::jxl::fields_internal::VisitorBase;
+class WriteVisitor : public VisitorBase {  // writes visited fields to writer_
+ public:
+  WriteVisitor(const size_t extension_bits, BitWriter* JXL_RESTRICT writer)
+      : extension_bits_(extension_bits), writer_(writer) {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    ok_ &= BitsCoder::Write(bits, *value, writer_);  // failure latches in ok_
+    return true;  // always true here; callers check OK() afterwards
+  }
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    ok_ &= U32Coder::Write(enc, *value, writer_);
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    ok_ &= U64Coder::Write(*value, writer_);
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    ok_ &= F16Coder::Write(*value, writer_);
+    return true;
+  }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    if (*extensions == 0) {
+      JXL_ASSERT(extension_bits_ == 0);  // no extensions => no extension bits
+      return true;
+    }
+    // TODO(janwas): extend API to pass in array of extension_bits, one per
+    // extension. We currently ascribe all bits to the first extension, but
+    // this is only an encoder limitation. NOTE: extension_bits_ can be zero
+    // if an extension does not require any additional fields.
+    ok_ &= U64Coder::Write(extension_bits_, writer_);
+    // For each nonzero bit except the lowest/first (already written):
+    for (uint64_t remaining_extensions = *extensions & (*extensions - 1);
+         remaining_extensions != 0;
+         remaining_extensions &= remaining_extensions - 1) {  // clear lowest set bit
+      ok_ &= U64Coder::Write(0, writer_);  // a size of 0 for that extension
+    }
+    return true;
+  }
+  // EndExtensions = default.
+
+  Status OK() const { return ok_; }  // false once any field write has failed
+
+ private:
+  const size_t extension_bits_;
+  BitWriter* JXL_RESTRICT writer_;
+  bool ok_ = true;
+};
+}  // namespace
+
+Status Bundle::Write(const Fields& fields, BitWriter* writer, size_t layer,
+                     AuxOut* aux_out) {  // serialises one bundle of fields
+  size_t extension_bits, total_bits;
+  JXL_RETURN_IF_ERROR(Bundle::CanEncode(fields, &extension_bits, &total_bits));
+
+  BitWriter::Allotment allotment(writer, total_bits);  // sized by CanEncode
+  WriteVisitor visitor(extension_bits, writer);
+  JXL_RETURN_IF_ERROR(visitor.VisitConst(fields));
+  JXL_RETURN_IF_ERROR(visitor.OK());  // surfaces write failures latched above
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+
+// Writes `value` in exactly `bits` bits; fails if `value` does not fit.
+Status BitsCoder::Write(const size_t bits, const uint32_t value,
+                        BitWriter* JXL_RESTRICT writer) {
+  if (value >= (1ULL << bits)) {  // 64-bit shift, so bits == 32 is fine
+    return JXL_FAILURE("Value %" PRIu32 " too large to encode in %" PRIu64
+                       " bits", value, static_cast<uint64_t>(bits));
+  }
+  writer->Write(bits, value);
+  return true;
+}
+
+// Returns false if ChooseSelector finds no distribution of `enc` for value.
+Status U32Coder::Write(const U32Enc enc, const uint32_t value,
+                       BitWriter* JXL_RESTRICT writer) {
+  uint32_t selector;
+  size_t total_bits;
+  JXL_RETURN_IF_ERROR(ChooseSelector(enc, value, &selector, &total_bits));
+
+  writer->Write(2, selector);  // 2-bit selector comes first
+
+  const U32Distr d = enc.GetDistr(selector);
+  if (!d.IsDirect()) {  // Nothing more to write for direct encoding
+    const uint32_t offset = d.Offset();
+    JXL_ASSERT(value >= offset);
+    writer->Write(total_bits - 2, value - offset);  // total_bits includes the selector
+  }
+
+  return true;
+}
+
+// Always returns true: every path below writes an encoding for `value`.
+Status U64Coder::Write(uint64_t value, BitWriter* JXL_RESTRICT writer) {
+  if (value == 0) {
+    // Selector: use 0 bits, value 0
+    writer->Write(2, 0);
+  } else if (value <= 16) {
+    // Selector: use 4 bits, value 1..16
+    writer->Write(2, 1);
+    writer->Write(4, value - 1);
+  } else if (value <= 272) {
+    // Selector: use 8 bits, value 17..272
+    writer->Write(2, 2);
+    writer->Write(8, value - 17);
+  } else {
+    // Selector: varint, first a 12-bit group, after that per 8-bit group.
+    writer->Write(2, 3);
+    writer->Write(12, value & 4095);
+    value >>= 12;
+    int shift = 12;  // number of value bits written so far
+    while (value > 0 && shift < 60) {
+      // Indicate varint not done
+      writer->Write(1, 1);
+      writer->Write(8, value & 255);
+      value >>= 8;
+      shift += 8;
+    }
+    if (value > 0) {
+      // Only reached with shift == 60: the top 4 of the 64 bits remain.
+      writer->Write(1, 1);
+      writer->Write(4, value & 15);
+      // Implicitly closed sequence, no extra stop bit is required.
+    } else {
+      // Indicate end of varint
+      writer->Write(1, 0);
+    }
+  }
+
+  return true;
+}
+
+Status F16Coder::Write(float value, BitWriter* JXL_RESTRICT writer) {
+  uint32_t bits32;
+  memcpy(&bits32, &value, sizeof(bits32));  // read the float's bit pattern
+  const uint32_t sign = bits32 >> 31;                   // top bit
+  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;  // next 8 bits
+  const uint32_t mantissa32 = bits32 & 0x7FFFFF;        // low 23 bits
+
+  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127;
+  if (JXL_UNLIKELY(exp > 15)) {  // also hit by biased exponent 255 (exp 128)
+    return JXL_FAILURE("Too big to encode, CanEncode should return false");
+  }
+
+  // Tiny or zero => zero.
+  if (exp < -24) {  // biased exponent 0 lands here too; the sign is dropped
+    writer->Write(16, 0);
+    return true;
+  }
+
+  uint32_t biased_exp16, mantissa16;
+
+  // exp = [-24, -15] => subnormal
+  if (JXL_UNLIKELY(exp < -14)) {
+    biased_exp16 = 0;
+    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
+    JXL_ASSERT(1 <= sub_exp && sub_exp < 11);
+    mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));  // explicit leading 1 + shifted mantissa
+  } else {
+    // exp = [-14, 15]
+    biased_exp16 = static_cast<uint32_t>(exp + 15);
+    JXL_ASSERT(1 <= biased_exp16 && biased_exp16 < 31);
+    mantissa16 = mantissa32 >> 13;  // keeps the top 10 bits; low 13 are dropped
+  }
+
+  JXL_ASSERT(mantissa16 < 1024);
+  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
+  JXL_ASSERT(bits16 < 0x10000);
+  writer->Write(16, bits16);
+  return true;
+}
+
+Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer,
+                              AuxOut* aux_out) {
+  // Marker/signature
+  BitWriter::Allotment allotment(writer, 16);  // covers the two 8-bit writes
+  writer->Write(8, 0xFF);
+  writer->Write(8, kCodestreamMarker);
+  allotment.ReclaimAndCharge(writer, kLayerHeader, aux_out);
+
+  JXL_RETURN_IF_ERROR(  // written in order: size, image metadata, transform data
+      WriteSizeHeader(metadata->size, writer, kLayerHeader, aux_out));
+
+  JXL_RETURN_IF_ERROR(
+      WriteImageMetadata(metadata->m, writer, kLayerHeader, aux_out));
+
+  metadata->transform_data.nonserialized_xyb_encoded = metadata->m.xyb_encoded;  // the one metadata field modified here
+  JXL_RETURN_IF_ERROR(
+      Bundle::Write(metadata->transform_data, writer, kLayerHeader, aux_out));
+
+  return true;
+}
+
+Status WriteFrameHeader(const FrameHeader& frame,
+                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) {
+  return Bundle::Write(frame, writer, kLayerHeader, aux_out);  // layer is fixed to kLayerHeader
+}
+
+Status WriteImageMetadata(const ImageMetadata& metadata,
+                          BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* aux_out) {
+  return Bundle::Write(metadata, writer, layer, aux_out);  // plain forwarder
+}
+
+Status WriteQuantizerParams(const QuantizerParams& params,
+                            BitWriter* JXL_RESTRICT writer, size_t layer,
+                            AuxOut* aux_out) {
+  return Bundle::Write(params, writer, layer, aux_out);  // plain forwarder
+}
+
+Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
+                       size_t layer, AuxOut* aux_out) {
+  return Bundle::Write(size, writer, layer, aux_out);  // plain forwarder
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fields.h b/third_party/jpeg-xl/lib/jxl/enc_fields.h
new file mode 100644
index 0000000000..5bb179a719
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fields.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FIELDS_H_
+#define LIB_JXL_ENC_FIELDS_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Writes the codestream signature and headers from the CodecMetadata. May also
+// modify the nonserialized_... fields of the metadata.
+Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer,
+                              AuxOut* aux_out);
+
+Status WriteFrameHeader(const FrameHeader& frame,
+                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out);
+
+Status WriteQuantizerParams(const QuantizerParams& params,
+                            BitWriter* JXL_RESTRICT writer, size_t layer,
+                            AuxOut* aux_out);
+
+Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
+                       size_t layer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FIELDS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_file.cc b/third_party/jpeg-xl/lib/jxl/enc_file.cc
new file mode 100644
index 0000000000..b1f1442cc2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_file.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_file.h"
+
+#include <stddef.h>
+
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+namespace {
+
+// Copies io->metadata into *metadata and adapts it for encoding: image size,
+// whether an ICC profile is wanted, and the xyb_encoded flag.
+Status PrepareCodecMetadataFromIO(const CompressParams& cparams,
+                                  const CodecInOut* io,
+                                  CodecMetadata* metadata) {
+  *metadata = io->metadata;
+
+  // An already-downsampled input is signalled at its upsampled size.
+  const size_t ups = cparams.already_downsampled ? cparams.resampling : 1;
+  JXL_RETURN_IF_ERROR(metadata->size.Set(io->xsize() * ups, io->ysize() * ups));
+
+  // Keep ICC profile in lossless modes because a reconstructed profile may be
+  // slightly different (quantization).
+  // Also keep ICC in JPEG reconstruction mode as we need byte-exact profiles.
+  const bool keep_icc = cparams.IsLossless() || io->Main().IsJPEG();
+  if (!keep_icc) metadata->m.color_encoding.DecideIfWantICC();
+
+  metadata->m.xyb_encoded = cparams.color_transform == ColorTransform::kXYB;
+
+  // TODO(firsching): move this EncodeFile to test_utils / re-implement this
+  // using API functions
+  return true;
+}
+
+}  // namespace
+
+// Encodes `ib` as a preview frame and appends it byte-aligned to `writer`.
+Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib,
+                     const CodecMetadata* metadata, const JxlCmsInterface& cms,
+                     ThreadPool* pool, BitWriter* JXL_RESTRICT writer) {
+  // TODO(janwas): also support generating preview by downsampling
+  if (!ib.HasColor()) return true;
+
+  // TODO(lode): check if we want all extra channels and matching xyb_encoded
+  // for the preview, such that using the main ImageMetadata object for
+  // encoding this frame is warranted.
+  FrameInfo preview_info;
+  preview_info.is_preview = true;
+  BitWriter preview_writer;
+  AuxOut preview_aux_out;
+  PassesEncoderState preview_enc_state;
+  JXL_RETURN_IF_ERROR(EncodeFrame(cparams, preview_info, metadata, ib,
+                                  &preview_enc_state, cms, pool,
+                                  &preview_writer, &preview_aux_out));
+  preview_writer.ZeroPadToByte();
+
+  // A preview that produced no bits leaves `writer` untouched.
+  if (preview_writer.BitsWritten() == 0) return true;
+  writer->ZeroPadToByte();
+  writer->AppendByteAligned(preview_writer);
+  return true;
+}
+
+// Encodes the preview (if any) and all frames of `io` into `*compressed`.
+Status EncodeFile(const CompressParams& params, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
+                  const JxlCmsInterface& cms, AuxOut* aux_out,
+                  ThreadPool* pool) {
+  io->CheckMetadata();
+  BitWriter writer;
+
+  // An input that is already YCbCr or XYB dictates the color transform.
+  CompressParams cparams = params;
+  const auto input_transform = io->Main().color_transform;
+  if (input_transform != ColorTransform::kNone) {
+    cparams.color_transform = input_transform;
+  }
+  JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
+
+  auto metadata = jxl::make_unique<CodecMetadata>();
+  CodecMetadata* const meta = metadata.get();
+  JXL_RETURN_IF_ERROR(PrepareCodecMetadataFromIO(cparams, io, meta));
+  JXL_RETURN_IF_ERROR(WriteCodestreamHeaders(meta, &writer, aux_out));
+
+  // Only send ICC (at least several hundred bytes) if fields aren't enough.
+  if (meta->m.color_encoding.WantICC()) {
+    JXL_RETURN_IF_ERROR(WriteICC(meta->m.color_encoding.ICC(), &writer,
+                                 kLayerHeader, aux_out));
+  }
+  if (meta->m.have_preview) {
+    JXL_RETURN_IF_ERROR(
+        EncodePreview(cparams, io->preview_frame, meta, cms, pool, &writer));
+  }
+
+  // Each frame should start on byte boundaries.
+  BitWriter::Allotment allotment(&writer, 8);
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, aux_out);
+
+  const size_t num_frames = io->frames.size();
+  for (size_t i = 0; i < num_frames; i++) {
+    const auto& frame = io->frames[i];
+    FrameInfo info;
+    info.is_last = (i + 1 == num_frames);
+    if (frame.use_for_next_frame) info.save_as_reference = 1;
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, meta, frame,
+                                    passes_enc_state, cms, pool, &writer,
+                                    aux_out));
+  }
+
+  // Clean up passes_enc_state in case it gets reused.
+  for (size_t i = 0; i < 4; i++) {
+    passes_enc_state->shared.dc_frames[i] = Image3F();
+    passes_enc_state->shared.reference_frames[i].frame = ImageBundle();
+  }
+  *compressed = std::move(writer).TakeBytes();
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_file.h b/third_party/jpeg-xl/lib/jxl/enc_file.h
new file mode 100644
index 0000000000..ff3ad1233d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_file.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FILE_H_
+#define LIB_JXL_ENC_FILE_H_
+
+// Facade for JXL encoding.
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+class CodecInOut;
+
+// Compresses pixels from `io` (given in any ColorEncoding).
+// `io->metadata.m.original` must be set.
+Status EncodeFile(const CompressParams& params, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
+                  const JxlCmsInterface& cms, AuxOut* aux_out = nullptr,
+                  ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FILE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_frame.cc b/third_party/jpeg-xl/lib/jxl/enc_frame.cc
new file mode 100644
index 0000000000..ed4088120e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.cc
@@ -0,0 +1,1745 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_frame.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_chroma_from_luma.h"
+#include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/enc_context_map.h"
+#include "lib/jxl/enc_entropy_coder.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
+#include "lib/jxl/enc_group.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_noise.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_photon_noise.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/enc_toc.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/splines.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+namespace {
+
+// Three passes whose num_coefficients grow 2 -> 3 -> 8, all with shift 0.
+// Entry layout:
+//   {num_coefficients, shift, suitable_for_downsampling_of_at_least}
+PassDefinition progressive_passes_dc_vlf_lf_full_ac[] = {
+    {2, 0, 4},
+    {3, 0, 2},
+    {8, 0, 0},
+};
+
+// Two passes, both with num_coefficients 8: first shift 1, then shift 0.
+// Entry: {num_coefficients, shift, suitable_for_downsampling_of_at_least}
+PassDefinition progressive_passes_dc_quant_ac_full_ac[] = {
+    {8, 1, 2},
+    {8, 0, 0},
+};
+
+// Clusters the pass-0 AC token streams of the groups into at most
+// ceil(sqrt(#groups)) histogram sets: fills enc_state->histogram_idx, sets
+// shared.num_histograms and offsets each token context accordingly.
+void ClusterGroups(PassesEncoderState* enc_state) {
+  if (enc_state->shared.frame_header.passes.num_passes > 1) {
+    // TODO(veluca): implement this for progressive modes.
+    return;
+  }
+  // This only considers pass 0 for now.
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  auto& ac = enc_state->passes[0].ac_tokens;
+  size_t limit = std::ceil(std::sqrt(ac.size()));
+  // Nothing to cluster; `<=` also guards `dists[farthest]` when `ac` is empty.
+  if (limit <= 1) return;
+  size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts();
+  std::vector<float> costs(ac.size());
+  HistogramParams params;
+  params.uint_method = HistogramParams::HybridUintMethod::kNone;
+  params.lz77_method = HistogramParams::LZ77Method::kNone;
+  params.ans_histogram_strategy =
+      HistogramParams::ANSHistogramStrategy::kApproximate;
+  size_t max = 0;
+  // Bit cost of `tokens` with `num_ctx` contexts; estimated by default.
+  auto token_cost = [&](std::vector<std::vector<Token>>& tokens, size_t num_ctx,
+                        bool estimate = true) {
+    // TODO(veluca): not estimating is very expensive.
+    BitWriter writer;
+    size_t c = BuildAndEncodeHistograms(
+        params, num_ctx, tokens, &codes, &context_map,
+        estimate ? nullptr : &writer, 0, /*aux_out=*/nullptr);
+    if (estimate) return c;
+    for (size_t i = 0; i < tokens.size(); i++) {
+      WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr);
+    }
+    return writer.BitsWritten();
+  };
+  // Stand-alone cost of every group; `max` tracks the most expensive one.
+  for (size_t i = 0; i < ac.size(); i++) {
+    std::vector<std::vector<Token>> tokens{ac[i]};
+    costs[i] = token_cost(tokens, num_contexts);
+    if (costs[i] > costs[max]) max = i;
+  }
+  // Extra cost of coding groups i and j with shared histograms.
+  auto dist = [&](size_t i, size_t j) {
+    std::vector<std::vector<Token>> tokens{ac[i], ac[j]};
+    return token_cost(tokens, num_contexts) - costs[i] - costs[j];
+  };
+  // `out` lists the chosen cluster centers; dists[i] = cost to nearest one.
+  std::vector<size_t> out{max};
+  std::vector<float> dists(ac.size());
+  size_t farthest = 0;
+  for (size_t i = 0; i < ac.size(); i++) {
+    if (i == max) continue;
+    dists[i] = dist(max, i);
+    if (dists[i] > dists[farthest]) farthest = i;
+  }
+
+  // Repeatedly promote the farthest group to a new center and reassign the
+  // groups that are closer to it than to their current center.
+  while (dists[farthest] > 0 && out.size() < limit) {
+    out.push_back(farthest);
+    dists[farthest] = 0;
+    enc_state->histogram_idx[farthest] = out.size() - 1;
+    for (size_t i = 0; i < ac.size(); i++) {
+      float d = dist(out.back(), i);
+      if (d < dists[i]) {
+        dists[i] = d;
+        enc_state->histogram_idx[i] = out.size() - 1;
+      }
+      if (dists[i] > dists[farthest]) farthest = i;
+    }
+  }
+
+  std::vector<size_t> remap(out.size());
+  std::iota(remap.begin(), remap.end(), 0);
+  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
+    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
+  }
+  // Renumbers the entries of `mapping` to 0, 1, ... in first-appearance order.
+  auto compact = [](std::vector<size_t>& mapping) {
+    std::vector<size_t> renumbered(mapping.size(), mapping.size());
+    size_t next = 0;
+    for (size_t& entry : mapping) {
+      if (renumbered[entry] == mapping.size()) renumbered[entry] = next++;
+      entry = renumbered[entry];
+    }
+  };
+  // Exact (non-estimated) cost of coding all groups under a candidate remap.
+  auto remap_cost = [&](std::vector<size_t> remap) {
+    compact(remap);
+    auto tokens = ac;
+    size_t max_hist = 0;
+    for (size_t i = 0; i < tokens.size(); i++) {
+      for (size_t j = 0; j < tokens[i].size(); j++) {
+        size_t hist = remap[enc_state->histogram_idx[i]];
+        tokens[i][j].context += hist * num_contexts;
+        max_hist = std::max(hist + 1, max_hist);
+      }
+    }
+    return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false);
+  };
+
+  // Greedily merge clusters whenever doing so lowers the exact total cost.
+  for (size_t src = 0; src < out.size(); src++) {
+    float cost = remap_cost(remap);
+    size_t best = src;
+    for (size_t j = src + 1; j < out.size(); j++) {
+      if (remap[src] == remap[j]) continue;
+      auto remap_c = remap;
+      std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]);
+      float c = remap_cost(remap_c);
+      if (c < cost) {
+        best = j;
+        cost = c;
+      }
+    }
+    if (src != best) {
+      std::replace(remap.begin(), remap.end(), remap[src], remap[best]);
+    }
+  }
+  compact(remap);
+
+  enc_state->shared.num_histograms =
+      *std::max_element(remap.begin(), remap.end()) + 1;
+  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
+    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
+  }
+  for (size_t i = 0; i < ac.size(); i++) {
+    for (size_t j = 0; j < ac[i].size(); j++) {
+      ac[i][j].context += enc_state->histogram_idx[i] * num_contexts;
+    }
+  }
+}
+
+// Returns the FrameHeader flags (kNoise, kUseDcFrame) implied by `cparams`.
+uint64_t FrameFlagsFromParams(const CompressParams& cparams) {
+  const float distance = cparams.butteraugli_distance;
+
+  // We don't add noise at low butteraugli distances because the original
+  // noise is stored within the compressed image and adding noise makes things
+  // worse.
+  const bool noise_from_distance =
+      ApplyOverride(cparams.noise, distance >= kMinButteraugliForNoise);
+  const bool noise_requested =
+      cparams.photon_noise_iso > 0 ||
+      cparams.manual_noise.size() == NoiseParams::kNumNoisePoints;
+  const bool use_dc_frame =
+      cparams.progressive_dc > 0 && !cparams.modular_mode;
+
+  uint64_t flags = 0;
+  if (noise_from_distance || noise_requested) flags |= FrameHeader::kNoise;
+  if (use_dc_frame) flags |= FrameHeader::kUseDcFrame;
+  return flags;
+}
+
+// Chooses the loop filter settings (gaborish, EPF iterations and the modular
+// EPF sigma) of `frame_header` from `cparams`. Always succeeds.
+Status LoopFilterFromParams(const CompressParams& cparams,
+                            FrameHeader* JXL_RESTRICT frame_header) {
+  LoopFilter& lf = frame_header->loop_filter;
+  const bool is_modular = frame_header->encoding == FrameEncoding::kModular;
+  const bool is_var_dct = frame_header->encoding == FrameEncoding::kVarDCT;
+
+  // Gaborish defaults to enabled in Hare or slower.
+  const bool default_gab = cparams.speed_tier <= SpeedTier::kHare &&
+                           is_var_dct && cparams.decoding_speed_tier < 4;
+  lf.gab = ApplyOverride(cparams.gaborish, default_gab);
+
+  // An explicit EPF setting wins. Otherwise modular frames and decoding speed
+  // tiers >= 3 get no EPF, and other frames get one iteration per distance
+  // threshold reached (tier 2 skips the lowest threshold).
+  if (cparams.epf != -1) {
+    lf.epf_iters = cparams.epf;
+  } else {
+    lf.epf_iters = 0;
+    if (!is_modular && cparams.decoding_speed_tier < 3) {
+      constexpr float kThresholds[3] = {0.7, 1.5, 4.0};
+      const size_t first = cparams.decoding_speed_tier == 2 ? 1 : 0;
+      for (size_t i = first; i < 3; i++) {
+        if (cparams.butteraugli_distance >= kThresholds[i]) lf.epf_iters++;
+      }
+    }
+  }
+
+  // Strength of EPF in modular mode.
+  if (is_modular) {
+    // TODO(veluca): this formula is nonsense.
+    if (!cparams.IsLossless()) {
+      lf.epf_sigma_for_modular = cparams.butteraugli_distance;
+    }
+    // Lossy palette overrides the distance-based value.
+    if (cparams.lossy_palette) lf.epf_sigma_for_modular = 1.0f;
+  }
+  return true;
+}
+// Fills *frame_header from cparams, frame_info and ib; fails on bad combos.
+Status MakeFrameHeader(const CompressParams& cparams,
+                       const ProgressiveSplitter& progressive_splitter,
+                       const FrameInfo& frame_info, const ImageBundle& ib,
+                       FrameHeader* JXL_RESTRICT frame_header) {
+  frame_header->nonserialized_is_preview = frame_info.is_preview;
+  frame_header->is_last = frame_info.is_last;
+  frame_header->save_before_color_transform =
+      frame_info.save_before_color_transform;
+  frame_header->frame_type = frame_info.frame_type;
+  frame_header->name = ib.name;
+
+  progressive_splitter.InitPasses(&frame_header->passes);
+
+  if (cparams.modular_mode) {
+    frame_header->encoding = FrameEncoding::kModular;
+    frame_header->group_size_shift = cparams.modular_group_size_shift;
+  }
+
+  frame_header->chroma_subsampling = ib.chroma_subsampling;
+  if (ib.IsJPEG()) {
+    // we are transcoding a JPEG, so we don't get to choose
+    frame_header->encoding = FrameEncoding::kVarDCT;
+    frame_header->color_transform = ib.color_transform;
+  } else {
+    frame_header->color_transform = cparams.color_transform;
+    if (!cparams.modular_mode &&
+        (frame_header->chroma_subsampling.MaxHShift() != 0 ||
+         frame_header->chroma_subsampling.MaxVShift() != 0)) {
+      return JXL_FAILURE(
+          "Chroma subsampling is not supported in VarDCT mode when not "
+          "recompressing JPEGs");
+    }
+  }
+  if (frame_header->color_transform != ColorTransform::kYCbCr &&
+      (frame_header->chroma_subsampling.MaxHShift() != 0 ||
+       frame_header->chroma_subsampling.MaxVShift() != 0)) {
+    return JXL_FAILURE(
+        "Chroma subsampling is not supported when color transform is not "
+        "YCbCr");
+  }
+
+  frame_header->flags = FrameFlagsFromParams(cparams);
+  // Non-photon noise is not supported in the Modular encoder for now.
+  if (frame_header->encoding != FrameEncoding::kVarDCT &&
+      cparams.photon_noise_iso == 0 && cparams.manual_noise.empty()) {
+    frame_header->UpdateFlag(false, FrameHeader::Flags::kNoise);
+  }
+
+  JXL_RETURN_IF_ERROR(LoopFilterFromParams(cparams, frame_header));
+
+  frame_header->dc_level = frame_info.dc_level;
+  if (frame_header->dc_level > 2) {
+    // With 3 or more progressive_dc frames, the implementation does not yet
+    // work, see enc_cache.cc.
+    return JXL_FAILURE("progressive_dc > 2 is not yet supported");
+  }
+  if (cparams.progressive_dc > 0 &&
+      (cparams.ec_resampling != 1 || cparams.resampling != 1)) {
+    return JXL_FAILURE("Resampling not supported with DC frames");
+  }
+  if (cparams.resampling != 1 && cparams.resampling != 2 &&
+      cparams.resampling != 4 && cparams.resampling != 8) {
+    return JXL_FAILURE("Invalid resampling factor");
+  }
+  if (cparams.ec_resampling != 1 && cparams.ec_resampling != 2 &&
+      cparams.ec_resampling != 4 && cparams.ec_resampling != 8) {
+    return JXL_FAILURE("Invalid ec_resampling factor");
+  }
+  // Frame origin and size; not set for DC frames.
+  if (frame_info.frame_type != FrameType::kDCFrame) {
+    frame_header->frame_origin = ib.origin;
+    size_t ups = 1;
+    if (cparams.already_downsampled) ups = cparams.resampling;
+
+    // TODO(lode): this is not correct in case of odd original image sizes in
+    // combination with cparams.already_downsampled. Likely these values should
+    // be set to respectively frame_header->default_xsize() and
+    // frame_header->default_ysize() instead, the original (non downsampled)
+    // intended decoded image dimensions. But it may be more subtle than that
+    // if combined with crop. This issue causes custom_size_or_origin to be
+    // incorrectly set to true in case of already_downsampled with odd output
+    // image size when no cropping is used.
+    frame_header->frame_size.xsize = ib.xsize() * ups;
+    frame_header->frame_size.ysize = ib.ysize() * ups;
+    if (ib.origin.x0 != 0 || ib.origin.y0 != 0 ||
+        frame_header->frame_size.xsize != frame_header->default_xsize() ||
+        frame_header->frame_size.ysize != frame_header->default_ysize()) {
+      frame_header->custom_size_or_origin = true;
+    }
+  }
+  // Upsampling of the frame and of each extra channel.
+  frame_header->upsampling = cparams.resampling;
+  const std::vector<ExtraChannelInfo>& extra_channels =
+      frame_header->nonserialized_metadata->m.extra_channel_info;
+  frame_header->extra_channel_upsampling.clear();
+  frame_header->extra_channel_upsampling.resize(extra_channels.size(),
+                                                cparams.ec_resampling);
+  frame_header->save_as_reference = frame_info.save_as_reference;
+
+  // Set blending-related information.
+  if (ib.blend || frame_header->custom_size_or_origin) {
+    // Set blend_channel to the first alpha channel. These values are only
+    // encoded in case a blend mode involving alpha is used and there are more
+    // than one extra channels.
+    size_t index = 0;  // stays 0 when no alpha channel is found
+    if (frame_info.alpha_channel == -1) {
+      if (extra_channels.size() > 1) {
+        for (size_t i = 0; i < extra_channels.size(); i++) {
+          if (extra_channels[i].type == ExtraChannel::kAlpha) {
+            index = i;
+            break;
+          }
+        }
+      }
+    } else {
+      index = static_cast<size_t>(frame_info.alpha_channel);
+      JXL_ASSERT(index == 0 || index < extra_channels.size());
+    }
+    frame_header->blending_info.alpha_channel = index;
+    frame_header->blending_info.mode =
+        ib.blend ? ib.blendmode : BlendMode::kReplace;
+    frame_header->blending_info.source = frame_info.source;
+    frame_header->blending_info.clamp = frame_info.clamp;
+    const auto& extra_channel_info = frame_info.extra_channel_blending_info;
+    for (size_t i = 0; i < extra_channels.size(); i++) {
+      if (i < extra_channel_info.size()) {
+        frame_header->extra_channel_blending_info[i] = extra_channel_info[i];
+      } else {
+        frame_header->extra_channel_blending_info[i].alpha_channel = index;
+        BlendMode default_blend = ib.blendmode;
+        if (extra_channels[i].type != ExtraChannel::kBlack && i != index) {
+          // K needs to be blended, spot colors and other stuff gets added
+          default_blend = BlendMode::kAdd;
+        }
+        frame_header->extra_channel_blending_info[i].mode =
+            ib.blend ? default_blend : BlendMode::kReplace;
+        frame_header->extra_channel_blending_info[i].source = 1;
+      }
+    }
+  }
+
+  frame_header->animation_frame.duration = ib.duration;
+  frame_header->animation_frame.timecode = ib.timecode;
+
+  return true;
+}
+
+// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs.
+// Since they have no visual impact whatsoever, we can replace them with
+// something that compresses better and reduces artifacts near the edges.
+//
+// In lossless mode invisible pixels are simply set to zero. Otherwise each
+// one becomes a weighted average of its neighbours: the pixel to the left
+// and the pixel to the top-right always take part with weight 1, and every
+// visible (alpha > 0) neighbour to the left, right, top, bottom, top-right
+// or bottom-right adds extra weight (1 for the left one, 2 for the others).
+// Pixels are processed in scan order, so the left pixel may itself already
+// have been replaced.
+//
+// Produces downward-blurry smears, with only a 1px edge duplication in the
+// upwards direction. It would probably be better to smear in all directions.
+// That requires an alpha-weighted convolution with a large enough kernel
+// though, which might be overkill...
+void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) {
+  const size_t xsize = image->xsize();
+  const size_t ysize = image->ysize();
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const bool has_prev = y > 0;
+      const bool has_next = y + 1 < ysize;
+      // Rows above/below are null at the top/bottom image border.
+      float* JXL_RESTRICT row = image->PlaneRow(c, y);
+      const float* JXL_RESTRICT prow =
+          has_prev ? image->PlaneRow(c, y - 1) : nullptr;
+      const float* JXL_RESTRICT nrow =
+          has_next ? image->PlaneRow(c, y + 1) : nullptr;
+      const float* JXL_RESTRICT a = alpha.Row(y);
+      const float* JXL_RESTRICT pa = has_prev ? alpha.Row(y - 1) : nullptr;
+      const float* JXL_RESTRICT na = has_next ? alpha.Row(y + 1) : nullptr;
+
+      for (size_t x = 0; x < xsize; ++x) {
+        if (a[x] != 0) continue;
+        float& px = row[x];
+        px = 0;
+        if (lossless) continue;
+
+        float d = 0.f;  // Sum of the weights added to px.
+        // Adds `value` with the given weight to the running sum.
+        const auto add = [&px, &d](float value, float weight) {
+          px += weight * value;
+          d += weight;
+        };
+        const bool has_right = x + 1 < xsize;
+
+        // Left neighbour: always counted, and once more when visible.
+        if (x > 0) {
+          add(row[x - 1], 1.f);
+          if (a[x - 1] > 0.f) add(row[x - 1], 1.f);
+        }
+        // Right-hand column: the top-right pixel is always counted; visible
+        // right, top-right and bottom-right pixels add weight 2 each.
+        if (has_right) {
+          if (has_prev) add(prow[x + 1], 1.f);
+          if (a[x + 1] > 0.f) add(row[x + 1], 2.f);
+          if (has_prev && pa[x + 1] > 0.f) add(prow[x + 1], 2.f);
+          if (has_next && na[x + 1] > 0.f) add(nrow[x + 1], 2.f);
+        }
+        // Visible pixels directly above and below add weight 2 each.
+        if (has_prev && pa[x] > 0.f) add(prow[x], 2.f);
+        if (has_next && na[x] > 0.f) add(nrow[x], 2.f);
+
+        // With a total weight of at most 1 the sum already is the average.
+        if (d > 1.f) px /= d;
+      }
+    }
+  }
+}
+
+// Neighbouring-pixel difference statistics used to adjust chromacity scales.
+struct PixelStatsForChromacityAdjustment {
+  float dx = 0;
+  float db = 0;
+  float exposed_blue = 0;
+  // Returns the largest absolute difference between a pixel of `plane` and
+  // its left or upper neighbour; the first row and column are skipped.
+  float CalcPlane(const ImageF* JXL_RESTRICT plane) const {
+    float max_dx = 0;
+    float max_dy = 0;
+    for (size_t y = 1; y < plane->ysize(); ++y) {
+      const float* row = plane->Row(y);
+      const float* row_above = plane->Row(y - 1);
+      for (size_t x = 1; x < plane->xsize(); ++x) {
+        const float cur = row[x];
+        max_dx = std::max(max_dx, std::abs(cur - row[x - 1]));
+        max_dy = std::max(max_dy, std::abs(cur - row_above[x]));
+      }
+    }
+    return std::max(max_dx, max_dy);
+  }
+
+  // Sets `db` to the largest change of (B - Y) versus the left/upper pixel and
+  // `exposed_blue` to the largest B - 1.2 * Y (if >= 0) times the B gradients.
+  void CalcExposedBlue(const ImageF* JXL_RESTRICT plane_y,
+                       const ImageF* JXL_RESTRICT plane_b) {
+    float max_exposed = 0;
+    float max_dx = 0;
+    float max_dy = 0;
+    for (size_t y = 1; y < plane_y->ysize(); ++y) {
+      const float* row_y = plane_y->Row(y);
+      const float* row_b = plane_b->Row(y);
+      const float* above_y = plane_y->Row(y - 1);
+      const float* above_b = plane_b->Row(y - 1);
+      for (size_t x = 1; x < plane_y->xsize(); ++x) {
+        const float cur_y = row_y[x];
+        const float cur_b = row_b[x];
+        const float left_b = row_b[x - 1];
+        const float up_b = above_b[x];
+        const float diff = cur_b - cur_y;
+        max_dx = std::max(max_dx, std::abs(diff - (left_b - row_y[x - 1])));
+        max_dy = std::max(max_dy, std::abs(diff - (up_b - above_y[x])));
+        float exposed_b = cur_b - cur_y * 1.2;
+        if (exposed_b >= 0) {
+          exposed_b *= fabs(cur_b - left_b) + fabs(cur_b - up_b);
+          max_exposed = std::max(max_exposed, exposed_b);
+        }
+      }
+    }
+    exposed_blue = max_exposed;
+    db = std::max(max_dx, max_dy);
+  }
+
+  // Computes dx from plane 0 and db / exposed_blue from planes 1 and 2.
+  void Calc(const Image3F* JXL_RESTRICT opsin) {
+    dx = CalcPlane(&opsin->Plane(0));
+    CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2));
+  }
+  // Returns 2, 1 or 0 depending on which dx threshold is reached.
+  int HowMuchIsXChannelPixelized() {
+    if (dx >= 0.03) return 2;
+    return dx >= 0.017 ? 1 : 0;
+  }
+  // Returns a level 0..2 from db, plus 1 for high exposed_blue if db > 0.28.
+  int HowMuchIsBChannelPixelized() {
+    const int add = exposed_blue >= 0.13 ? 1 : 0;
+    if (db > 0.38) return 2 + add;
+    if (db > 0.33) return 1 + add;
+    return db > 0.28 ? add : 0;
+  }
+};
+
+}  // namespace
+
+class LossyFrameEncoder {
+ public:
+  LossyFrameEncoder(const CompressParams& cparams,
+                    const FrameHeader& frame_header,
+                    PassesEncoderState* JXL_RESTRICT enc_state,
+                    const JxlCmsInterface& cms, ThreadPool* pool,
+                    AuxOut* aux_out)
+      : enc_state_(enc_state), cms_(cms), pool_(pool), aux_out_(aux_out) {
+    JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state_->shared,
+                                          /*encoder=*/true));
+    enc_state_->cparams = cparams;  // the state keeps its own copy
+    enc_state_->passes.clear();     // start without any pass data
+  }
+  // Sets x/b_qm_scale, runs the frame heuristics and tokenizes every group.
+  Status ComputeEncodingData(const ImageBundle* linear,
+                             Image3F* JXL_RESTRICT opsin,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             FrameHeader* frame_header) {
+    PROFILER_ZONE("ComputeEncodingData uninstrumented");
+    JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 &&
+               (opsin->ysize() % kBlockDim) == 0);
+    PassesSharedState& shared = enc_state_->shared;
+
+    if (!enc_state_->cparams.max_error_mode) {
+      // Compute chromacity adjustments using two approaches.
+      // 1) Distance based approach for chromacity adjustment:
+      float x_qm_scale_steps[4] = {1.25f, 7.0f, 15.0f, 24.0f};
+      shared.frame_header.x_qm_scale = 2;
+      for (float x_qm_scale_step : x_qm_scale_steps) {
+        if (enc_state_->cparams.original_butteraugli_distance >
+            x_qm_scale_step) {
+          shared.frame_header.x_qm_scale++;
+        }
+      }
+      if (enc_state_->cparams.butteraugli_distance < 0.299f) {
+        // Favor chromacity preservation for making images appear more
+        // faithful to original even with extreme (5-10x) zooming.
+        shared.frame_header.x_qm_scale++;
+      }
+      // 2) Pixel-based approach for chromacity adjustment:
+      // look at the individual pixels and make a guess how difficult
+      // the image would be based on the worst case pixel.
+      PixelStatsForChromacityAdjustment pixel_stats;
+      if (enc_state_->cparams.speed_tier <= SpeedTier::kWombat) {
+        pixel_stats.Calc(opsin);
+      }
+      // For X take the most severe adjustment.
+      shared.frame_header.x_qm_scale =
+          std::max<int>(shared.frame_header.x_qm_scale,
+                        2 + pixel_stats.HowMuchIsXChannelPixelized());
+      // B only adjusted by pixel-based approach.
+      shared.frame_header.b_qm_scale =
+          2 + pixel_stats.HowMuchIsBChannelPixelized();
+    }
+
+    JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics(
+        enc_state_, modular_frame_encoder, linear, opsin, cms_, pool_,
+        aux_out_));
+    // NOTE(review): `pool` is unused and `cms` is mixed with `cms_` - confirm.
+    JXL_RETURN_IF_ERROR(InitializePassesEncoder(
+        *opsin, cms, pool_, enc_state_, modular_frame_encoder, aux_out_));
+
+    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+      pass.ac_tokens.resize(shared.frame_dim.num_groups);
+    }
+
+    ComputeAllCoeffOrders(shared.frame_dim);
+    shared.num_histograms = 1;
+    // Per-thread setup: one entry of group_caches_ per thread.
+    const auto tokenize_group_init = [&](const size_t num_threads) {
+      group_caches_.resize(num_threads);
+      return true;
+    };
+    const auto tokenize_group = [&](const uint32_t group_index,
+                                    const size_t thread) {
+      // Tokenize coefficients.
+      const Rect rect = shared.BlockGroupRect(group_index);
+      for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size();
+           idx_pass++) {
+        JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+        const int32_t* JXL_RESTRICT ac_rows[3] = {
+            enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+        };
+        // Ensure group cache is initialized.
+        group_caches_[thread].InitOnce();
+        TokenizeCoefficients(
+            &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect,
+            ac_rows, shared.ac_strategy, frame_header->chroma_subsampling,
+            &group_caches_[thread].num_nzeroes,
+            &enc_state_->passes[idx_pass].ac_tokens[group_index],
+            enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field,
+            enc_state_->shared.block_ctx_map);
+      }
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_groups,
+                                  tokenize_group_init, tokenize_group,
+                                  "TokenizeGroup"));
+    // Report the frame header as updated in the shared state.
+    *frame_header = shared.frame_header;
+    return true;
+  }
+
+  // Prepares the encoder state for JPEG transcoding: installs the JPEG
+  // quantization tables, copies the DCT coefficients (optionally applying
+  // chroma-from-luma), derives DC context thresholds and tokenizes AC groups.
+  Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data,
+                                    ModularFrameEncoder* modular_frame_encoder,
+                                    FrameHeader* frame_header) {
+    PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented");
+    PassesSharedState& shared = enc_state_->shared;
+
+    frame_header->x_qm_scale = 2;
+    frame_header->b_qm_scale = 2;
+
+    FrameDimensions frame_dim = frame_header->ToFrameDimensions();
+
+    const size_t xsize = frame_dim.xsize_padded;
+    const size_t ysize = frame_dim.ysize_padded;
+    const size_t xsize_blocks = frame_dim.xsize_blocks;
+    const size_t ysize_blocks = frame_dim.ysize_blocks;
+
+    // no-op chroma from luma
+    shared.cmap = ColorCorrelationMap(xsize, ysize, false);
+    shared.ac_strategy.FillDCT8();
+    FillImage(uint8_t(0), &shared.epf_sharpness);
+
+    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+      pass.ac_tokens.resize(shared.frame_dim.num_groups);
+    }
+
+    enc_state_->coeffs.clear();
+    while (enc_state_->coeffs.size() < enc_state_->passes.size()) {  // 1 per pass
+      enc_state_->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
+          kGroupDim * kGroupDim, frame_dim.num_groups));
+    }
+
+    // Convert the JPEG quantization tables to a Quantizer object.
+    float dcquantization[3];
+    std::vector<QuantEncoding> qe(DequantMatrices::kNum,
+                                  QuantEncoding::Library(0));
+
+    auto jpeg_c_map = JpegOrder(frame_header->color_transform,
+                                jpeg_data.components.size() == 1);
+
+    std::vector<int> qt(192);  // 3 channels x 64 coefficients
+    for (size_t c = 0; c < 3; c++) {
+      size_t jpeg_c = jpeg_c_map[c];
+      const int32_t* quant =
+          jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data();
+
+      dcquantization[c] = 255 * 8.0f / quant[0];
+      for (size_t y = 0; y < 8; y++) {
+        for (size_t x = 0; x < 8; x++) {
+          // JPEG XL transposes the DCT, JPEG doesn't.
+          qt[c * 64 + 8 * x + y] = quant[8 * y + x];
+        }
+      }
+    }
+    DequantMatricesSetCustomDC(&shared.matrices, dcquantization);
+    float dcquantization_r[3] = {1.0f / dcquantization[0],
+                                 1.0f / dcquantization[1],
+                                 1.0f / dcquantization[2]};
+
+    qe[AcStrategy::Type::DCT] = QuantEncoding::RAW(qt);
+    DequantMatricesSetCustom(&shared.matrices, qe, modular_frame_encoder);
+
+    // Ensure that InvGlobalScale() is 1.
+    shared.quantizer = Quantizer(&shared.matrices, 1, kGlobalScaleDenom);
+    // Recompute MulDC() and InvMulDC().
+    shared.quantizer.RecomputeFromGlobalScale();
+
+    // Per-block dequant scaling should be 1.
+    FillImage(static_cast<int32_t>(shared.quantizer.InvGlobalScale()),
+              &shared.raw_quant_field);
+
+    std::vector<int32_t> scaled_qtable(192);  // qt[ch 1] / qt[c], fixed point
+    for (size_t c = 0; c < 3; c++) {
+      for (size_t i = 0; i < 64; i++) {
+        scaled_qtable[64 * c + i] =
+            (1 << kCFLFixedPointPrecision) * qt[64 + i] / qt[64 * c + i];
+      }
+    }
+
+    auto jpeg_row = [&](size_t c, size_t y) {  // block row y of channel c
+      return jpeg_data.components[jpeg_c_map[c]].coeffs.data() +
+             jpeg_data.components[jpeg_c_map[c]].width_in_blocks *
+                 kDCTBlockSize * y;
+    };
+
+    Image3F dc = Image3F(xsize_blocks, ysize_blocks);
+    bool DCzero =
+        (shared.frame_header.color_transform == ColorTransform::kYCbCr);
+    // Compute chroma-from-luma for AC (doesn't seem to be useful for DC)
+    if (frame_header->chroma_subsampling.Is444() &&
+        enc_state_->cparams.force_cfl_jpeg_recompression &&
+        jpeg_data.components.size() == 3) {
+      for (size_t c : {0, 2}) {
+        ImageSB* map = (c == 0 ? &shared.cmap.ytox_map : &shared.cmap.ytob_map);
+        const float kScale = kDefaultColorFactor;
+        const int kOffset = 127;
+        const float kBase =
+            c == 0 ? shared.cmap.YtoXRatio(0) : shared.cmap.YtoBRatio(0);
+        const float kZeroThresh =
+            kScale * kZeroBiasDefault[c] *
+            0.9999f;  // just epsilon less for better rounding
+
+        auto process_row = [&](const uint32_t task, const size_t thread) {
+          size_t ty = task;
+          int8_t* JXL_RESTRICT row_out = map->Row(ty);
+          for (size_t tx = 0; tx < map->xsize(); ++tx) {
+            const size_t y0 = ty * kColorTileDimInBlocks;
+            const size_t x0 = tx * kColorTileDimInBlocks;
+            const size_t y1 = std::min(frame_dim.ysize_blocks,
+                                       (ty + 1) * kColorTileDimInBlocks);
+            const size_t x1 = std::min(frame_dim.xsize_blocks,
+                                       (tx + 1) * kColorTileDimInBlocks);
+            int32_t d_num_zeros[257] = {0};  // +1/-1 deltas per candidate 0..255
+            // TODO(veluca): SIMD + fixed point, and/or use the new CfL algorithm.
+            for (size_t y = y0; y < y1; ++y) {
+              const int16_t* JXL_RESTRICT row_m = jpeg_row(1, y);
+              const int16_t* JXL_RESTRICT row_s = jpeg_row(c, y);
+              for (size_t x = x0; x < x1; ++x) {
+                for (size_t coeffpos = 1; coeffpos < kDCTBlockSize;
+                     coeffpos++) {
+                  const float scaled_m =
+                      row_m[x * kDCTBlockSize + coeffpos] *
+                      scaled_qtable[64 * c + coeffpos] *
+                      (1.0f / (1 << kCFLFixedPointPrecision));
+                  const float scaled_s =
+                      kScale * row_s[x * kDCTBlockSize + coeffpos] +
+                      (kOffset - kBase * kScale) * scaled_m;
+                  if (std::abs(scaled_m) > 1e-8f) {
+                    float from, to;
+                    if (scaled_m > 0) {
+                      from = (scaled_s - kZeroThresh) / scaled_m;
+                      to = (scaled_s + kZeroThresh) / scaled_m;
+                    } else {
+                      from = (scaled_s + kZeroThresh) / scaled_m;
+                      to = (scaled_s - kZeroThresh) / scaled_m;
+                    }
+                    if (from < 0.0f) {
+                      from = 0.0f;
+                    }
+                    if (to > 255.0f) {
+                      to = 255.0f;
+                    }
+                    // Instead of clamping both values, check the range is sane.
+                    if (from <= to) {
+                      d_num_zeros[static_cast<int>(std::ceil(from))]++;
+                      d_num_zeros[static_cast<int>(std::floor(to + 1))]--;
+                    }
+                  }
+                }
+              }
+            }
+            int best = 0;
+            int32_t best_sum = 0;
+            FindIndexOfSumMaximum(d_num_zeros, 256, &best, &best_sum);
+            int32_t offset_sum = 0;
+            for (int i = 0; i < 256; ++i) {
+              if (i <= kOffset) {
+                offset_sum += d_num_zeros[i];
+              }
+            }
+            row_out[tx] = 0;
+            if (best_sum > offset_sum + 1) {
+              row_out[tx] = best - kOffset;
+            }
+          }
+        };
+
+        JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, map->ysize(),
+                                      ThreadPool::NoInit, process_row,
+                                      "FindCorrelation"));
+      }
+    }
+
+    if (!frame_header->chroma_subsampling.Is444()) {
+      ZeroFillImage(&dc);
+      for (auto& coeff : enc_state_->coeffs) {
+        coeff->ZeroFill();
+      }
+    }
+    // JPEG DC is from -1024 to 1023.
+    std::vector<size_t> dc_counts[3] = {};
+    dc_counts[0].resize(2048);
+    dc_counts[1].resize(2048);
+    dc_counts[2].resize(2048);
+    size_t total_dc[3] = {};
+    for (size_t c : {1, 0, 2}) {
+      if (jpeg_data.components.size() == 1 && c != 1) {
+        for (auto& coeff : enc_state_->coeffs) {
+          coeff->ZeroFillPlane(c);
+        }
+        ZeroFillImage(&dc.Plane(c));
+        // Ensure no division by 0.
+        dc_counts[c][1024] = 1;
+        total_dc[c] = 1;
+        continue;
+      }
+      size_t hshift = frame_header->chroma_subsampling.HShift(c);
+      size_t vshift = frame_header->chroma_subsampling.VShift(c);
+      ImageSB& map = (c == 0 ? shared.cmap.ytox_map : shared.cmap.ytob_map);
+      for (size_t group_index = 0; group_index < frame_dim.num_groups;
+           group_index++) {
+        const size_t gx = group_index % frame_dim.xsize_groups;
+        const size_t gy = group_index / frame_dim.xsize_groups;
+        int32_t* coeffs[kMaxNumPasses];
+        for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+          coeffs[i] = enc_state_->coeffs[i]->PlaneRow(c, group_index, 0).ptr32;
+        }
+        int32_t block[64];
+        for (size_t by = gy * kGroupDimInBlocks;
+             by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) {
+          if ((by >> vshift) << vshift != by) continue;  // off subsampled grid
+          const int16_t* JXL_RESTRICT inputjpeg = jpeg_row(c, by >> vshift);
+          const int16_t* JXL_RESTRICT inputjpegY = jpeg_row(1, by);
+          float* JXL_RESTRICT fdc = dc.PlaneRow(c, by >> vshift);
+          const int8_t* JXL_RESTRICT cm =
+              map.ConstRow(by / kColorTileDimInBlocks);
+          for (size_t bx = gx * kGroupDimInBlocks;
+               bx < xsize_blocks && bx < (gx + 1) * kGroupDimInBlocks; ++bx) {
+            if ((bx >> hshift) << hshift != bx) continue;  // off subsampled grid
+            size_t base = (bx >> hshift) * kDCTBlockSize;
+            int idc;
+            if (DCzero) {
+              idc = inputjpeg[base];
+            } else {
+              idc = inputjpeg[base] + 1024 / qt[c * 64];
+            }
+            dc_counts[c][std::min(static_cast<uint32_t>(idc + 1024),
+                                  uint32_t(2047))]++;
+            total_dc[c]++;
+            fdc[bx >> hshift] = idc * dcquantization_r[c];
+            if (c == 1 || !enc_state_->cparams.force_cfl_jpeg_recompression ||
+                !frame_header->chroma_subsampling.Is444()) {
+              for (size_t y = 0; y < 8; y++) {
+                for (size_t x = 0; x < 8; x++) {
+                  block[y * 8 + x] = inputjpeg[base + x * 8 + y];  // transposed
+                }
+              }
+            } else {
+              const int32_t scale =
+                  shared.cmap.RatioJPEG(cm[bx / kColorTileDimInBlocks]);
+
+              for (size_t y = 0; y < 8; y++) {
+                for (size_t x = 0; x < 8; x++) {
+                  int Y = inputjpegY[kDCTBlockSize * bx + x * 8 + y];
+                  int QChroma = inputjpeg[kDCTBlockSize * bx + x * 8 + y];
+                  // Fixed-point: CfL scale * quant table ratio, then * Y.
+                  int coeff_scale = (scale * scaled_qtable[64 * c + y * 8 + x] +
+                                     (1 << (kCFLFixedPointPrecision - 1))) >>
+                                    kCFLFixedPointPrecision;
+                  int cfl_factor = (Y * coeff_scale +
+                                    (1 << (kCFLFixedPointPrecision - 1))) >>
+                                   kCFLFixedPointPrecision;
+                  int QCR = QChroma - cfl_factor;
+                  block[y * 8 + x] = QCR;
+                }
+              }
+            }
+            enc_state_->progressive_splitter.SplitACCoefficients(
+                block, AcStrategy::FromRawStrategy(AcStrategy::Type::DCT), bx,
+                by, coeffs);
+            for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+              coeffs[i] += kDCTBlockSize;
+            }
+          }
+        }
+      }
+    }
+
+    auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds;
+    auto& num_dc_ctxs = enc_state_->shared.block_ctx_map.num_dc_ctxs;
+    num_dc_ctxs = 1;
+    for (size_t i = 0; i < 3; i++) {
+      dct[i].clear();
+      int num_thresholds = (CeilLog2Nonzero(total_dc[i]) - 12) / 2;
+      // up to 3 buckets per channel:
+      // dark/medium/bright, yellow/unsat/blue, green/unsat/red
+      num_thresholds = std::min(std::max(num_thresholds, 0), 2);
+      size_t cumsum = 0;
+      size_t cut = total_dc[i] / (num_thresholds + 1);
+      for (int j = 0; j < 2048; j++) {
+        cumsum += dc_counts[i][j];
+        if (cumsum > cut) {
+          dct[i].push_back(j - 1025);
+          cut = total_dc[i] * (dct[i].size() + 1) / (num_thresholds + 1);
+        }
+      }
+      num_dc_ctxs *= dct[i].size() + 1;
+    }
+
+    auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map;
+    ctx_map.clear();
+    ctx_map.resize(3 * kNumOrders * num_dc_ctxs, 0);
+
+    int lbuckets = (dct[1].size() + 1);
+    for (size_t i = 0; i < num_dc_ctxs; i++) {
+      // up to 9 contexts for luma
+      ctx_map[i] = i / lbuckets;
+      // up to 3 contexts for chroma
+      ctx_map[kNumOrders * num_dc_ctxs + i] =
+          ctx_map[2 * kNumOrders * num_dc_ctxs + i] =
+              num_dc_ctxs / lbuckets + (i % lbuckets);
+    }
+    enc_state_->shared.block_ctx_map.num_ctxs =
+        *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+
+    enc_state_->histogram_idx.resize(shared.frame_dim.num_groups);
+
+    // disable DC frame for now
+    shared.frame_header.UpdateFlag(false, FrameHeader::kUseDcFrame);
+    auto compute_dc_coeffs = [&](const uint32_t group_index,
+                                 size_t /* thread */) {
+      modular_frame_encoder->AddVarDCTDC(dc, group_index, /*nl_dc=*/false,
+                                         enc_state_, /*jpeg_transcode=*/true);
+      modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/true,
+                                           enc_state_);
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_dc_groups,
+                                  ThreadPool::NoInit, compute_dc_coeffs,
+                                  "Compute DC coeffs"));
+
+    // Must happen before WriteFrameHeader!
+    shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing);
+
+    ComputeAllCoeffOrders(frame_dim);
+    shared.num_histograms = 1;
+
+    const auto tokenize_group_init = [&](const size_t num_threads) {
+      group_caches_.resize(num_threads);
+      return true;
+    };
+    const auto tokenize_group = [&](const uint32_t group_index,
+                                    const size_t thread) {
+      // Tokenize coefficients.
+      const Rect rect = shared.BlockGroupRect(group_index);
+      for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size();
+           idx_pass++) {
+        JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+        const int32_t* JXL_RESTRICT ac_rows[3] = {
+            enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+        };
+        // Ensure group cache is initialized.
+        group_caches_[thread].InitOnce();
+        TokenizeCoefficients(
+            &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect,
+            ac_rows, shared.ac_strategy, frame_header->chroma_subsampling,
+            &group_caches_[thread].num_nzeroes,
+            &enc_state_->passes[idx_pass].ac_tokens[group_index],
+            enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field,
+            enc_state_->shared.block_ctx_map);
+      }
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_groups,
+                                  tokenize_group_init, tokenize_group,
+                                  "TokenizeGroup"));
+    *frame_header = shared.frame_header;
+    doing_jpeg_recompression = true;  // read later by EncodeGlobalACInfo
+    return true;
+  }
+
+  // Writes quantizer params, the block context map and DC color correlation.
+  Status EncodeGlobalDCInfo(const FrameHeader& frame_header,
+                            BitWriter* writer) const {
+    PassesSharedState& shared = enc_state_->shared;
+    QuantizerParams quant_params = shared.quantizer.GetParams();
+    JXL_RETURN_IF_ERROR(
+        WriteQuantizerParams(quant_params, writer, kLayerQuant, aux_out_));
+    EncodeBlockCtxMap(shared.block_ctx_map, writer, aux_out_);
+    ColorCorrelationMapEncodeDC(&shared.cmap, writer, kLayerDC, aux_out_);
+    return true;
+  }
+
+  // Writes the frame-global AC section: dequantization matrices, the number of
+  // histogram sets and, per pass, coefficient orders followed by histograms.
+  Status EncodeGlobalACInfo(BitWriter* writer,
+                            ModularFrameEncoder* modular_frame_encoder) {
+    PassesSharedState& shared = enc_state_->shared;
+    const auto& cparams = enc_state_->cparams;
+    JXL_RETURN_IF_ERROR(DequantMatricesEncode(&shared.matrices, writer,
+                                              kLayerQuant, aux_out_,
+                                              modular_frame_encoder));
+    // Cluster groups only at tier <= kTortoise, never for JPEG transcoding.
+    if (cparams.speed_tier <= SpeedTier::kTortoise &&
+        !doing_jpeg_recompression) ClusterGroups(enc_state_);
+    const size_t num_histo_bits = CeilLog2Nonzero(shared.frame_dim.num_groups);
+    if (num_histo_bits != 0) {
+      BitWriter::Allotment allotment(writer, num_histo_bits);
+      writer->Write(num_histo_bits, shared.num_histograms - 1);
+      allotment.ReclaimAndCharge(writer, kLayerAC, aux_out_);
+    }
+
+    const size_t num_passes = enc_state_->progressive_splitter.GetNumPasses();
+    for (size_t pass = 0; pass < num_passes; ++pass) {
+      const auto used_orders = enc_state_->used_orders[pass];
+      // Encode coefficient orders.
+      size_t order_bits = 0;
+      JXL_RETURN_IF_ERROR(
+          U32Coder::CanEncode(kOrderEnc, used_orders, &order_bits));
+      BitWriter::Allotment allotment(writer, order_bits);
+      JXL_CHECK(U32Coder::Write(kOrderEnc, used_orders, writer));
+      allotment.ReclaimAndCharge(writer, kLayerOrder, aux_out_);
+      EncodeCoeffOrders(used_orders,
+                        &shared.coeff_orders[pass * shared.coeff_order_size],
+                        writer, kLayerOrder, aux_out_);
+
+      // Encode histograms.
+      const auto num_ac_contexts = shared.block_ctx_map.NumACContexts();
+      HistogramParams hist_params(cparams.speed_tier, num_ac_contexts);
+      if (cparams.speed_tier > SpeedTier::kTortoise) {
+        hist_params.lz77_method = HistogramParams::LZ77Method::kNone;
+      }
+      if (cparams.decoding_speed_tier >= 1) {
+        hist_params.max_histograms = 6;
+      }
+      PassesEncoderState::PassData& pass_data = enc_state_->passes[pass];
+      BuildAndEncodeHistograms(
+          hist_params, shared.num_histograms * num_ac_contexts,
+          pass_data.ac_tokens, &pass_data.codes, &pass_data.context_map,
+          writer, kLayerAC, aux_out_);
+    }
+
+    return true;
+  }
+
+  Status EncodeACGroup(size_t pass, size_t group_index, BitWriter* group_code,
+                       AuxOut* local_aux_out) {
+    const auto histo = enc_state_->histogram_idx[group_index];  // per group
+    return EncodeGroupTokenizedCoefficients(
+        group_index, pass, histo, *enc_state_, group_code, local_aux_out);
+  }
+
+  PassesEncoderState* State() { return enc_state_; }  // state this encoder uses
+
+ private:
+  // Recomputes, for every pass, which coefficient orders are used and the
+  // orders themselves, storing both in the encoder state.
+  void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) {
+    PROFILER_FUNC;
+    PassesSharedState& shared = enc_state_->shared;
+    const size_t num_passes = enc_state_->progressive_splitter.GetNumPasses();
+    // No coefficient reordering in Falcon or faster.
+    auto orders_info =
+        ComputeUsedOrders(enc_state_->cparams.speed_tier, shared.ac_strategy,
+                          Rect(shared.raw_quant_field));
+    enc_state_->used_orders.assign(num_passes, orders_info.second);
+    // Each pass writes into its own slice of shared.coeff_orders.
+    for (size_t pass = 0; pass < num_passes; ++pass) {
+      ComputeCoeffOrder(enc_state_->cparams.speed_tier,
+                        *enc_state_->coeffs[pass], shared.ac_strategy,
+                        frame_dim, enc_state_->used_orders[pass],
+                        orders_info.first,
+                        &shared.coeff_orders[pass * shared.coeff_order_size]);
+    }
+  }
+
+  // Finds the first index at which the running (prefix) sum of `array` reaches
+  // its strict maximum; reports index 0 and sum 0 if no prefix sum exceeds 0.
+  template <typename V, typename R>
+  static inline void FindIndexOfSumMaximum(const V* array, const size_t len,
+                                           R* idx, V* sum) {
+    JXL_ASSERT(len > 0);
+    V running = 0, best_total = 0;
+    R best_pos = 0;
+    for (size_t pos = 0; pos != len; ++pos) {
+      running += array[pos];
+      if (!(running > best_total)) continue;
+      best_total = running;
+      best_pos = pos;
+    }
+    *idx = best_pos;
+    *sum = best_total;
+  }
+
+  PassesEncoderState* JXL_RESTRICT enc_state_;
+  JxlCmsInterface cms_;
+  ThreadPool* pool_;
+  AuxOut* aux_out_;
+  std::vector<EncCache> group_caches_;
+  bool doing_jpeg_recompression = false;
+};
+
+// Validates the manual noise / XYB tables and replaces unset parameters
+// (distance, resampling, ec_resampling) with concrete values.
+Status ParamsPostInit(CompressParams* p) {
+  const size_t noise_entries = p->manual_noise.size();
+  if (noise_entries != 0 && noise_entries != NoiseParams::kNumNoisePoints) {
+    return JXL_FAILURE("Invalid number of noise lut entries");
+  }
+  const size_t xyb_entries = p->manual_xyb_factors.size();
+  if (xyb_entries != 0 && xyb_entries != 3) {
+    return JXL_FAILURE("Invalid number of XYB quantization factors");
+  }
+  if (p->butteraugli_distance == 0.0 && !p->modular_mode) {
+    p->butteraugli_distance = kMinButteraugliDistance;
+  }
+  if (p->original_butteraugli_distance == -1.0) {
+    p->original_butteraugli_distance = p->butteraugli_distance;
+  }
+  if (p->resampling <= 0) {
+    // At very low bit rates most photographic images do better with 2x2
+    // resampling; the distance is rescaled to keep bits per pixel similar.
+    const bool halve = !p->already_downsampled && p->butteraugli_distance >= 20;
+    p->resampling = halve ? 2 : 1;
+    if (halve) {
+      p->butteraugli_distance = 6 + ((p->butteraugli_distance - 20) * 0.25);
+    }
+  }
+  if (p->ec_resampling <= 0) p->ec_resampling = p->resampling;
+  return true;
+}
+
+Status EncodeFrame(const CompressParams& cparams_orig,
+                   const FrameInfo& frame_info, const CodecMetadata* metadata,
+                   const ImageBundle& ib, PassesEncoderState* passes_enc_state,
+                   const JxlCmsInterface& cms, ThreadPool* pool,
+                   BitWriter* writer, AuxOut* aux_out) {
+  CompressParams cparams = cparams_orig;
+  if (cparams.speed_tier == SpeedTier::kGlacier && !cparams.IsLossless()) {
+    cparams.speed_tier = SpeedTier::kTortoise;
+  }
+  if (cparams.speed_tier == SpeedTier::kGlacier) {
+    std::vector<CompressParams> all_params;
+    std::vector<size_t> size;
+
+    CompressParams cparams_attempt = cparams_orig;
+    cparams_attempt.speed_tier = SpeedTier::kTortoise;
+    cparams_attempt.options.max_properties = 4;
+
+    for (float x : {0.0f, 80.f}) {
+      cparams_attempt.channel_colors_percent = x;
+      for (float y : {0.0f, 95.0f}) {
+        cparams_attempt.channel_colors_pre_transform_percent = y;
+        // 70000 ensures that the number of palette colors is representable in
+        // modular headers.
+        for (int K : {0, 1 << 10, 70000}) {
+          cparams_attempt.palette_colors = K;
+          for (int tree_mode : {-1, (int)ModularOptions::TreeMode::kNoWP,
+                                (int)ModularOptions::TreeMode::kDefault}) {
+            if (tree_mode == -1) {
+              // LZ77 only
+              cparams_attempt.options.nb_repeats = 0;
+            } else {
+              cparams_attempt.options.nb_repeats = 1;
+              cparams_attempt.options.wp_tree_mode =
+                  static_cast<ModularOptions::TreeMode>(tree_mode);
+            }
+            for (Predictor pred : {Predictor::Zero, Predictor::Variable}) {
+              cparams_attempt.options.predictor = pred;
+              for (int g : {0, 1, 3}) {
+                cparams_attempt.modular_group_size_shift = g;
+                for (Override patches : {Override::kDefault, Override::kOff}) {
+                  cparams_attempt.patches = patches;
+                  all_params.push_back(cparams_attempt);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    size.resize(all_params.size());
+
+    std::atomic<int> num_errors{0};
+
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, all_params.size(), ThreadPool::NoInit,
+        [&](size_t task, size_t) {
+          BitWriter w;
+          PassesEncoderState state;
+          if (!EncodeFrame(all_params[task], frame_info, metadata, ib, &state,
+                           cms, nullptr, &w, aux_out)) {
+            num_errors.fetch_add(1, std::memory_order_relaxed);
+            return;
+          }
+          size[task] = w.BitsWritten();
+        },
+        "Compress kGlacier"));
+    JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+    size_t best_idx = 0;
+    for (size_t i = 1; i < all_params.size(); i++) {
+      if (size[best_idx] > size[i]) {
+        best_idx = i;
+      }
+    }
+    cparams = all_params[best_idx];
+  }
+
+  if (cparams_orig.target_bitrate > 0.0f &&
+      frame_info.frame_type == FrameType::kRegularFrame) {
+    cparams.target_bitrate = 0.0f;
+    const float target_bitrate = cparams_orig.target_bitrate;
+    float bitrate = 0.0f;
+    float prev_bitrate = 0.0f;
+    float rescale = 1.0f;
+    size_t prev_bits = 0;
+    float error = 0.0f;
+    float best_error = 100.0f;
+    float best_rescale = 1.0f;
+    for (size_t i = 0; i < 10; ++i) {
+      std::unique_ptr<PassesEncoderState> state =
+          jxl::make_unique<PassesEncoderState>();
+      BitWriter bw;
+      JXL_CHECK(EncodeFrame(cparams, frame_info, metadata, ib, state.get(), cms,
+                            pool, &bw, nullptr));
+      bitrate = bw.BitsWritten() * 1.0 / (ib.xsize() * ib.ysize());
+      error = target_bitrate / bitrate - 1.0f;
+      if (std::abs(error) < std::abs(best_error)) {
+        best_error = error;
+        best_rescale = cparams.quant_ac_rescale;
+      }
+      if (bw.BitsWritten() == prev_bits || std::abs(error) < 0.0005f) {
+        break;
+      }
+      float lambda = 1.0f;
+      if (i > 0) {
+        lambda = (((bitrate / prev_bitrate) - 1.0f) / (rescale - 1.0f));
+      }
+      rescale = (1.0f + ((target_bitrate / bitrate) - 1.0f) / lambda);
+      if (rescale < 0.0f) {
+        break;
+      }
+      cparams.quant_ac_rescale *= rescale;
+      prev_bitrate = bitrate;
+      prev_bits = bw.BitsWritten();
+    }
+    if (aux_out) {
+      aux_out->max_quant_rescale = best_rescale;
+      aux_out->min_quant_rescale = best_rescale;
+      aux_out->min_bitrate_error = best_error;
+      aux_out->max_bitrate_error = best_error;
+    }
+    cparams.quant_ac_rescale = best_rescale;
+  }
+  ib.VerifyMetadata();
+
+  passes_enc_state->special_frames.clear();
+
+  if (cparams.qprogressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
+  } else if (cparams.progressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_vlf_lf_full_ac});
+  }
+
+  JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
+
+  if (cparams.progressive_dc < 0) {
+    if (cparams.progressive_dc != -1) {
+      return JXL_FAILURE("Invalid progressive DC setting value (%d)",
+                         cparams.progressive_dc);
+    }
+    cparams.progressive_dc = 0;
+  }
+  if (cparams.ec_resampling < cparams.resampling) {
+    cparams.ec_resampling = cparams.resampling;
+  }
+  if (cparams.resampling > 1 || frame_info.is_preview) {
+    cparams.progressive_dc = 0;
+  }
+
+  if (frame_info.dc_level + cparams.progressive_dc > 4) {
+    return JXL_FAILURE("Too many levels of progressive DC");
+  }
+
+  if (cparams.butteraugli_distance != 0 &&
+      cparams.butteraugli_distance < kMinButteraugliDistance) {
+    return JXL_FAILURE("Butteraugli distance is too low (%f)",
+                       cparams.butteraugli_distance);
+  }
+
+  if (ib.IsJPEG()) {
+    cparams.gaborish = Override::kOff;
+    cparams.epf = 0;
+    cparams.modular_mode = false;
+  }
+
+  if (ib.xsize() == 0 || ib.ysize() == 0) return JXL_FAILURE("Empty image");
+
+  // Assert that this metadata is correctly set up for the compression params,
+  // this should have been done by enc_file.cc
+  JXL_ASSERT(metadata->m.xyb_encoded ==
+             (cparams.color_transform == ColorTransform::kXYB));
+  std::unique_ptr<FrameHeader> frame_header =
+      jxl::make_unique<FrameHeader>(metadata);
+  JXL_RETURN_IF_ERROR(MakeFrameHeader(cparams,
+                                      passes_enc_state->progressive_splitter,
+                                      frame_info, ib, frame_header.get()));
+  // Check that if the codestream header says xyb_encoded, the color_transform
+  // matches the requirement. This is checked from the cparams here, even though
+  // optimally we'd be able to check this against what has actually been written
+  // in the main codestream header, but since ib is a const object and the data
+  // written to the main codestream header is (in modified form) in ib, the
+  // encoder cannot indicate this fact in the ib's metadata.
+  if (cparams_orig.color_transform == ColorTransform::kXYB) {
+    if (frame_header->color_transform != ColorTransform::kXYB) {
+      return JXL_FAILURE(
+          "The color transform of frames must be xyb if the codestream is xyb "
+          "encoded");
+    }
+  } else {
+    if (frame_header->color_transform == ColorTransform::kXYB) {
+      return JXL_FAILURE(
+          "The color transform of frames cannot be xyb if the codestream is "
+          "not xyb encoded");
+    }
+  }
+
+  FrameDimensions frame_dim = frame_header->ToFrameDimensions();
+
+  const size_t num_groups = frame_dim.num_groups;
+
+  Image3F opsin;
+  const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray());
+  std::unique_ptr<ImageMetadata> metadata_linear =
+      jxl::make_unique<ImageMetadata>();
+  metadata_linear->xyb_encoded =
+      (cparams.color_transform == ColorTransform::kXYB);
+  metadata_linear->color_encoding = c_linear;
+  ImageBundle linear_storage(metadata_linear.get());
+
+  std::vector<AuxOut> aux_outs;
+  // LossyFrameEncoder stores a reference to a std::function<Status(size_t)>
+  // so we need to keep the std::function<Status(size_t)> being referenced
+  // alive while lossy_frame_encoder is used. We could make resize_aux_outs a
+  // lambda type by making LossyFrameEncoder a template instead, but this is
+  // simpler.
+  const std::function<Status(size_t)> resize_aux_outs =
+      [&aux_outs, aux_out](const size_t num_threads) -> Status {
+    if (aux_out != nullptr) {
+      size_t old_size = aux_outs.size();
+      for (size_t i = num_threads; i < old_size; i++) {
+        aux_out->Assimilate(aux_outs[i]);
+      }
+      aux_outs.resize(num_threads);
+      // Each thread needs these INPUTS. Don't copy the entire AuxOut
+      // because it may contain stats which would be Assimilated multiple
+      // times below.
+      for (size_t i = old_size; i < aux_outs.size(); i++) {
+        aux_outs[i].dump_image = aux_out->dump_image;
+        aux_outs[i].debug_prefix = aux_out->debug_prefix;
+      }
+    }
+    return true;
+  };
+
+  LossyFrameEncoder lossy_frame_encoder(cparams, *frame_header,
+                                        passes_enc_state, cms, pool, aux_out);
+  std::unique_ptr<ModularFrameEncoder> modular_frame_encoder =
+      jxl::make_unique<ModularFrameEncoder>(*frame_header, cparams);
+
+  const std::vector<ImageF>* extra_channels = &ib.extra_channels();
+  std::vector<ImageF> extra_channels_storage;
+  // Clear patches
+  passes_enc_state->shared.image_features.patches = PatchDictionary();
+  passes_enc_state->shared.image_features.patches.SetPassesSharedState(
+      &passes_enc_state->shared);
+
+  if (ib.IsJPEG()) {
+    JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeJPEGTranscodingData(
+        *ib.jpeg_data, modular_frame_encoder.get(), frame_header.get()));
+  } else if (!lossy_frame_encoder.State()->heuristics->HandlesColorConversion(
+                 cparams, ib) ||
+             frame_header->encoding != FrameEncoding::kVarDCT) {
+    // Allocating a large enough image avoids a copy when padding.
+    opsin =
+        Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize()));
+    opsin.ShrinkTo(ib.xsize(), ib.ysize());
+
+    const bool want_linear = frame_header->encoding == FrameEncoding::kVarDCT &&
+                             cparams.speed_tier <= SpeedTier::kKitten;
+    const ImageBundle* JXL_RESTRICT ib_or_linear = &ib;
+
+    if (frame_header->color_transform == ColorTransform::kXYB &&
+        frame_info.ib_needs_color_transform) {
+      // linear_storage would only be used by the Butteraugli loop (passing
+      // linear sRGB avoids a color conversion there). Otherwise, don't
+      // fill it to reduce memory usage.
+      ib_or_linear =
+          ToXYB(ib, pool, &opsin, cms, want_linear ? &linear_storage : nullptr);
+    } else {  // RGB or YCbCr: don't do anything (forward YCbCr is not
+              // implemented, this is only used when the input is already in
+              // YCbCr)
+              // If encoding a special DC or reference frame, don't do anything:
+              // input is already in XYB.
+      CopyImageTo(ib.color(), &opsin);
+    }
+    bool lossless = cparams.IsLossless();
+    if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() &&
+        frame_header->frame_type == FrameType::kRegularFrame &&
+        !ApplyOverride(cparams.keep_invisible, lossless) &&
+        cparams.ec_resampling == cparams.resampling) {
+      // simplify invisible pixels
+      SimplifyInvisible(&opsin, ib.alpha(), lossless);
+      if (want_linear) {
+        SimplifyInvisible(const_cast<Image3F*>(&ib_or_linear->color()),
+                          ib.alpha(), lossless);
+      }
+    }
+    if (aux_out != nullptr) {
+      JXL_RETURN_IF_ERROR(
+          aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin));
+    }
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+      PadImageToBlockMultipleInPlace(&opsin);
+      JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(
+          ib_or_linear, &opsin, cms, pool, modular_frame_encoder.get(),
+          frame_header.get()));
+    } else if (frame_header->upsampling != 1 && !cparams.already_downsampled) {
+      // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling
+      // after noise, if necessary.
+      DownsampleImage(&opsin, frame_header->upsampling);
+    }
+  } else {
+    JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(
+        &ib, &opsin, cms, pool, modular_frame_encoder.get(),
+        frame_header.get()));
+  }
+  if (cparams.ec_resampling != 1 && !cparams.already_downsampled) {
+    extra_channels = &extra_channels_storage;
+    for (size_t i = 0; i < ib.extra_channels().size(); i++) {
+      extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i]));
+      DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling);
+    }
+  }
+  // needs to happen *AFTER* VarDCT-ComputeEncodingData.
+  JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData(
+      *frame_header, *ib.metadata(), &opsin, *extra_channels,
+      lossy_frame_encoder.State(), cms, pool, aux_out,
+      /* do_color=*/frame_header->encoding == FrameEncoding::kModular));
+
+  writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames);
+  frame_header->UpdateFlag(
+      lossy_frame_encoder.State()->shared.image_features.patches.HasAny(),
+      FrameHeader::kPatches);
+  frame_header->UpdateFlag(
+      lossy_frame_encoder.State()->shared.image_features.splines.HasAny(),
+      FrameHeader::kSplines);
+  JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out));
+
+  const size_t num_passes =
+      passes_enc_state->progressive_splitter.GetNumPasses();
+
+  // DC global info + DC groups + AC global info + AC groups *
+  // num_passes.
+  const bool has_ac_global = true;
+  std::vector<BitWriter> group_codes(NumTocEntries(frame_dim.num_groups,
+                                                   frame_dim.num_dc_groups,
+                                                   num_passes, has_ac_global));
+  const size_t global_ac_index = frame_dim.num_dc_groups + 1;
+  const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1;
+  const auto get_output = [&](const size_t index) {
+    return &group_codes[is_small_image ? 0 : index];
+  };
+  auto ac_group_code = [&](size_t pass, size_t group) {
+    return get_output(AcGroupIndex(pass, group, frame_dim.num_groups,
+                                   frame_dim.num_dc_groups, has_ac_global));
+  };
+
+  if (frame_header->flags & FrameHeader::kPatches) {
+    PatchDictionaryEncoder::Encode(
+        lossy_frame_encoder.State()->shared.image_features.patches,
+        get_output(0), kLayerDictionary, aux_out);
+  }
+
+  if (frame_header->flags & FrameHeader::kSplines) {
+    EncodeSplines(lossy_frame_encoder.State()->shared.image_features.splines,
+                  get_output(0), kLayerSplines, HistogramParams(), aux_out);
+  }
+
+  if (cparams.photon_noise_iso > 0) {
+    lossy_frame_encoder.State()->shared.image_features.noise_params =
+        SimulatePhotonNoise(ib.xsize(), ib.ysize(), cparams.photon_noise_iso);
+  }
+  if (cparams.manual_noise.size() == NoiseParams::kNumNoisePoints) {
+    for (size_t i = 0; i < NoiseParams::kNumNoisePoints; i++) {
+      lossy_frame_encoder.State()->shared.image_features.noise_params.lut[i] =
+          cparams.manual_noise[i];
+    }
+  }
+  if (frame_header->flags & FrameHeader::kNoise) {
+    EncodeNoise(lossy_frame_encoder.State()->shared.image_features.noise_params,
+                get_output(0), kLayerNoise, aux_out);
+  }
+
+  JXL_RETURN_IF_ERROR(
+      DequantMatricesEncodeDC(&lossy_frame_encoder.State()->shared.matrices,
+                              get_output(0), kLayerQuant, aux_out));
+  if (frame_header->encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(
+        lossy_frame_encoder.EncodeGlobalDCInfo(*frame_header, get_output(0)));
+  }
+  JXL_RETURN_IF_ERROR(
+      modular_frame_encoder->EncodeGlobalInfo(get_output(0), aux_out));
+  JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeStream(
+      get_output(0), aux_out, kLayerModularGlobal, ModularStreamId::Global()));
+
+  const auto process_dc_group = [&](const uint32_t group_index,
+                                    const size_t thread) {
+    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+    BitWriter* output = get_output(group_index + 1);
+    if (frame_header->encoding == FrameEncoding::kVarDCT &&
+        !(frame_header->flags & FrameHeader::kUseDcFrame)) {
+      BitWriter::Allotment allotment(output, 2);
+      output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]);
+      allotment.ReclaimAndCharge(output, kLayerDC, my_aux_out);
+      JXL_CHECK(modular_frame_encoder->EncodeStream(
+          output, my_aux_out, kLayerDC,
+          ModularStreamId::VarDCTDC(group_index)));
+    }
+    JXL_CHECK(modular_frame_encoder->EncodeStream(
+        output, my_aux_out, kLayerModularDcGroup,
+        ModularStreamId::ModularDC(group_index)));
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+      const Rect& rect =
+          lossy_frame_encoder.State()->shared.DCGroupRect(group_index);
+      size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize());
+      if (nb_bits != 0) {
+        BitWriter::Allotment allotment(output, nb_bits);
+        output->Write(nb_bits,
+                      modular_frame_encoder->ac_metadata_size[group_index] - 1);
+        allotment.ReclaimAndCharge(output, kLayerControlFields, my_aux_out);
+      }
+      JXL_CHECK(modular_frame_encoder->EncodeStream(
+          output, my_aux_out, kLayerControlFields,
+          ModularStreamId::ACMetadata(group_index)));
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, frame_dim.num_dc_groups,
+                                resize_aux_outs, process_dc_group,
+                                "EncodeDCGroup"));
+
+  if (frame_header->encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(lossy_frame_encoder.EncodeGlobalACInfo(
+        get_output(global_ac_index), modular_frame_encoder.get()));
+  }
+
+  std::atomic<int> num_errors{0};
+  const auto process_group = [&](const uint32_t group_index,
+                                 const size_t thread) {
+    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+
+    for (size_t i = 0; i < num_passes; i++) {
+      if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        if (!lossy_frame_encoder.EncodeACGroup(
+                i, group_index, ac_group_code(i, group_index), my_aux_out)) {
+          num_errors.fetch_add(1, std::memory_order_relaxed);
+          return;
+        }
+      }
+      // Write all modular encoded data (color?, alpha, depth, extra channels)
+      if (!modular_frame_encoder->EncodeStream(
+              ac_group_code(i, group_index), my_aux_out, kLayerModularAcGroup,
+              ModularStreamId::ModularAC(group_index, i))) {
+        num_errors.fetch_add(1, std::memory_order_relaxed);
+        return;
+      }
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_groups, resize_aux_outs,
+                                process_group, "EncodeGroupCoefficients"));
+
+  // Resizing aux_outs to 0 also Assimilates the array.
+  static_cast<void>(resize_aux_outs(0));
+  JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+  for (BitWriter& bw : group_codes) {
+    BitWriter::Allotment allotment(&bw, 8);
+    bw.ZeroPadToByte();  // end of group.
+    allotment.ReclaimAndCharge(&bw, kLayerAC, aux_out);
+  }
+
+  std::vector<coeff_order_t>* permutation_ptr = nullptr;
+  std::vector<coeff_order_t> permutation;
+  if (cparams.centerfirst && !(num_passes == 1 && num_groups == 1)) {
+    permutation_ptr = &permutation;
+    // Don't permute global DC/AC or DC.
+    permutation.resize(global_ac_index + 1);
+    std::iota(permutation.begin(), permutation.end(), 0);
+    std::vector<coeff_order_t> ac_group_order(num_groups);
+    std::iota(ac_group_order.begin(), ac_group_order.end(), 0);
+    size_t group_dim = frame_dim.group_dim;
+
+    // The center of the image is either given by parameters or chosen
+    // to be the middle of the image by default if center_x, center_y resp.
+    // are not provided.
+
+    int64_t imag_cx;
+    if (cparams.center_x != static_cast<size_t>(-1)) {
+      JXL_RETURN_IF_ERROR(cparams.center_x < ib.xsize());
+      imag_cx = cparams.center_x;
+    } else {
+      imag_cx = ib.xsize() / 2;
+    }
+
+    int64_t imag_cy;
+    if (cparams.center_y != static_cast<size_t>(-1)) {
+      JXL_RETURN_IF_ERROR(cparams.center_y < ib.ysize());
+      imag_cy = cparams.center_y;
+    } else {
+      imag_cy = ib.ysize() / 2;
+    }
+
+    // The center of the group containing the center of the image.
+    int64_t cx = (imag_cx / group_dim) * group_dim + group_dim / 2;
+    int64_t cy = (imag_cy / group_dim) * group_dim + group_dim / 2;
+    // This identifies in what area of the central group the center of the image
+    // lies in.
+    double direction = -std::atan2(imag_cy - cy, imag_cx - cx);
+    // This identifies the side of the central group the center of the image
+    // lies closest to. This can take values 0, 1, 2, 3 corresponding to left,
+    // bottom, right, top.
+    int64_t side = std::fmod((direction + 5 * kPi / 4), 2 * kPi) * 2 / kPi;
+    auto get_distance_from_center = [&](size_t gid) {
+      Rect r = passes_enc_state->shared.GroupRect(gid);
+      int64_t gcx = r.x0() + group_dim / 2;
+      int64_t gcy = r.y0() + group_dim / 2;
+      int64_t dx = gcx - cx;
+      int64_t dy = gcy - cy;
+      // The angle is determined by taking atan2 and adding an appropriate
+      // starting point depending on the side we want to start on.
+      double angle = std::remainder(
+          std::atan2(dy, dx) + kPi / 4 + side * (kPi / 2), 2 * kPi);
+      // Concentric squares in clockwise order.
+      return std::make_pair(std::max(std::abs(dx), std::abs(dy)), angle);
+    };
+    std::sort(ac_group_order.begin(), ac_group_order.end(),
+              [&](coeff_order_t a, coeff_order_t b) {
+                return get_distance_from_center(a) <
+                       get_distance_from_center(b);
+              });
+    std::vector<coeff_order_t> inv_ac_group_order(ac_group_order.size(), 0);
+    for (size_t i = 0; i < ac_group_order.size(); i++) {
+      inv_ac_group_order[ac_group_order[i]] = i;
+    }
+    for (size_t i = 0; i < num_passes; i++) {
+      size_t pass_start = permutation.size();
+      for (coeff_order_t v : inv_ac_group_order) {
+        permutation.push_back(pass_start + v);
+      }
+    }
+    std::vector<BitWriter> new_group_codes(group_codes.size());
+    for (size_t i = 0; i < permutation.size(); i++) {
+      new_group_codes[permutation[i]] = std::move(group_codes[i]);
+    }
+    group_codes = std::move(new_group_codes);
+  }
+
+  JXL_RETURN_IF_ERROR(
+      WriteGroupOffsets(group_codes, permutation_ptr, writer, aux_out));
+  writer->AppendByteAligned(group_codes);
+
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_frame.h b/third_party/jpeg-xl/lib/jxl/enc_frame.h
new file mode 100644
index 0000000000..b1dc637eb0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.h
@@ -0,0 +1,78 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FRAME_H_
+#define LIB_JXL_ENC_FRAME_H_
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Information needed for encoding a frame that is not contained elsewhere and
+// does not belong to `cparams`.
+// TODO(lode): if possible, it might be better to replace FrameInfo and several
+// fields from ImageBundle (such as frame name and duration) by direct usage of
+// jxl::FrameHeader itself.
+struct FrameInfo {
+  // TODO(veluca): consider adding more parameters, such as custom patches.
+  bool save_before_color_transform = false;
+  // True if the input image bundle still has to be converted to the codestream
+  // colorspace (as deduced by cparams); false if it is already in it.
+  // TODO(veluca): this is a hack - ImageBundle doesn't have a simple way to say
+  // "this is already in XYB".
+  bool ib_needs_color_transform = true;
+  FrameType frame_type = FrameType::kRegularFrame;
+  size_t dc_level = 0;
+  // Only used for kRegularFrame.
+  bool is_last = true;
+  bool is_preview = false;
+  // Information for storing this frame for future use (only for non-DC frames).
+  size_t save_as_reference = 0;
+  // The source frame for blending of a next frame, matching the
+  // save_as_reference value of a previous frame. Animated frames can use
+  // save_as_reference values 1, 2 and 3, while composite still frames can use
+  // save_as_reference values 0, 1, 2 and 3. The current C++ encoder
+  // implementation is assuming and using 1 for all frames of animations, so
+  // using that as the default value here.
+  // Corresponds to BlendingInfo::source from the FrameHeader.
+  size_t source = 1;
+  // Corresponds to BlendingInfo::clamp from the FrameHeader.
+  size_t clamp = 1;
+  // Corresponds to BlendingInfo::alpha_channel from the FrameHeader, or set to
+  // -1 to automatically choose it as the index of the first extra channel of
+  // type alpha.
+  int alpha_channel = -1;
+
+  // If non-empty, uses this blending info for the extra channels, otherwise
+  // automatically chooses it. The encoder API will fill this vector with the
+  // extra channel info and allows more options. The non-API cjxl leaves it
+  // empty and relies on the default behavior.
+  std::vector<BlendingInfo> extra_channel_blending_info;
+};
+
+// Checks and adjusts CompressParams when they are all initialized.
+Status ParamsPostInit(CompressParams* p);
+
+// Encodes a single frame (including its header) into a byte stream.  Groups may
+// be processed in parallel by `pool`. `metadata` is the codestream-level
+// CodecMetadata and must be used for the FrameHeaders; do not use ib.metadata()
+// instead.
+Status EncodeFrame(const CompressParams& cparams_orig,
+                   const FrameInfo& frame_info, const CodecMetadata* metadata,
+                   const ImageBundle& ib, PassesEncoderState* passes_enc_state,
+                   const JxlCmsInterface& cms, ThreadPool* pool,
+                   BitWriter* writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FRAME_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
new file mode 100644
index 0000000000..d57bb68b7f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
@@ -0,0 +1,61 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_gaborish.h"
+
+#include <stddef.h>
+
+#include <hwy/base.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+// Convolves each plane of `in_out` in place with a normalized symmetric 5x5
+// kernel whose off-center taps are the kGaborish values scaled by mul[plane].
+void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool) {
+  WeightsSymmetric5 weights[3];
+  // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5 are
+  // insufficient. These numbers come from butteraugli-based optimization of
+  // the whole system; the resulting errors are likely more favorable for good
+  // rate-distortion compromises than a purely mathematical inverse would be.
+  static const float kGaborish[5] = {
+      -0.090881924078487886f, -0.043663953593472138f, 0.01392497846646211f,
+      0.0036189602184591141f, 0.0030557936884763499f};
+  for (int i = 0; i < 3; ++i) {
+    double sum = 1.0 + mul[i] * 4 *
+                           (kGaborish[0] + kGaborish[1] + kGaborish[2] +
+                            kGaborish[4] + 2 * kGaborish[3]);
+    if (sum < 1e-5) {  // Clamp so that 1.0 / sum stays finite and positive.
+      sum = 1e-5;
+    }
+    const float normalize = static_cast<float>(1.0 / sum);
+    const float normalize_mul = mul[i] * normalize;
+    weights[i] = WeightsSymmetric5{{HWY_REP4(normalize)},
+                                   {HWY_REP4(normalize_mul * kGaborish[0])},
+                                   {HWY_REP4(normalize_mul * kGaborish[2])},
+                                   {HWY_REP4(normalize_mul * kGaborish[1])},
+                                   {HWY_REP4(normalize_mul * kGaborish[4])},
+                                   {HWY_REP4(normalize_mul * kGaborish[3])}};
+  }
+  // Reduce memory footprint by copying only one plane and rotating the outputs
+  // through the existing planes of in_out (tiling would be better still). A
+  // freshly *allocated* plane cannot be swapped in, as it might give the
+  // Image3F planes of different stride.
+  ImageF temp = CopyImage(in_out->Plane(2));
+  Symmetric5(in_out->Plane(0), Rect(*in_out), weights[0], pool,
+             &in_out->Plane(2));
+  Symmetric5(in_out->Plane(1), Rect(*in_out), weights[1], pool,
+             &in_out->Plane(0));
+  Symmetric5(temp, Rect(*in_out), weights[2], pool, &in_out->Plane(1));
+  // Planes 0, 1, 2 now hold the filtered channels 1, 2, 0.
+  in_out->Plane(0).Swap(in_out->Plane(1));
+  // After this swap they hold channels 2, 1, 0.
+  in_out->Plane(0).Swap(in_out->Plane(2));  // Back in order: 0, 1, 2.
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish.h b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
new file mode 100644
index 0000000000..102064f9a2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GABORISH_H_
+#define LIB_JXL_ENC_GABORISH_H_
+
+// Approximate inverse of the linear smoothing that is applied for deblocking.
+
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// Used in the encoder to reduce the impact of the decoder's smoothing; this is
+// not exact. Filters `in_out` (typically XYB) in place to reduce memory use,
+// using strength mul[c] for plane c. `pool` is forwarded to the convolution.
+void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GABORISH_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
new file mode 100644
index 0000000000..57a18e3338
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
@@ -0,0 +1,77 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_gaborish.h"
+
+#include <hwy/base.h>
+
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+// Returns WeightsSymmetric3 built from center weight 1, weight1 and weight2,
+// all divided by 1 + 4 * (weight1 + weight2); inputs need not be normalized.
+WeightsSymmetric3 GaborishKernel(float weight1, float weight2) {
+  constexpr float kCenter = 1.0f;
+  const float scale = 1.0f / (kCenter + 4 * (weight1 + weight2));
+  const float scaled_center = kCenter * scale;
+  const float scaled1 = weight1 * scale;
+  const float scaled2 = weight2 * scale;
+  const WeightsSymmetric3 kernel = {{HWY_REP4(scaled_center)},
+                                    {HWY_REP4(scaled1)},
+                                    {HWY_REP4(scaled2)}};
+  return kernel;
+}
+
+void ConvolveGaborish(const ImageF& in, float weight1, float weight2,
+                      ThreadPool* pool, ImageF* JXL_RESTRICT out) {
+  JXL_CHECK(SameSize(in, *out));  // `out` must already have the size of `in`.
+  Symmetric3(in, Rect(in), GaborishKernel(weight1, weight2), pool, out);
+}
+
+// Runs `in` through ConvolveGaborish (with weights 0, 0) and GaborishInverse,
+// then compares the result with `in` via VerifyRelativeError.
+void TestRoundTrip(const Image3F& in, float max_l1) {
+  ThreadPool* no_pool = nullptr;
+  Image3F round_trip(in.xsize(), in.ysize());
+  // Forward pass, one plane at a time.
+  for (size_t c = 0; c < 3; ++c) {
+    ConvolveGaborish(in.Plane(c), 0, 0, no_pool, &round_trip.Plane(c));
+  }
+  // The same inverse strength is used for all three planes.
+  constexpr float kStrength = 0.92718927264540152f;
+  float strengths[3] = {kStrength, kStrength, kStrength};
+  GaborishInverse(&round_trip, strengths, no_pool);
+  JXL_ASSERT_OK(VerifyRelativeError(in, round_trip, max_l1, 1E-4f, _));
+}
+
+TEST(GaborishTest, TestZero) {
+  Image3F zeros(20, 20);
+  ZeroFillImage(&zeros);
+  TestRoundTrip(zeros, /*max_l1=*/0.0f);  // All-zero input: no error allowed.
+}
+
+// Disabled: the round-trip difference for a single impulse is too large.
+#if 0
+TEST(GaborishTest, TestDirac) {
+  Image3F in(20, 20);
+  ZeroFillImage(&in);
+  in.PlaneRow(1, 10)[10] = 10.0f;
+  TestRoundTrip(in, 0.26f);
+}
+#endif
+
+TEST(GaborishTest, TestFlat) {
+  Image3F flat(20, 20);
+  FillImage(1.0f, &flat);
+  TestRoundTrip(flat, /*max_l1=*/1E-5f);  // Constant input, tiny tolerance.
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h b/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h
new file mode 100644
index 0000000000..0db7012bbe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GAMMA_CORRECT_H_
+#define LIB_JXL_ENC_GAMMA_CORRECT_H_
+
+// Deprecated: sRGB transfer function. Use color_management.h instead.
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+namespace jxl {
+
+// Decodes an sRGB-encoded value to linear; the result is clamped to [0, 1].
+static JXL_INLINE double Srgb8ToLinearDirect(double srgb) {
+  if (srgb >= 1.0) return 1.0;
+  // Linear segment of the curve, with non-positive inputs mapped to zero.
+  if (srgb <= 0.04045) return srgb <= 0.0 ? 0.0 : srgb / 12.92;
+  return std::pow((srgb + 0.055) / 1.055, 2.4);
+}
+
+// Encodes a linear value with the sRGB curve; the result is in [0, 1].
+static JXL_INLINE double LinearToSrgb8Direct(double linear) {
+  if (linear >= 1.0) return 1.0;
+  // Linear segment of the curve, with non-positive inputs mapped to zero.
+  if (linear <= 0.0031308) return linear <= 0.0 ? 0.0 : linear * 12.92;
+  return std::pow(linear, 1.0 / 2.4) * 1.055 - 0.055;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GAMMA_CORRECT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_group.cc b/third_party/jpeg-xl/lib/jxl/enc_group.cc
new file mode 100644
index 0000000000..074cf1553a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.cc
@@ -0,0 +1,426 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_group.h"
+
+#include <hwy/aligned_allocator.h>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quantizer-inl.h"
+#include "lib/jxl/quantizer.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::MaskFromVec;
+using hwy::HWY_NAMESPACE::Round;
+
+// NOTE: caller takes care of extracting quant from rect of RawQuantField.
+void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
+                     size_t c, float qm_multiplier, size_t quant_kind,
+                     size_t xsize, size_t ysize, float* thresholds,
+                     const float* JXL_RESTRICT block_in, int32_t* quant,
+                     int32_t* JXL_RESTRICT block_out) {
+  PROFILER_FUNC;  // Quantizes block_in (channel c) into integers in block_out.
+  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
+  float qac = quantizer.Scale() * (*quant);
+  // Not SIMD-fied for now. (error_diffusion is not read by this function.)
+  if (c != 1 && (xsize > 1 || ysize > 1)) {
+    for (int i = 0; i < 4; ++i) {
+      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);  // in place
+      if (thresholds[i] < 0.54) {
+        thresholds[i] = 0.54;
+      }
+    }
+  }
+  HWY_CAPPED(float, kBlockDim) df;
+  HWY_CAPPED(int32_t, kBlockDim) di;
+  HWY_CAPPED(uint32_t, kBlockDim) du;
+  const auto quantv = Set(df, qac * qm_multiplier);
+  for (size_t y = 0; y < ysize * kBlockDim; y++) {
+    size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
+    const size_t off = y * kBlockDim * xsize;
+    for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
+      auto thr = Zero(df);
+      if (xsize == 1) {
+        HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
+        const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
+        thr = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
+                         Set(df, thresholds[yfix]));
+      } else {
+        // Same for all lanes in the vector: index = yfix + (x in right half).
+        thr = Set(
+            df,
+            thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
+      }
+      const auto q = Mul(Load(df, qm + off + x), quantv);
+      const auto in = Load(df, block_in + off + x);
+      const auto val = Mul(q, in);
+      const auto nzero_mask = Ge(Abs(val), thr);  // |val| < thr is zeroed below
+      const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
+      Store(v, di, block_out + off + x);
+    }
+  }
+}
+
+void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c,
+                        float qm_multiplier, size_t quant_kind, size_t xsize,
+                        size_t ysize, float* thresholds,
+                        const float* JXL_RESTRICT block_in, int32_t* quant) {
+  // Adjusts *quant (and possibly thresholds[]) from statistics of how block_in
+  // would quantize. Returns early, changing nothing, for small block kinds:
+  // adjusting attempts to fix some known issues with larger blocks and on the
+  // 8x8 dct's emerging 8x8 blockiness when there are not many non-zeros.
+  constexpr size_t kPartialBlockKinds =
+      (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) |
+      (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) |
+      (1 << AcStrategy::Type::DCT8X4) | (1 << AcStrategy::Type::AFV0) |
+      (1 << AcStrategy::Type::AFV1) | (1 << AcStrategy::Type::AFV2) |
+      (1 << AcStrategy::Type::AFV3);
+  if ((1 << quant_kind) & kPartialBlockKinds) return;
+
+  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
+  float qac = quantizer.Scale() * (*quant);
+  if (xsize > 1 || ysize > 1) {
+    for (int i = 0; i < 4; ++i) {
+      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);  // in place
+      if (thresholds[i] < 0.54) {
+        thresholds[i] = 0.54;
+      }
+    }
+  }
+  float sum_of_highest_freq_row_and_column = 0;
+  float hfNonZeros[4] = {};  // Indexed by hfix: sum of |rounded value|.
+  float hfMaxError[4] = {};  // Indexed by hfix: max |val| zeroed (c == 1 only).
+
+  for (size_t y = 0; y < ysize * kBlockDim; y++) {
+    for (size_t x = 0; x < xsize * kBlockDim; x++) {
+      const size_t pos = y * kBlockDim * xsize + x;
+      if (x < xsize && y < ysize) {  // Skip the top-left xsize x ysize entries.
+        continue;
+      }
+      const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
+                           static_cast<size_t>(x >= xsize * kBlockDim / 2));
+      const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
+      const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
+      if (c == 1 && v == 0) {
+        const float error = std::abs(val);
+        if (hfMaxError[hfix] < error) {
+          hfMaxError[hfix] = error;
+        }
+      }
+      if (v != 0.0f) {
+        hfNonZeros[hfix] += std::abs(v);
+        if ((y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1) &&
+            (x >= xsize * 4 && y >= ysize * 4)) {
+          sum_of_highest_freq_row_and_column += std::abs(val);
+        }
+      }
+    }
+  }
+  if (c == 1) {
+    static const double kLimit = 0.49f;
+    for (int i = 1; i < 4; ++i) {  // Index 0 is never changed here.
+      if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit) {
+        thresholds[i] = 0.9999 * hfMaxError[i];  // Just under the largest zeroed.
+      }
+    }
+  }
+  // Heuristic for improving accuracy of high-frequency patterns
+  // occurring in an environment with no medium-frequency masking
+  // patterns. This should be improved later to be done in X and B
+  // planes too as 32x32 and larger transforms become rather ugly
+  // when this is not compensated for.
+  if (15 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+    constexpr int inc = 5;
+    *quant += inc;
+    if (8 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+      *quant += inc;
+    }
+    if (5 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+      *quant += inc;
+    }
+    if (3 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+      *quant += inc;
+    }
+    if (*quant >= Quantizer::kQuantMax) {  // Cap at kQuantMax - 1.
+      *quant = Quantizer::kQuantMax - 1;
+    }
+  }
+  if (quant_kind == AcStrategy::Type::DCT) {
+    // If this 8x8 block is too flat, increase the adaptive quantization level
+    // a bit to reduce visible block boundaries and requantize the block.
+    if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
+      *quant += 1;
+      if (*quant >= Quantizer::kQuantMax) {
+        *quant = Quantizer::kQuantMax - 1;
+      }
+    }
+  }
+  {
+    // Reduce quant in highly active areas.
+    int32_t div = (xsize + ysize) / 2;
+    int32_t activity = (hfNonZeros[0] + div / 2) / div;
+    int32_t orig_qp_limit = std::max(4, *quant / 2);  // Floor for the result.
+    for (int i = 1; i < 4; ++i) {  // activity = minimum over the four entries.
+      activity = std::min<int32_t>(activity, (hfNonZeros[i] + div / 2) / div);
+    }
+    if (activity >= 15) {
+      activity = 15;
+    }
+    int32_t qp = *quant - activity;
+    if (qp < orig_qp_limit) {
+      qp = orig_qp_limit;
+    }
+    *quant = qp;
+  }
+}
+
+// NOTE: caller takes care of extracting quant from rect of RawQuantField.
+void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
+                               const Quantizer& quantizer,
+                               const bool error_diffusion, size_t quant_kind,
+                               size_t xsize, size_t ysize,
+                               const float* JXL_RESTRICT biases, int32_t* quant,
+                               float* JXL_RESTRICT inout,
+                               int32_t* JXL_RESTRICT quantized) {
+  float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};  // Channel-1 thresholds.
+  {
+    int32_t max_quant = 0;
+    int quant_orig = *quant;
+    float val[3] = {enc_state->x_qm_multiplier, 1.0f,
+                    enc_state->b_qm_multiplier};
+    int clut[3] = {1, 0, 2};  // Order in which the channels are visited.
+    for (int ii = 0; ii < 3; ++ii) {
+      float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
+      int c = clut[ii];
+      *quant = quant_orig;  // Every channel starts from the caller's value.
+      AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
+                         &thres[0], inout + c * size, quant);
+      // Dead zone adjustment: only the thresholds from c == 1 are kept.
+      if (c == 1) {
+        for (int k = 0; k < 4; ++k) {
+          thres_y[k] = thres[k];
+        }
+      }
+      max_quant = std::max(*quant, max_quant);
+    }
+    *quant = max_quant;  // Largest adjusted value over the three channels.
+  }
+
+  QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
+                  &thres_y[0], inout + size, quant, quantized + size);
+
+  PROFILER_ZONE("enc quant adjust bias");
+  const float* JXL_RESTRICT dequant_matrix =
+      quantizer.DequantMatrix(quant_kind, 1);
+
+  HWY_CAPPED(float, kDCTBlockSize) df;
+  HWY_CAPPED(int32_t, kDCTBlockSize) di;
+  const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
+  for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
+    const auto quant = Load(di, quantized + size + k);  // Shadows the parameter.
+    const auto adj_quant = AdjustQuantBias(di, 1, quant, biases);
+    const auto dequantm = Load(df, dequant_matrix + k);
+    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
+  }  // inout + size now holds the dequantized channel-1 values.
+}
+
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc) {
+  PROFILER_FUNC;  // Transforms and quantizes every block of group group_idx.
+  const Rect block_group_rect = enc_state->shared.BlockGroupRect(group_idx);
+  const Rect group_rect = enc_state->shared.GroupRect(group_idx);
+  const Rect cmap_rect(
+      block_group_rect.x0() / kColorTileDimInBlocks,
+      block_group_rect.y0() / kColorTileDimInBlocks,
+      DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
+      DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
+
+  const size_t xsize_blocks = block_group_rect.xsize();
+  const size_t ysize_blocks = block_group_rect.ysize();
+
+  const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
+  const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());
+
+  ImageI& full_quant_field = enc_state->shared.raw_quant_field;
+  const CompressParams& cparams = enc_state->cparams;
+
+  // TODO(veluca): consider strategies to reduce this memory.
+  auto mem = hwy::AllocateAligned<int32_t>(3 * AcStrategy::kMaxCoeffArea);
+  auto fmem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  float* JXL_RESTRICT scratch_space =
+      fmem.get() + 3 * AcStrategy::kMaxCoeffArea;  // Tail of fmem is scratch.
+  {
+    // Only use error diffusion in Squirrel mode or slower.
+    const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
+    constexpr HWY_CAPPED(float, kDCTBlockSize) d;
+
+    int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
+    size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
+    JXL_DASSERT(num_passes > 0);
+    for (size_t i = 0; i < num_passes; i++) {
+      // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
+      JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32);
+      for (size_t c = 0; c < 3; c++) {
+        coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
+      }
+    }
+
+    HWY_ALIGN float* coeffs_in = fmem.get();  // 3 channels, `size` floats each.
+    HWY_ALIGN int32_t* quantized = mem.get();  // Same layout, quantized values.
+
+    for (size_t by = 0; by < ysize_blocks; ++by) {
+      int32_t* JXL_RESTRICT row_quant_ac =
+          block_group_rect.Row(&full_quant_field, by);
+      size_t ty = by / kColorTileDimInBlocks;
+      const int8_t* JXL_RESTRICT row_cmap[3] = {
+          cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
+          nullptr,
+          cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty),
+      };
+      const float* JXL_RESTRICT opsin_rows[3] = {
+          group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim),
+          group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim),
+          group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim),
+      };
+      float* JXL_RESTRICT dc_rows[3] = {
+          block_group_rect.PlaneRow(dc, 0, by),
+          block_group_rect.PlaneRow(dc, 1, by),
+          block_group_rect.PlaneRow(dc, 2, by),
+      };
+      AcStrategyRow ac_strategy_row =
+          enc_state->shared.ac_strategy.ConstRow(block_group_rect, by);
+      for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
+           tx++) {
+        const auto x_factor =
+            Set(d, enc_state->shared.cmap.YtoXRatio(row_cmap[0][tx]));
+        const auto b_factor =
+            Set(d, enc_state->shared.cmap.YtoBRatio(row_cmap[2][tx]));
+        for (size_t bx = tx * kColorTileDimInBlocks;
+             bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) {
+          const AcStrategy acs = ac_strategy_row[bx];
+          if (!acs.IsFirstBlock()) continue;  // Handled with its first block.
+
+          size_t xblocks = acs.covered_blocks_x();
+          size_t yblocks = acs.covered_blocks_y();
+
+          CoefficientLayout(&yblocks, &xblocks);
+
+          size_t size = kDCTBlockSize * xblocks * yblocks;
+
+          // Transform all three channels, roundtrip-quantize Y and set its DC.
+          int32_t quant_ac = row_quant_ac[bx];
+          for (size_t c : {0, 1, 2}) {
+            TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
+                                opsin_stride, coeffs_in + c * size,
+                                scratch_space);
+          }
+          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
+                                  dc_rows[1] + bx, dc_stride);
+
+          QuantizeRoundtripYBlockAC(
+              enc_state, size, enc_state->shared.quantizer, error_diffusion,
+              acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac,
+              coeffs_in, quantized);
+
+          // Unapply color correlation (in_y is the roundtripped Y from above).
+          for (size_t k = 0; k < size; k += Lanes(d)) {
+            const auto in_x = Load(d, coeffs_in + k);
+            const auto in_y = Load(d, coeffs_in + size + k);
+            const auto in_b = Load(d, coeffs_in + 2 * size + k);
+            const auto out_x = NegMulAdd(x_factor, in_y, in_x);
+            const auto out_b = NegMulAdd(b_factor, in_y, in_b);
+            Store(out_x, d, coeffs_in + k);
+            Store(out_b, d, coeffs_in + 2 * size + k);
+          }
+
+          // Quantize X and B channels and set DC.
+          for (size_t c : {0, 2}) {
+            float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
+            QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
+                            c == 0 ? enc_state->x_qm_multiplier
+                                   : enc_state->b_qm_multiplier,
+                            acs.RawStrategy(), xblocks, yblocks, &thres[0],
+                            coeffs_in + c * size, &quant_ac,
+                            quantized + c * size);
+            DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
+                                    dc_rows[c] + bx, dc_stride);
+          }
+          row_quant_ac[bx] = quant_ac;  // Store the adjusted quant value back.
+          for (size_t c = 0; c < 3; c++) {
+            enc_state->progressive_splitter.SplitACCoefficients(
+                quantized + c * size, acs, bx, by, coeffs[c]);
+            for (size_t p = 0; p < num_passes; p++) {
+              coeffs[c][p] += size;  // Advance each pass's output pointer.
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ComputeCoefficients);  // Makes the per-target versions dispatchable.
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc) {
+  return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin,
+                                                   dc);  // Forward unchanged.
+}
+
+Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
+                                        size_t histogram_idx,
+                                        const PassesEncoderState& enc_state,
+                                        BitWriter* writer, AuxOut* aux_out) {
+  // Select which histogram to use among those of the current pass.
+  const size_t num_histograms = enc_state.shared.num_histograms;
+  // num_histograms is 0 only for lossless. NOTE(review): is 0 valid below?
+  JXL_ASSERT(num_histograms == 0 || histogram_idx < num_histograms);
+  size_t histo_selector_bits = CeilLog2Nonzero(num_histograms);
+
+  if (histo_selector_bits != 0) {  // Selector is written only if it has bits.
+    BitWriter::Allotment allotment(writer, histo_selector_bits);
+    writer->Write(histo_selector_bits, histogram_idx);
+    allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
+  }
+  WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx],
+              enc_state.passes[pass_idx].codes,
+              enc_state.passes[pass_idx].context_map, writer, kLayerACTokens,
+              aux_out);
+
+  return true;  // Always succeeds; no failure path in this function.
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_group.h b/third_party/jpeg-xl/lib/jxl/enc_group.h
new file mode 100644
index 0000000000..0caf408a03
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GROUP_H_
+#define LIB_JXL_ENC_GROUP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct PassesEncoderState;
+
+// Computes the quantized AC coefficients of group `group_idx` and fills `dc`.
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc);
+
+Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
+                                        size_t histogram_idx,
+                                        const PassesEncoderState& enc_state,
+                                        BitWriter* writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GROUP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
new file mode 100644
index 0000000000..18122fa769
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
@@ -0,0 +1,948 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_heuristics.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>
+#include <string>
+
+#include "lib/jxl/enc_ac_strategy.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_ar_control_field.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_chroma_from_luma.h"
+#include "lib/jxl/enc_gaborish.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_noise.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_photon_noise.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/enc_xyb.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
+  if (enc_state.cparams.decoding_speed_tier >= 1) {
+    static constexpr uint8_t kSimpleCtxMap[] = {
+        // Cluster all blocks together: row 1 -> ctx 0, rows 2 and 3 -> ctx 1.
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  //
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  //
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  //
+    };
+    static_assert(
+        3 * kNumOrders == sizeof(kSimpleCtxMap) / sizeof *kSimpleCtxMap,
+        "Update simple context map");
+
+    auto& bcm = enc_state.shared.block_ctx_map;  // By reference: must persist.
+    bcm.ctx_map.assign(std::begin(kSimpleCtxMap), std::end(kSimpleCtxMap));
+    bcm.num_ctxs = 2;
+    bcm.num_dc_ctxs = 1;
+    return;
+  }
+  if (enc_state.cparams.speed_tier >= SpeedTier::kFalcon) {
+    return;  // Block context map is left as it was.
+  }
+  const ImageI& rqf = enc_state.shared.raw_quant_field;
+  // No need to change context modeling for small images.
+  size_t tot = rqf.xsize() * rqf.ysize();
+  size_t size_for_ctx_model =
+      (1 << 10) * enc_state.cparams.butteraugli_distance;
+  if (tot < size_for_ctx_model) return;
+
+  struct OccCounters {
+    // count the occurrences of each qf value and each strategy type.
+    OccCounters(const ImageI& rqf, const AcStrategyImage& ac_strategy) {
+      for (size_t y = 0; y < rqf.ysize(); y++) {
+        const int32_t* qf_row = rqf.Row(y);
+        AcStrategyRow acs_row = ac_strategy.ConstRow(y);
+        for (size_t x = 0; x < rqf.xsize(); x++) {
+          int ord = kStrategyOrder[acs_row[x].RawStrategy()];
+          int qf = qf_row[x] - 1;  // Index into 256 bins; assumes 1 <= qf <= 256.
+          qf_counts[qf]++;
+          qf_ord_counts[ord][qf]++;
+          ord_counts[ord]++;
+        }
+      }
+    }
+
+    size_t qf_counts[256] = {};
+    size_t qf_ord_counts[kNumOrders][256] = {};
+    size_t ord_counts[kNumOrders] = {};
+  };
+  // The OccCounters struct is too big to allocate on the stack.
+  std::unique_ptr<OccCounters> counters(
+      new OccCounters(rqf, enc_state.shared.ac_strategy));
+
+  // Splitting the context model according to the quantization field seems to
+  // mostly benefit only large images.
+  size_t size_for_qf_split = (1 << 13) * enc_state.cparams.butteraugli_distance;
+  size_t num_qf_segments = tot < size_for_qf_split ? 1 : 2;
+  std::vector<uint32_t>& qft = enc_state.shared.block_ctx_map.qf_thresholds;
+  qft.clear();
+  // Divide the quant field in up to num_qf_segments segments.
+  size_t cumsum = 0;
+  size_t next = 1;
+  size_t last_cut = 256;
+  size_t cut = tot * next / num_qf_segments;
+  for (uint32_t j = 0; j < 256; j++) {
+    cumsum += counters->qf_counts[j];
+    if (cumsum > cut) {
+      if (j != 0) {
+        qft.push_back(j);
+      }
+      last_cut = j;
+      while (cumsum > cut) {
+        next++;
+        cut = tot * next / num_qf_segments;
+      }
+    } else if (next > qft.size() + 1) {
+      if (j - 1 == last_cut && j != 0) {
+        qft.push_back(j);
+      }
+    }
+  }
+
+  // Count the occurrences of each segment.
+  std::vector<size_t> counts(kNumOrders * (qft.size() + 1));
+  size_t qft_pos = 0;
+  for (size_t j = 0; j < 256; j++) {
+    if (qft_pos < qft.size() && j == qft[qft_pos]) {
+      qft_pos++;
+    }
+    for (size_t i = 0; i < kNumOrders; i++) {
+      counts[qft_pos + i * (qft.size() + 1)] += counters->qf_ord_counts[i][j];
+    }
+  }
+
+  // Repeatedly merge the lowest-count pair.
+  std::vector<uint8_t> remap((qft.size() + 1) * kNumOrders);
+  std::iota(remap.begin(), remap.end(), 0);
+  std::vector<uint8_t> clusters(remap);
+  size_t nb_clusters = Clamp1((int)(tot / size_for_ctx_model / 2), 2, 9);
+  size_t nb_clusters_chroma = Clamp1((int)(tot / size_for_ctx_model / 3), 1, 5);
+  // This is O(n^2 log n), but n is small.
+  while (clusters.size() > nb_clusters) {
+    std::sort(clusters.begin(), clusters.end(),
+              [&](int a, int b) { return counts[a] > counts[b]; });
+    counts[clusters[clusters.size() - 2]] += counts[clusters.back()];
+    counts[clusters.back()] = 0;
+    remap[clusters.back()] = clusters[clusters.size() - 2];
+    clusters.pop_back();
+  }
+  for (size_t i = 0; i < remap.size(); i++) {  // Follow merge chains to the end.
+    while (remap[remap[i]] != remap[i]) {
+      remap[i] = remap[remap[i]];
+    }
+  }
+  // Relabel starting from 0.
+  std::vector<uint8_t> remap_remap(remap.size(), remap.size());
+  size_t num = 0;
+  for (size_t i = 0; i < remap.size(); i++) {
+    if (remap_remap[remap[i]] == remap.size()) {  // Label not assigned yet.
+      remap_remap[remap[i]] = num++;
+    }
+    remap[i] = remap_remap[remap[i]];
+  }
+  // Write the block context map.
+  auto& ctx_map = enc_state.shared.block_ctx_map.ctx_map;
+  ctx_map = remap;
+  ctx_map.resize(remap.size() * 3);
+  // for chroma, only use up to nb_clusters_chroma separate block contexts
+  // (those for the biggest clusters)
+  for (size_t i = remap.size(); i < remap.size() * 3; i++) {
+    ctx_map[i] = num + Clamp1((int)remap[i % remap.size()], 0,
+                              (int)nb_clusters_chroma - 1);
+  }
+  enc_state.shared.block_ctx_map.num_ctxs =
+      *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+}
+
+}  // namespace
+
+void FindBestDequantMatrices(const CompressParams& cparams,
+                             const Image3F& opsin,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             DequantMatrices* dequant_matrices) {
+  // TODO(veluca): quant matrices for no-gaborish.
+  // TODO(veluca): heuristics for in-bitstream quant tables.
+  *dequant_matrices = DequantMatrices();  // Reset; opsin is not read here.
+  if (cparams.max_error_mode) {
+    // Set numerators of all quantization matrices to constant values.
+    float weights[3][1] = {{1.0f / cparams.max_error[0]},
+                           {1.0f / cparams.max_error[1]},
+                           {1.0f / cparams.max_error[2]}};
+    DctQuantWeightParams dct_params(weights);
+    std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                         QuantEncoding::DCT(dct_params));
+    DequantMatricesSetCustom(dequant_matrices, encodings,
+                             modular_frame_encoder);
+    float dc_weights[3] = {1.0f / cparams.max_error[0],
+                           1.0f / cparams.max_error[1],
+                           1.0f / cparams.max_error[2]};  // Same values for DC.
+    DequantMatricesSetCustomDC(dequant_matrices, dc_weights);
+  }
+}
+
+bool DefaultEncoderHeuristics::HandlesColorConversion(
+    const CompressParams& cparams, const ImageBundle& ib) {
+  return cparams.noise != Override::kOn && cparams.patches != Override::kOn &&
+         cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 &&
+         cparams.color_transform == ColorTransform::kXYB &&
+         !cparams.modular_mode && !ib.HasAlpha();  // True only if all hold.
+}
+
+namespace {
+
+void StoreMin2(const float v, float& min1, float& min2) {
+  if (v < min2) {  // v replaces one of the two tracked values (min1, min2).
+    if (v < min1) {
+      min2 = min1;  // Previous smallest becomes the second smallest.
+      min1 = v;
+    } else {
+      min2 = v;  // v lies between min1 and the old min2.
+    }
+  }
+}
+
+void CreateMask(const ImageF& image, ImageF& mask) {
+  for (size_t y = 0; y < image.ysize(); y++) {
+    auto* row_n = y > 0 ? image.Row(y - 1) : image.Row(y);  // Top edge: self.
+    auto* row_in = image.Row(y);
+    auto* row_s = y + 1 < image.ysize() ? image.Row(y + 1) : image.Row(y);
+    auto* row_out = mask.Row(y);
+    for (size_t x = 0; x < image.xsize(); x++) {
+      // Center, west, east, north, south values and their absolute difference
+      float c = row_in[x];
+      float w = x > 0 ? row_in[x - 1] : row_in[x];
+      float e = x + 1 < image.xsize() ? row_in[x + 1] : row_in[x];
+      float n = row_n[x];
+      float s = row_s[x];
+      float dw = std::abs(c - w);
+      float de = std::abs(c - e);
+      float dn = std::abs(c - n);
+      float ds = std::abs(c - s);
+      float min = std::numeric_limits<float>::max();
+      float min2 = std::numeric_limits<float>::max();
+      StoreMin2(dw, min, min2);
+      StoreMin2(de, min, min2);
+      StoreMin2(dn, min, min2);
+      StoreMin2(ds, min, min2);
+      row_out[x] = min2;  // Second-smallest of the four differences.
+    }
+  }
+}
+
+// Downsamples the image by a factor of 2 with a kernel that's sharper than
+// the standard 2x2 box kernel used by DownsampleImage.
+// The kernel is optimized against the result of the 2x2 upsampling kernel used
+// by the decoder. Ringing is slightly reduced by clamping the values of the
+// resulting pixels within certain bounds of a small region in the original
+// image.
+void DownsampleImage2_Sharper(const ImageF& input, ImageF* output) {
+  const int64_t kernelx = 12;
+  const int64_t kernely = 12;
+
+  static const float kernel[144] = {
+      -0.000314256996835, -0.000314256996835, -0.000897597057705,
+      -0.000562751488849, -0.000176807273646, 0.001864627368902,
+      0.001864627368902,  -0.000176807273646, -0.000562751488849,
+      -0.000897597057705, -0.000314256996835, -0.000314256996835,
+      -0.000314256996835, -0.001527942804748, -0.000121760530512,
+      0.000191123989093,  0.010193185932466,  0.058637519197110,
+      0.058637519197110,  0.010193185932466,  0.000191123989093,
+      -0.000121760530512, -0.001527942804748, -0.000314256996835,
+      -0.000897597057705, -0.000121760530512, 0.000946363683751,
+      0.007113577630288,  0.000437956841058,  -0.000372823835211,
+      -0.000372823835211, 0.000437956841058,  0.007113577630288,
+      0.000946363683751,  -0.000121760530512, -0.000897597057705,
+      -0.000562751488849, 0.000191123989093,  0.007113577630288,
+      0.044592622228814,  0.000222278879007,  -0.162864473015945,
+      -0.162864473015945, 0.000222278879007,  0.044592622228814,
+      0.007113577630288,  0.000191123989093,  -0.000562751488849,
+      -0.000176807273646, 0.010193185932466,  0.000437956841058,
+      0.000222278879007,  -0.000913092543974, -0.017071696107902,
+      -0.017071696107902, -0.000913092543974, 0.000222278879007,
+      0.000437956841058,  0.010193185932466,  -0.000176807273646,
+      0.001864627368902,  0.058637519197110,  -0.000372823835211,
+      -0.162864473015945, -0.017071696107902, 0.414660099370354,
+      0.414660099370354,  -0.017071696107902, -0.162864473015945,
+      -0.000372823835211, 0.058637519197110,  0.001864627368902,
+      0.001864627368902,  0.058637519197110,  -0.000372823835211,
+      -0.162864473015945, -0.017071696107902, 0.414660099370354,
+      0.414660099370354,  -0.017071696107902, -0.162864473015945,
+      -0.000372823835211, 0.058637519197110,  0.001864627368902,
+      -0.000176807273646, 0.010193185932466,  0.000437956841058,
+      0.000222278879007,  -0.000913092543974, -0.017071696107902,
+      -0.017071696107902, -0.000913092543974, 0.000222278879007,
+      0.000437956841058,  0.010193185932466,  -0.000176807273646,
+      -0.000562751488849, 0.000191123989093,  0.007113577630288,
+      0.044592622228814,  0.000222278879007,  -0.162864473015945,
+      -0.162864473015945, 0.000222278879007,  0.044592622228814,
+      0.007113577630288,  0.000191123989093,  -0.000562751488849,
+      -0.000897597057705, -0.000121760530512, 0.000946363683751,
+      0.007113577630288,  0.000437956841058,  -0.000372823835211,
+      -0.000372823835211, 0.000437956841058,  0.007113577630288,
+      0.000946363683751,  -0.000121760530512, -0.000897597057705,
+      -0.000314256996835, -0.001527942804748, -0.000121760530512,
+      0.000191123989093,  0.010193185932466,  0.058637519197110,
+      0.058637519197110,  0.010193185932466,  0.000191123989093,
+      -0.000121760530512, -0.001527942804748, -0.000314256996835,
+      -0.000314256996835, -0.000314256996835, -0.000897597057705,
+      -0.000562751488849, -0.000176807273646, 0.001864627368902,
+      0.001864627368902,  -0.000176807273646, -0.000562751488849,
+      -0.000897597057705, -0.000314256996835, -0.000314256996835};
+
+  int64_t xsize = input.xsize();
+  int64_t ysize = input.ysize();
+
+  ImageF box_downsample = CopyImage(input);
+  DownsampleImage(&box_downsample, 2);
+
+  ImageF mask(box_downsample.xsize(), box_downsample.ysize());
+  CreateMask(box_downsample, mask);
+
+  for (size_t y = 0; y < output->ysize(); y++) {
+    float* row_out = output->Row(y);
+    const float* row_in[kernely];
+    const float* row_mask = mask.Row(y);
+    // get the rows in the support
+    for (size_t ky = 0; ky < kernely; ky++) {
+      int64_t iy = y * 2 + ky - (kernely - 1) / 2;
+      if (iy < 0) iy = 0;
+      if (iy >= ysize) iy = ysize - 1;
+      row_in[ky] = input.Row(iy);
+    }
+
+    for (size_t x = 0; x < output->xsize(); x++) {
+      // get min and max values of the original image in the support
+      float min = std::numeric_limits<float>::max();
+      float max = std::numeric_limits<float>::min();
+      // kernelx - R and kernely - R are the radius of a rectangular region in
+      // which the values of a pixel are bounded to reduce ringing.
+      static constexpr int64_t R = 5;
+      for (int64_t ky = R; ky + R < kernely; ky++) {
+        for (int64_t kx = R; kx + R < kernelx; kx++) {
+          int64_t ix = x * 2 + kx - (kernelx - 1) / 2;
+          if (ix < 0) ix = 0;
+          if (ix >= xsize) ix = xsize - 1;
+          min = std::min<float>(min, row_in[ky][ix]);
+          max = std::max<float>(max, row_in[ky][ix]);
+        }
+      }
+
+      float sum = 0;
+      for (int64_t ky = 0; ky < kernely; ky++) {
+        for (int64_t kx = 0; kx < kernelx; kx++) {
+          int64_t ix = x * 2 + kx - (kernelx - 1) / 2;
+          if (ix < 0) ix = 0;
+          if (ix >= xsize) ix = xsize - 1;
+          sum += row_in[ky][ix] * kernel[ky * kernelx + kx];
+        }
+      }
+
+      row_out[x] = sum;
+
+      // Clamp the pixel within the values of a small area to prevent ringing.
+      // The mask determines how much to clamp, clamp more to reduce more
+      // ringing in smooth areas, clamp less in noisy areas to get more
+      // sharpness. Higher mask_multiplier gives less clamping, so less
+      // ringing reduction.
+      const constexpr float mask_multiplier = 1;
+      float a = row_mask[x] * mask_multiplier;
+      float clip_min = min - a;
+      float clip_max = max + a;
+      if (row_out[x] < clip_min) {
+        row_out[x] = clip_min;
+      } else if (row_out[x] > clip_max) {
+        row_out[x] = clip_max;
+      }
+    }
+  }
+}
+
+void DownsampleImage2_Sharper(Image3F* opsin) {  // in-place, all 3 planes
+  // Over-allocate by kBlockDim, then shrink, so later padding needs no realloc.
+  Image3F downsampled(DivCeil(opsin->xsize(), 2) + kBlockDim,
+                      DivCeil(opsin->ysize(), 2) + kBlockDim);
+  downsampled.ShrinkTo(downsampled.xsize() - kBlockDim,
+                       downsampled.ysize() - kBlockDim);
+
+  for (size_t c = 0; c < 3; c++) {  // each plane is processed independently
+    DownsampleImage2_Sharper(opsin->Plane(c), &downsampled.Plane(c));
+  }
+  *opsin = std::move(downsampled);  // replace the input with the result
+}
+
+// The default upsampling kernels used by Upsampler in the decoder.
+static const constexpr int64_t kSize = 5;  // kernels are kSize x kSize taps
+
+static const float kernel00[25] = {  // output pixels with even x, even y
+    -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f,
+    -0.03452303f, 0.14111091f,  0.28896755f,  0.00278718f,  -0.01610267f,
+    -0.04022174f, 0.28896755f,  0.56661550f,  0.03777607f,  -0.01986694f,
+    -0.02921014f, 0.00278718f,  0.03777607f,  -0.03144731f, -0.01185068f,
+    -0.00624645f, -0.01610267f, -0.01986694f, -0.01185068f, -0.00213539f,
+};
+static const float kernel01[25] = {  // even x, odd y: kernel00 rows reversed
+    -0.00624645f, -0.01610267f, -0.01986694f, -0.01185068f, -0.00213539f,
+    -0.02921014f, 0.00278718f,  0.03777607f,  -0.03144731f, -0.01185068f,
+    -0.04022174f, 0.28896755f,  0.56661550f,  0.03777607f,  -0.01986694f,
+    -0.03452303f, 0.14111091f,  0.28896755f,  0.00278718f,  -0.01610267f,
+    -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f,
+};
+static const float kernel10[25] = {  // odd x, even y: kernel00 columns reversed
+    -0.00624645f, -0.02921014f, -0.04022174f, -0.03452303f, -0.01716200f,
+    -0.01610267f, 0.00278718f,  0.28896755f,  0.14111091f,  -0.03452303f,
+    -0.01986694f, 0.03777607f,  0.56661550f,  0.28896755f,  -0.04022174f,
+    -0.01185068f, -0.03144731f, 0.03777607f,  0.00278718f,  -0.02921014f,
+    -0.00213539f, -0.01185068f, -0.01986694f, -0.01610267f, -0.00624645f,
+};
+static const float kernel11[25] = {  // odd x, odd y: kernel00 flipped both ways
+    -0.00213539f, -0.01185068f, -0.01986694f, -0.01610267f, -0.00624645f,
+    -0.01185068f, -0.03144731f, 0.03777607f,  0.00278718f,  -0.02921014f,
+    -0.01986694f, 0.03777607f,  0.56661550f,  0.28896755f,  -0.04022174f,
+    -0.01610267f, 0.00278718f,  0.28896755f,  0.14111091f,  -0.03452303f,
+    -0.00624645f, -0.02921014f, -0.04022174f, -0.03452303f, -0.01716200f,
+};
+
+// 2x upsamples `input` into the pre-sized `output` with the 5x5 kernels above
+// (picked by output-pixel parity, edges replicated) and clamps each result to
+// the min/max of its 5x5 input support. Meant to match the decoder's Upsampler
+// with default CustomTransformData. TODO(lode): use Upsampler instead; it needs
+// pre-initialization and left padding, so the callers need refactoring first.
+static void UpsampleImage(const ImageF& input, ImageF* output) {
+  int64_t xsize = input.xsize();
+  int64_t ysize = input.ysize();
+  int64_t xsize2 = output->xsize();
+  int64_t ysize2 = output->ysize();
+  for (int64_t y = 0; y < ysize2; y++) {
+    for (int64_t x = 0; x < xsize2; x++) {
+      auto kernel = kernel00;  // even x, even y
+      if ((x & 1) && (y & 1)) {
+        kernel = kernel11;
+      } else if (x & 1) {
+        kernel = kernel10;
+      } else if (y & 1) {
+        kernel = kernel01;
+      }
+      float sum = 0;
+      int64_t x2 = x / 2;  // input pixel under the kernel centre
+      int64_t y2 = y / 2;
+
+      // get min and max values of the original image in the support
+      float min = std::numeric_limits<float>::max();
+      float max = std::numeric_limits<float>::lowest();  // min() would be > 0
+
+      for (int64_t ky = 0; ky < kSize; ky++) {
+        for (int64_t kx = 0; kx < kSize; kx++) {
+          int64_t xi = x2 - kSize / 2 + kx;
+          int64_t yi = y2 - kSize / 2 + ky;
+          if (xi < 0) xi = 0;  // replicate the border pixels
+          if (xi >= xsize) xi = input.xsize() - 1;
+          if (yi < 0) yi = 0;
+          if (yi >= ysize) yi = input.ysize() - 1;
+          min = std::min<float>(min, input.Row(yi)[xi]);
+          max = std::max<float>(max, input.Row(yi)[xi]);
+        }
+      }
+
+      for (int64_t ky = 0; ky < kSize; ky++) {
+        for (int64_t kx = 0; kx < kSize; kx++) {
+          int64_t xi = x2 - kSize / 2 + kx;
+          int64_t yi = y2 - kSize / 2 + ky;
+          if (xi < 0) xi = 0;  // same border handling as for min/max above
+          if (xi >= xsize) xi = input.xsize() - 1;
+          if (yi < 0) yi = 0;
+          if (yi >= ysize) yi = input.ysize() - 1;
+          sum += input.Row(yi)[xi] * kernel[ky * kSize + kx];
+        }
+      }
+      output->Row(y)[x] = sum;
+      if (output->Row(y)[x] < min) output->Row(y)[x] = min;  // limit overshoot
+      if (output->Row(y)[x] > max) output->Row(y)[x] = max;
+    }
+  }
+}
+
+// Returns the derivative of Upsampler, with respect to input pixel x2, y2, to
+// output pixel x, y (ignoring the clamping).
+float UpsamplerDeriv(int64_t x2, int64_t y2, int64_t x, int64_t y) {
+  auto kernel = kernel00;  // same parity-based choice as in UpsampleImage
+  if ((x & 1) && (y & 1)) {
+    kernel = kernel11;
+  } else if (x & 1) {
+    kernel = kernel10;
+  } else if (y & 1) {
+    kernel = kernel01;
+  }
+
+  int64_t ix = x / 2;  // input pixel under the kernel centre of (x, y)
+  int64_t iy = y / 2;
+  int64_t kx = x2 - ix + kSize / 2;  // tap column that reads input column x2
+  int64_t ky = y2 - iy + kSize / 2;  // tap row that reads input row y2
+
+  // (x2, y2) lies outside the kernel support of (x, y); should not happen.
+  if (kx < 0 || kx >= kSize || ky < 0 || ky >= kSize) return 0;
+
+  return kernel[ky * kSize + kx];
+}
+
+// Apply the derivative of the Upsampler to the input, reversing the effect of
+// its coefficients. The output image is 2x2 times smaller than the input.
+void AntiUpsample(const ImageF& input, ImageF* d) {
+  int64_t xsize = input.xsize();
+  int64_t ysize = input.ysize();
+  int64_t xsize2 = d->xsize();
+  int64_t ysize2 = d->ysize();
+  int64_t k0 = kSize - 1;  // window starts k0 input pixels before 2 * x2
+  int64_t k1 = kSize;      // and ends k1 pixels after it (x1/y1 are exclusive)
+  for (int64_t y2 = 0; y2 < ysize2; ++y2) {
+    auto* row = d->Row(y2);
+    for (int64_t x2 = 0; x2 < xsize2; ++x2) {
+      int64_t x0 = x2 * 2 - k0;
+      if (x0 < 0) x0 = 0;  // crop the window to the image
+      int64_t x1 = x2 * 2 + k1 + 1;
+      if (x1 > xsize) x1 = xsize;
+      int64_t y0 = y2 * 2 - k0;
+      if (y0 < 0) y0 = 0;
+      int64_t y1 = y2 * 2 + k1 + 1;
+      if (y1 > ysize) y1 = ysize;
+
+      float sum = 0;
+      for (int64_t y = y0; y < y1; ++y) {
+        const auto* row_in = input.Row(y);
+        for (int64_t x = x0; x < x1; ++x) {
+          double deriv = UpsamplerDeriv(x2, y2, x, y);  // weight of (x2,y2)
+          sum += deriv * row_in[x];
+        }
+      }
+      row[x2] = sum;
+    }
+  }
+}
+
+// Element-wise multiplies two images; `out` may be one of the inputs.
+template <typename T>
+void ElwiseMul(const Plane<T>& image1, const Plane<T>& image2, Plane<T>* out) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());  // all three images must match in size
+  JXL_CHECK(ysize == image2.ysize());
+  JXL_CHECK(xsize == out->xsize());
+  JXL_CHECK(ysize == out->ysize());
+  for (size_t y = 0; y < ysize; ++y) {
+    // No restrict qualifiers: callers in this file pass an input as `out`.
+    const T* const row1 = image1.Row(y);
+    const T* const row2 = image2.Row(y);
+    T* const row_out = out->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_out[x] = row1[x] * row2[x];
+    }
+  }
+}
+
+// Element-wise divides image1 by image2; `out` may be one of the inputs.
+template <typename T>
+void ElwiseDiv(const Plane<T>& image1, const Plane<T>& image2, Plane<T>* out) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());  // all three images must match in size
+  JXL_CHECK(ysize == image2.ysize());
+  JXL_CHECK(xsize == out->xsize());
+  JXL_CHECK(ysize == out->ysize());
+  for (size_t y = 0; y < ysize; ++y) {
+    // No restrict qualifiers: callers in this file pass an input as `out`.
+    const T* const row1 = image1.Row(y);
+    const T* const row2 = image2.Row(y);
+    T* const row_out = out->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_out[x] = row1[x] / row2[x];
+    }
+  }
+}
+
+void ReduceRinging(const ImageF& initial, const ImageF& mask, ImageF& down) {
+  int64_t xsize2 = down.xsize();  // `down` is modified in place
+  int64_t ysize2 = down.ysize();
+
+  for (size_t y = 0; y < down.ysize(); y++) {
+    const float* row_mask = mask.Row(y);
+    float* row_out = down.Row(y);
+    for (size_t x = 0; x < down.xsize(); x++) {
+      float v = down.Row(y)[x];
+      float min = initial.Row(y)[x];  // seed the range with the centre pixel
+      float max = initial.Row(y)[x];
+      for (int64_t yi = -1; yi < 2; yi++) {
+        for (int64_t xi = -1; xi < 2; xi++) {
+          int64_t x2 = (int64_t)x + xi;
+          int64_t y2 = (int64_t)y + yi;
+          if (x2 < 0 || y2 < 0 || x2 >= (int64_t)xsize2 ||
+              y2 >= (int64_t)ysize2)
+            continue;  // neighbours outside the image are ignored
+          min = std::min<float>(min, initial.Row(y2)[x2]);
+          max = std::max<float>(max, initial.Row(y2)[x2]);
+        }
+      }
+
+      row_out[x] = v;  // no-op store: v was read from this same pixel
+
+      // Clamp the pixel to the min/max of its 3x3 neighborhood in `initial`,
+      // widened by a mask-dependent margin, to prevent ringing. The mask
+      // determines how much to clamp: clamp more to reduce more ringing in
+      // smooth areas, clamp less in noisy areas to get more sharpness. Higher
+      // mask_multiplier gives less clamping, so less ringing reduction.
+      const constexpr float mask_multiplier = 2;
+      float a = row_mask[x] * mask_multiplier;  // margin added on both sides
+      float clip_min = min - a;
+      float clip_max = max + a;
+      if (row_out[x] < clip_min) row_out[x] = clip_min;
+      if (row_out[x] > clip_max) row_out[x] = clip_max;
+    }
+  }
+}
+
+// Iteratively refined 2x downsample. TODO(lode): move to enc_downsample.cc
+void DownsampleImage2_Iterative(const ImageF& orig, ImageF* output) {
+  int64_t xsize = orig.xsize();
+  int64_t ysize = orig.ysize();
+  int64_t xsize2 = DivCeil(orig.xsize(), 2);  // half size, rounded up
+  int64_t ysize2 = DivCeil(orig.ysize(), 2);
+
+  ImageF box_downsample = CopyImage(orig);
+  DownsampleImage(&box_downsample, 2);
+  ImageF mask(box_downsample.xsize(), box_downsample.ysize());
+  CreateMask(box_downsample, mask);  // later scales the anti-ringing clamp
+
+  output->ShrinkTo(xsize2, ysize2);
+
+  // Initial result image using the sharper downsampling.
+  // Allocate extra space to avoid a reallocation when padding.
+  ImageF initial(DivCeil(orig.xsize(), 2) + kBlockDim,
+                 DivCeil(orig.ysize(), 2) + kBlockDim);
+  initial.ShrinkTo(initial.xsize() - kBlockDim, initial.ysize() - kBlockDim);
+  DownsampleImage2_Sharper(orig, &initial);
+
+  ImageF down = CopyImage(initial);  // running estimate, refined below
+  ImageF up(xsize, ysize);           // `down` upsampled back to full size
+  ImageF corr(xsize, ysize);         // full-resolution correction
+  ImageF corr2(xsize2, ysize2);      // correction at half resolution
+
+  // In the weights map, relatively higher values will allow less ringing but
+  // also less sharpness. With all constant values, it optimizes equally
+  // everywhere. Even in this case, the weights2 computed from
+  // this is still used and differs at the borders of the image.
+  // TODO(lode): Make use of the weights field for anti-ringing and clamping,
+  // the values are all set to 1 for now, but it is intended to be used for
+  // reducing ringing based on the mask, and taking clamping into account.
+  ImageF weights(xsize, ysize);
+  for (size_t y = 0; y < weights.ysize(); y++) {
+    auto* row = weights.Row(y);
+    for (size_t x = 0; x < weights.xsize(); x++) {
+      row[x] = 1;
+    }
+  }
+  ImageF weights2(xsize2, ysize2);
+  AntiUpsample(weights, &weights2);  // normalizer for corr2 in the loop below
+
+  const size_t num_it = 3;
+  for (size_t it = 0; it < num_it; ++it) {
+    UpsampleImage(down, &up);
+    corr = LinComb<float>(1, orig, -1, up);  // presumably orig - up
+    ElwiseMul(corr, weights, &corr);
+    AntiUpsample(corr, &corr2);
+    ElwiseDiv(corr2, weights2, &corr2);
+
+    down = LinComb<float>(1, down, 1, corr2);  // presumably down + corr2
+  }
+
+  ReduceRinging(initial, mask, down);  // clamp against the sharper result
+
+  // can't just use CopyImage, because the output image was prepared with
+  // padding.
+  for (size_t y = 0; y < down.ysize(); y++) {
+    for (size_t x = 0; x < down.xsize(); x++) {
+      float v = down.Row(y)[x];
+      output->Row(y)[x] = v;
+    }
+  }
+}
+
+void DownsampleImage2_Iterative(Image3F* opsin) {  // in-place, all 3 planes
+  // Over-allocate by kBlockDim, then shrink, so later padding needs no realloc.
+  Image3F downsampled(DivCeil(opsin->xsize(), 2) + kBlockDim,
+                      DivCeil(opsin->ysize(), 2) + kBlockDim);
+  downsampled.ShrinkTo(downsampled.xsize() - kBlockDim,
+                       downsampled.ysize() - kBlockDim);
+
+  Image3F rgb(opsin->xsize(), opsin->ysize());  // only feeds `butter` below
+  OpsinParams opsin_params;  // TODO: use the ones that are actually used
+  opsin_params.Init(kDefaultIntensityTarget);
+  OpsinToLinear(*opsin, Rect(rgb), nullptr, &rgb, opsin_params);
+
+  ImageF mask(opsin->xsize(), opsin->ysize());
+  ButteraugliParams butter_params;
+  ButteraugliComparator butter(rgb, butter_params);
+  butter.Mask(&mask);  // NOTE(review): mask is never read after this call.
+  ImageF mask_fuzzy(opsin->xsize(), opsin->ysize());  // NOTE(review): unused.
+
+  for (size_t c = 0; c < 3; c++) {  // each plane is processed independently
+    DownsampleImage2_Iterative(opsin->Plane(c), &downsampled.Plane(c));
+  }
+  *opsin = std::move(downsampled);  // replace the input with the result
+}
+}  // namespace
+
+Status DefaultEncoderHeuristics::LossyFrameHeuristics(
+    PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder,
+    const ImageBundle* original_pixels, Image3F* opsin,
+    const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) {
+  PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented");
+
+  CompressParams& cparams = enc_state->cparams;
+  PassesSharedState& shared = enc_state->shared;
+
+  // Compute parameters for noise synthesis.
+  if (shared.frame_header.flags & FrameHeader::kNoise) {
+    PROFILER_ZONE("enc GetNoiseParam");
+    if (cparams.photon_noise_iso == 0) {
+      // Don't start at zero amplitude since adding noise is expensive -- it
+      // significantly slows down decoding, and this is unlikely to
+      // completely go away even with advanced optimizations. After the
+      // kNoiseModelingRampUpDistanceRange we have reached the full level,
+      // i.e. noise is no longer represented by the compressed image, so we
+      // can add full noise by the noise modeling itself.
+      static const float kNoiseModelingRampUpDistanceRange = 0.6;
+      static const float kNoiseLevelAtStartOfRampUp = 0.25;
+      static const float kNoiseRampupStart = 1.0;
+      // TODO(user) test and properly select quality_coef with smooth
+      // filter
+      float quality_coef = 1.0f;
+      const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) /
+                           kNoiseModelingRampUpDistanceRange;
+      if (rampup < 1.0f) {
+        quality_coef = kNoiseLevelAtStartOfRampUp +
+                       (1.0f - kNoiseLevelAtStartOfRampUp) * rampup;
+      }
+      if (rampup < 0.0f) {
+        quality_coef = kNoiseRampupStart;  // i.e. 1.0 below the ramp-up start
+      }
+      if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params,
+                             quality_coef)) {
+        shared.frame_header.flags &= ~FrameHeader::kNoise;  // disable noise
+      }
+    }
+  }
+  if (enc_state->shared.frame_header.upsampling != 1 &&
+      !cparams.already_downsampled) {
+    // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling
+    // after noise, if necessary.
+    if (cparams.resampling == 2) {
+      // TODO(lode): use the regular DownsampleImage, or adapt to the custom
+      // coefficients, if there are custom upscaling coefficients in
+      // CustomTransformData
+      if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+        // TODO(lode): DownsampleImage2_Iterative is currently too slow to
+        // be used for squirrel, make it faster, and / or enable it only for
+        // kitten.
+        DownsampleImage2_Iterative(opsin);
+      } else {
+        DownsampleImage2_Sharper(opsin);
+      }
+    } else {
+      DownsampleImage(opsin, cparams.resampling);
+    }
+    PadImageToBlockMultipleInPlace(opsin);
+  }
+
+  if (cparams.butteraugli_distance < 0) {
+    return JXL_FAILURE("Expected non-negative distance");
+  }
+
+  // Find and subtract splines.
+  if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+    // If we do already have them, they were passed upstream to EncodeFile.
+    if (!shared.image_features.splines.HasAny()) {
+      shared.image_features.splines = FindSplines(*opsin);
+    }
+    JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache(
+        opsin->xsize(), opsin->ysize(), shared.cmap));
+    shared.image_features.splines.SubtractFrom(opsin);
+  }
+
+  // Find and subtract patches/dots.
+  if (ApplyOverride(cparams.patches,
+                    cparams.speed_tier <= SpeedTier::kSquirrel)) {
+    FindBestPatchDictionary(*opsin, enc_state, cms, pool, aux_out);
+    PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin);
+  }
+
+  static const float kAcQuant = 0.79f;
+  const float quant_dc = InitialQuantDC(cparams.butteraugli_distance);
+  Quantizer& quantizer = enc_state->shared.quantizer;
+  // The quant field is unknown yet; assuming the Falcon-mode one is good enough
+  // for the global scale. NOTE(review): divides by distance; confirm it is > 0.
+  quantizer.ComputeGlobalScaleAndQuant(
+      quant_dc, kAcQuant / cparams.butteraugli_distance, 0);
+
+  // TODO(veluca): we can now run all the code from here to FindBestQuantizer
+  // (excluded) one rect at a time. Do that.
+
+  // Dependency graph:
+  //
+  // input: either XYB or input image
+  //
+  // input image -> XYB [optional]
+  // XYB -> initial quant field
+  // XYB -> Gaborished XYB
+  // Gaborished XYB -> CfL1
+  // initial quant field, Gaborished XYB, CfL1 -> ACS
+  // initial quant field, ACS, Gaborished XYB -> EPF control field
+  // initial quant field -> adjusted initial quant field
+  // adjusted initial quant field, ACS -> raw quant field
+  // raw quant field, ACS, Gaborished XYB -> CfL2
+  //
+  // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field.
+
+  ArControlFieldHeuristics ar_heuristics;
+  AcStrategyHeuristics acs_heuristics;
+  CfLHeuristics cfl_heuristics;
+
+  if (!opsin->xsize()) {  // empty opsin: convert original_pixels to XYB here
+    JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels));
+    *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()),
+                     RoundUpToBlockDim(original_pixels->ysize()));
+    opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize());
+    ToXYB(*original_pixels, pool, opsin, cms, /*linear=*/nullptr);
+    PadImageToBlockMultipleInPlace(opsin);
+  }
+
+  // Compute an initial estimate of the quantization field.
+  // Call InitialQuantField only in Hare mode or slower. Otherwise, rely
+  // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon
+  // mode.
+  if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) {
+    enc_state->initial_quant_field =
+        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+    enc_state->initial_quant_masking =
+        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+    float q = cparams.uniform_quant > 0
+                  ? cparams.uniform_quant
+                  : kAcQuant / cparams.butteraugli_distance;
+    FillImage(q, &enc_state->initial_quant_field);
+    FillImage(1.0f / (q + 0.001f), &enc_state->initial_quant_masking);
+  } else {
+    // Call this here, as it relies on pre-gaborish values.
+    float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
+    if (!shared.frame_header.loop_filter.gab) {
+      butteraugli_distance_for_iqf *= 0.73f;
+    }
+    enc_state->initial_quant_field = InitialQuantField(
+        butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, 1.0f,
+        &enc_state->initial_quant_masking);
+    quantizer.SetQuantField(quant_dc, enc_state->initial_quant_field, nullptr);
+  }
+
+  // TODO(veluca): do something about animations.
+
+  // Apply inverse-gaborish.
+  if (shared.frame_header.loop_filter.gab) {
+    // Unsure why better to do some more gaborish on X and B than Y.
+    float weight[3] = {
+        1.0036278514398933f,
+        0.99406123118127299f,
+        0.99719338015886894f,
+    };
+    GaborishInverse(opsin, weight, pool);
+  }
+
+  FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
+                          &enc_state->shared.matrices);
+
+  cfl_heuristics.Init(*opsin);
+  acs_heuristics.Init(*opsin, enc_state);
+
+  auto process_tile = [&](const uint32_t tid, const size_t thread) {
+    size_t n_enc_tiles =
+        DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks);
+    size_t tx = tid % n_enc_tiles;  // tiles are numbered row by row
+    size_t ty = tid / n_enc_tiles;
+    size_t by0 = ty * kEncTileDimInBlocks;
+    size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks,
+                          enc_state->shared.frame_dim.ysize_blocks);
+    size_t bx0 = tx * kEncTileDimInBlocks;
+    size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks,
+                          enc_state->shared.frame_dim.xsize_blocks);
+    Rect r(bx0, by0, bx1 - bx0, by1 - by0);  // tile rectangle, in blocks
+
+    // For speeds up to Wombat, we only compute the color correlation map
+    // once we know the transform type and the quantization map.
+    if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+      cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices,
+                                 /*ac_strategy=*/nullptr,
+                                 /*raw_quant_field=*/nullptr,
+                                 /*quantizer=*/nullptr, /*fast=*/false, thread,
+                                 &enc_state->shared.cmap);
+    }
+
+    // Choose block sizes.
+    acs_heuristics.ProcessRect(r);
+
+    // Choose amount of post-processing smoothing.
+    // TODO(veluca): should this go *after* AdjustQuantField?
+    ar_heuristics.RunRect(r, *opsin, enc_state, thread);
+
+    // Always set the initial quant field, so we can compute the CfL map with
+    // more accuracy. The initial quant field might change in slower modes, but
+    // adjusting the quant field with butteraugli when all the other encoding
+    // parameters are fixed is likely a more reliable choice anyway.
+    AdjustQuantField(enc_state->shared.ac_strategy, r,
+                     &enc_state->initial_quant_field);
+    quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r,
+                                &enc_state->shared.raw_quant_field);
+
+    // Compute a non-default CfL map if we are at Hare speed, or slower.
+    if (cparams.speed_tier <= SpeedTier::kHare) {
+      cfl_heuristics.ComputeTile(
+          r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy,
+          &enc_state->shared.raw_quant_field, &enc_state->shared.quantizer,
+          /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread,
+          &enc_state->shared.cmap);
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0,
+      DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) *
+          DivCeil(enc_state->shared.frame_dim.ysize_blocks,
+                  kEncTileDimInBlocks),
+      [&](const size_t num_threads) {
+        ar_heuristics.PrepareForThreads(num_threads);
+        cfl_heuristics.PrepareForThreads(num_threads);
+        return true;
+      },
+      process_tile, "Enc Heuristics"));
+
+  acs_heuristics.Finalize(aux_out);
+  if (cparams.speed_tier <= SpeedTier::kHare) {
+    cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat,
+                             &enc_state->shared.cmap);
+  }
+
+  // Refine quantization levels.
+  FindBestQuantizer(original_pixels, *opsin, enc_state, cms, pool, aux_out);
+
+  // Choose a context model that depends on the amount of quantization for AC.
+  if (cparams.speed_tier < SpeedTier::kFalcon) {
+    FindBestBlockEntropyModel(*enc_state);
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.h b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
new file mode 100644
index 0000000000..3cb9b506a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
@@ -0,0 +1,81 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_HEURISTICS_H_
+#define LIB_JXL_ENC_HEURISTICS_H_
+
+// Hook for custom encoder heuristics (VarDCT only for now).
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct PassesEncoderState;
+class DequantMatrices;
+class ImageBundle;
+class ModularFrameEncoder;
+
+class EncoderHeuristics {
+ public:
+  virtual ~EncoderHeuristics() = default;
+  // Initializes encoder structures in `enc_state` using the original image data
+  // in `original_pixels`, and the XYB image data in `opsin`. Also modifies the
+  // `opsin` image by applying inverse Gaborish and other modifications if
+  // necessary. `pool` is used for running the computations on multiple threads.
+  // `aux_out` collects statistics and can be used to print debug images.
+  virtual Status LossyFrameHeuristics(
+      PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder,
+      const ImageBundle* original_pixels, Image3F* opsin,
+      const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) = 0;
+
+  // Custom fixed tree for lossless mode. Must set `tree` to a valid tree if
+  // the function returns true.
+  virtual bool CustomFixedTreeLossless(const FrameDimensions& frame_dim,
+                                       Tree* tree) {
+    return false;  // default: no custom tree, `tree` is left untouched
+  }
+
+  // If this method returns `true`, the `opsin` parameter to
+  // LossyFrameHeuristics will not be initialized, and should be initialized
+  // during the call. Moreover, `original_pixels` may not be in a linear
+  // colorspace (but will be the same as the `ib` value passed to this
+  // function).
+  virtual bool HandlesColorConversion(const CompressParams& cparams,
+                                      const ImageBundle& ib) {
+    return false;  // default: `opsin` arrives already initialized
+  }
+};
+
+class DefaultEncoderHeuristics : public EncoderHeuristics {  // default impl.
+ public:
+  Status LossyFrameHeuristics(PassesEncoderState* enc_state,
+                              ModularFrameEncoder* modular_frame_encoder,
+                              const ImageBundle* original_pixels,
+                              Image3F* opsin, const JxlCmsInterface& cms,
+                              ThreadPool* pool, AuxOut* aux_out) override;
+  bool HandlesColorConversion(const CompressParams& cparams,
+                              const ImageBundle& ib) override;
+};  // CustomFixedTreeLossless is not overridden here.
+
+// Exposed here since it may be used by other EncoderHeuristics implementations
+// outside this project. Presumably fills `dequant_matrices` (TODO confirm).
+void FindBestDequantMatrices(const CompressParams& cparams,
+                             const Image3F& opsin,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             DequantMatrices* dequant_matrices);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_HEURISTICS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman.cc b/third_party/jpeg-xl/lib/jxl/enc_huffman.cc
new file mode 100644
index 0000000000..3eab2c218a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman.cc
@@ -0,0 +1,214 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_huffman.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "lib/jxl/enc_huffman_tree.h"
+
+namespace jxl {
+
+namespace {
+
+constexpr int kCodeLengthCodes = 18;  // Code-length alphabet: depths 0..15 plus repeat codes 16 and 17.
+
+void StoreHuffmanTreeOfHuffmanTreeToBitMask(const int num_codes,
+                                            const uint8_t* code_length_bitdepth,
+                                            BitWriter* writer) {  // Stores code_length_bitdepth[] in kStorageOrder.
+  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
+      1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  // The bit lengths of the Huffman code over the code length alphabet
+  // are compressed with the following static Huffman code:
+  //   Symbol   Code
+  //   ------   ----
+  //   0          00
+  //   1        1110
+  //   2         110
+  //   3          01
+  //   4          10
+  //   5        1111
+  static const uint8_t kHuffmanBitLengthHuffmanCodeSymbols[6] = {0, 7, 3,
+                                                                 2, 1, 15};  // Table codes, bit order reversed.
+  static const uint8_t kHuffmanBitLengthHuffmanCodeBitLengths[6] = {2, 4, 3,
+                                                                    2, 2, 4};  // Lengths of the table codes.
+
+  // Throw away trailing zeros (skipped when there is at most one code):
+  size_t codes_to_store = kCodeLengthCodes;
+  if (num_codes > 1) {
+    for (; codes_to_store > 0; --codes_to_store) {
+      if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
+        break;
+      }
+    }
+  }
+  size_t skip_some = 0;  // skips none.
+  if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
+      code_length_bitdepth[kStorageOrder[1]] == 0) {
+    skip_some = 2;  // skips two.
+    if (code_length_bitdepth[kStorageOrder[2]] == 0) {
+      skip_some = 3;  // skips three.
+    }
+  }
+  writer->Write(2, skip_some);  // 2 bits: how many leading entries are omitted.
+  for (size_t i = skip_some; i < codes_to_store; ++i) {
+    size_t l = code_length_bitdepth[kStorageOrder[i]];  // Depth; indexes the 6-entry tables, so must be <= 5.
+    writer->Write(kHuffmanBitLengthHuffmanCodeBitLengths[l],
+                  kHuffmanBitLengthHuffmanCodeSymbols[l]);
+  }
+}
+
+void StoreHuffmanTreeToBitMask(const size_t huffman_tree_size,
+                               const uint8_t* huffman_tree,
+                               const uint8_t* huffman_tree_extra_bits,
+                               const uint8_t* code_length_bitdepth,
+                               const uint16_t* code_length_bitdepth_symbols,
+                               BitWriter* writer) {  // Writes every tree entry using the given code.
+  for (size_t i = 0; i < huffman_tree_size; ++i) {
+    size_t ix = huffman_tree[i];  // Code-length symbol; indexes both code tables.
+    writer->Write(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix]);
+    // Extra bits: symbol 16 is followed by 2 of them, symbol 17 by 3.
+    switch (ix) {
+      case 16:
+        writer->Write(2, huffman_tree_extra_bits[i]);
+        break;
+      case 17:
+        writer->Write(3, huffman_tree_extra_bits[i]);
+        break;
+    }
+  }
+}
+
+void StoreSimpleHuffmanTree(const uint8_t* depths, size_t symbols[4],
+                            size_t num_symbols, size_t max_bits,
+                            BitWriter* writer) {  // The caller in this file passes 2..4 symbols.
+  // value of 1 indicates a simple Huffman code
+  writer->Write(2, 1);
+  writer->Write(2, num_symbols - 1);  // NSYM - 1
+
+  // Sort the symbols by increasing depth (in place, reorders `symbols`).
+  for (size_t i = 0; i < num_symbols; i++) {
+    for (size_t j = i + 1; j < num_symbols; j++) {
+      if (depths[symbols[j]] < depths[symbols[i]]) {
+        std::swap(symbols[j], symbols[i]);
+      }
+    }
+  }
+
+  if (num_symbols == 2) {  // Each symbol is written with max_bits bits.
+    writer->Write(max_bits, symbols[0]);
+    writer->Write(max_bits, symbols[1]);
+  } else if (num_symbols == 3) {
+    writer->Write(max_bits, symbols[0]);
+    writer->Write(max_bits, symbols[1]);
+    writer->Write(max_bits, symbols[2]);
+  } else {  // Reached with num_symbols == 4 by the caller in this file.
+    writer->Write(max_bits, symbols[0]);
+    writer->Write(max_bits, symbols[1]);
+    writer->Write(max_bits, symbols[2]);
+    writer->Write(max_bits, symbols[3]);
+    // tree-select: 1 if the shallowest symbol has depth 1, otherwise 0.
+    writer->Write(1, depths[symbols[0]] == 1 ? 1 : 0);
+  }
+}
+
+// Stores the depths of a general (non-simple) code. num = alphabet size,
+// depths = symbol depths (one entry per symbol of the alphabet).
+void StoreHuffmanTree(const uint8_t* depths, size_t num, BitWriter* writer) {
+  // Write the Huffman tree into the compact representation.
+  std::unique_ptr<uint8_t[]> arena(new uint8_t[2 * num]);  // First half: tree entries; second half: extra bits.
+  uint8_t* huffman_tree = arena.get();
+  uint8_t* huffman_tree_extra_bits = arena.get() + num;
+  size_t huffman_tree_size = 0;
+  WriteHuffmanTree(depths, num, &huffman_tree_size, huffman_tree,
+                   huffman_tree_extra_bits);
+
+  // Calculate the statistics of the Huffman tree in the compact representation.
+  uint32_t huffman_tree_histogram[kCodeLengthCodes] = {0};
+  for (size_t i = 0; i < huffman_tree_size; ++i) {
+    ++huffman_tree_histogram[huffman_tree[i]];
+  }
+
+  int num_codes = 0;  // Number of distinct code-length symbols in use, capped at 2.
+  int code = 0;  // The lowest code-length symbol in use.
+  for (int i = 0; i < kCodeLengthCodes; ++i) {
+    if (huffman_tree_histogram[i]) {
+      if (num_codes == 0) {
+        code = i;
+        num_codes = 1;
+      } else if (num_codes == 1) {
+        num_codes = 2;
+        break;
+      }
+    }
+  }
+
+  // Calculate another Huffman tree (depths limited to 5 bits) to use for
+  // compressing the earlier Huffman tree with.
+  uint8_t code_length_bitdepth[kCodeLengthCodes] = {0};
+  uint16_t code_length_bitdepth_symbols[kCodeLengthCodes] = {0};
+  CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes, 5,
+                    &code_length_bitdepth[0]);
+  ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes,
+                            &code_length_bitdepth_symbols[0]);
+
+  // Now, we have all the data, let's start storing it
+  StoreHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth,
+                                         writer);
+
+  if (num_codes == 1) {
+    code_length_bitdepth[code] = 0;  // A lone symbol is then written with 0 bits below.
+  }
+
+  // Store the real huffman tree now.
+  StoreHuffmanTreeToBitMask(huffman_tree_size, huffman_tree,
+                            huffman_tree_extra_bits, &code_length_bitdepth[0],
+                            code_length_bitdepth_symbols, writer);
+}
+
+}  // namespace
+
+void BuildAndStoreHuffmanTree(const uint32_t* histogram, const size_t length,
+                              uint8_t* depth, uint16_t* bits,
+                              BitWriter* writer) {  // Picks one of three encodings by used-symbol count.
+  size_t count = 0;  // Number of used symbols, counted only up to 5.
+  size_t s4[4] = {0};  // The first (up to four) used symbols.
+  for (size_t i = 0; i < length; i++) {
+    if (histogram[i]) {
+      if (count < 4) {
+        s4[count] = i;
+      } else if (count > 4) {
+        break;  // More than four used symbols: the exact count is irrelevant.
+      }
+      count++;
+    }
+  }
+
+  size_t max_bits_counter = length - 1;  // NOTE(review): wraps if length == 0; assumes length >= 1.
+  size_t max_bits = 0;  // Becomes the bit width of (length - 1).
+  while (max_bits_counter) {
+    max_bits_counter >>= 1;
+    ++max_bits;
+  }
+
+  if (count <= 1) {
+    // Output symbol bits and depths are initialized with 0, nothing to do.
+    writer->Write(4, 1);  // NOTE(review): presumably simple-code marker plus NSYM - 1 == 0; confirm.
+    writer->Write(max_bits, s4[0]);  // The single symbol (0 if none is used).
+    return;
+  }
+
+  CreateHuffmanTree(histogram, length, 15, depth);  // Depths limited to 15 bits.
+  ConvertBitDepthsToSymbols(depth, length, bits);
+
+  if (count <= 4) {  // 2..4 used symbols: list them explicitly.
+    StoreSimpleHuffmanTree(depth, s4, count, max_bits, writer);
+  } else {
+    StoreHuffmanTree(depth, length, writer);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman.h b/third_party/jpeg-xl/lib/jxl/enc_huffman.h
new file mode 100644
index 0000000000..d7a66584e8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_HUFFMAN_H_
+#define LIB_JXL_ENC_HUFFMAN_H_
+
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+// Builds a Huffman tree for the given histogram, and encodes it into writer
+// in a format that can be read by HuffmanDecodingData::ReadFromBitstream.
+// An allotment for `writer` must already have been created by the caller.
+void BuildAndStoreHuffmanTree(const uint32_t* histogram, size_t length,
+                              uint8_t* depth, uint16_t* bits,  // Outputs; not written if <= 1 symbol is used.
+                              BitWriter* writer);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_HUFFMAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc
new file mode 100644
index 0000000000..5c40dea770
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc
@@ -0,0 +1,328 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_huffman_tree.h"
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
+              uint8_t level) {  // Recursively stores the level of every leaf below p.
+  if (p.index_left >= 0) {  // Internal node: both children are indices into pool.
+    ++level;
+    SetDepth(pool[p.index_left], pool, depth, level);
+    SetDepth(pool[p.index_right_or_value], pool, depth, level);
+  } else {  // Leaf: index_right_or_value is the index into depth.
+    depth[p.index_right_or_value] = level;
+  }
+}
+
+// Sort the root nodes, least popular first.
+static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) {
+  return v0.total_count < v1.total_count;  // Strict ordering on total_count only.
+}
+
+// Computes length-limited Huffman code lengths: for every i < length with
+// data[i] != 0, depth[i] receives a bit depth of at most tree_limit. Other
+// entries of depth are not written, so the caller must zero them. A lone
+// used symbol gets depth 1; if no symbol is used, depth is left untouched.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+// Brotli specifies a maximum depth of 15 bits for "code trees"
+// and 7 bits for "code length code trees."
+// To enforce tree_limit, counts below (count_limit - 1) are raised to that
+// value, and count_limit is doubled until the tree is shallow enough.
+//
+// This algorithm is not of excellent performance for very long data blocks,
+// especially when population counts are longer than 2**tree_limit, but
+// we are not planning to use this with extremely long blocks.
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, const size_t length,
+                       const int tree_limit, uint8_t* depth) {
+  // For block sizes below 64 kB, we never need to do a second iteration
+  // of this loop. Probably all of our block sizes will be smaller than
+  // that, so this loop is mostly of academic interest. If we actually
+  // would need this, we would be better off with the Katajainen algorithm.
+  for (uint32_t count_limit = 1;; count_limit *= 2) {
+    std::vector<HuffmanTree> tree;
+    tree.reserve(2 * length + 1);
+
+    for (size_t i = length; i != 0;) {
+      --i;
+      if (data[i]) {
+        const uint32_t count = std::max(data[i], count_limit - 1);
+        tree.emplace_back(count, -1, static_cast<int16_t>(i));  // Leaf: index_left == -1.
+      }
+    }
+
+    const size_t n = tree.size();
+    if (n <= 1) {
+      // Fake value for n == 1, fixed on upper level. n == 0 (no symbols): no-op.
+      if (n == 1) depth[tree[0].index_right_or_value] = 1;
+      break;  // Also keeps n - 1 below from wrapping around when n == 0.
+    }
+
+    std::stable_sort(tree.begin(), tree.end(), Compare);
+
+    // The nodes are:
+    // [0, n): the sorted leaf nodes that we start with.
+    // [n]: we add a sentinel here.
+    // [n + 1, 2n): new parent nodes are added here, starting from
+    //              (n+1). These are naturally in ascending order.
+    // [2n]: we add a sentinel at the end as well.
+    // There will be (2n+1) elements at the end.
+    const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
+    tree.push_back(sentinel);
+    tree.push_back(sentinel);
+
+    size_t i = 0;      // Points to the next leaf node.
+    size_t j = n + 1;  // Points to the next non-leaf node.
+    for (size_t k = n - 1; k != 0; --k) {  // n - 1 merges build the whole tree.
+      size_t left, right;
+      if (tree[i].total_count <= tree[j].total_count) {
+        left = i;
+        ++i;
+      } else {
+        left = j;
+        ++j;
+      }
+      if (tree[i].total_count <= tree[j].total_count) {
+        right = i;
+        ++i;
+      } else {
+        right = j;
+        ++j;
+      }
+
+      // The sentinel node becomes the parent node.
+      size_t j_end = tree.size() - 1;
+      tree[j_end].total_count =
+          tree[left].total_count + tree[right].total_count;
+      tree[j_end].index_left = static_cast<int16_t>(left);  // NOTE(review): assumes node indices fit in int16_t.
+      tree[j_end].index_right_or_value = static_cast<int16_t>(right);
+
+      // Add back the last sentinel node.
+      tree.push_back(sentinel);
+    }
+    JXL_DASSERT(tree.size() == 2 * n + 1);
+    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);  // tree[2n - 1] is the root.
+
+    // We need to pack the Huffman tree in tree_limit bits.
+    // If this was not successful, add fake entities to the lowest values
+    // and retry.
+    if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
+      break;
+    }
+  }
+}
+
+void Reverse(uint8_t* v, size_t start, size_t end) {  // Reverses v[start, end) in place.
+  --end;  // Now the last index of the range; requires end > 0, else this wraps around.
+  while (start < end) {
+    uint8_t tmp = v[start];
+    v[start] = v[end];
+    v[end] = tmp;
+    ++start;
+    --end;
+  }
+}
+
+void WriteHuffmanTreeRepetitions(const uint8_t previous_value,
+                                 const uint8_t value, size_t repetitions,
+                                 size_t* tree_size, uint8_t* tree,
+                                 uint8_t* extra_bits_data) {  // Appends a run of `value` at *tree_size.
+  JXL_DASSERT(repetitions > 0);
+  if (previous_value != value) {  // First emit `value` itself once, as a literal.
+    tree[*tree_size] = value;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
+  if (repetitions == 7) {  // 7 would take two 16-codes; a literal plus 6 takes just one.
+    tree[*tree_size] = value;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
+  if (repetitions < 3) {  // Too short for a repeat code: emit literals.
+    for (size_t i = 0; i < repetitions; ++i) {
+      tree[*tree_size] = value;
+      extra_bits_data[*tree_size] = 0;
+      ++(*tree_size);
+    }
+  } else {
+    repetitions -= 3;  // The smallest run encoded by a repeat code is 3.
+    size_t start = *tree_size;
+    while (true) {
+      tree[*tree_size] = 16;  // Repeat code carrying 2 extra bits.
+      extra_bits_data[*tree_size] = repetitions & 0x3;
+      ++(*tree_size);
+      repetitions >>= 2;
+      if (repetitions == 0) {
+        break;
+      }
+      --repetitions;
+    }
+    Reverse(tree, start, *tree_size);  // Codes were produced low digits first.
+    Reverse(extra_bits_data, start, *tree_size);
+  }
+}
+
+void WriteHuffmanTreeRepetitionsZeros(size_t repetitions, size_t* tree_size,
+                                      uint8_t* tree, uint8_t* extra_bits_data) {  // Appends a run of zeros.
+  if (repetitions == 11) {  // 11 would take two 17-codes; a literal plus 10 takes just one.
+    tree[*tree_size] = 0;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
+  if (repetitions < 3) {  // Too short for a repeat code: emit literal zeros.
+    for (size_t i = 0; i < repetitions; ++i) {
+      tree[*tree_size] = 0;
+      extra_bits_data[*tree_size] = 0;
+      ++(*tree_size);
+    }
+  } else {
+    repetitions -= 3;  // The smallest run encoded by a repeat code is 3.
+    size_t start = *tree_size;
+    while (true) {
+      tree[*tree_size] = 17;  // Zero-repeat code carrying 3 extra bits.
+      extra_bits_data[*tree_size] = repetitions & 0x7;
+      ++(*tree_size);
+      repetitions >>= 3;
+      if (repetitions == 0) {
+        break;
+      }
+      --repetitions;
+    }
+    Reverse(tree, start, *tree_size);  // Codes were produced low digits first.
+    Reverse(extra_bits_data, start, *tree_size);
+  }
+}
+
+static void DecideOverRleUse(const uint8_t* depth, const size_t length,
+                             bool* use_rle_for_non_zero,
+                             bool* use_rle_for_zero) {  // Both flags are always written.
+  size_t total_reps_zero = 0;  // Summed length of zero runs of at least 3.
+  size_t total_reps_non_zero = 0;  // Summed length of nonzero runs of at least 4.
+  size_t count_reps_zero = 1;  // Run counters start at 1, not 0.
+  size_t count_reps_non_zero = 1;
+  for (size_t i = 0; i < length;) {
+    const uint8_t value = depth[i];
+    size_t reps = 1;  // Length of the run of equal values starting at i.
+    for (size_t k = i + 1; k < length && depth[k] == value; ++k) {
+      ++reps;
+    }
+    if (reps >= 3 && value == 0) {
+      total_reps_zero += reps;
+      ++count_reps_zero;
+    }
+    if (reps >= 4 && value != 0) {
+      total_reps_non_zero += reps;
+      ++count_reps_non_zero;
+    }
+    i += reps;  // Continue after the run.
+  }
+  *use_rle_for_non_zero = total_reps_non_zero > count_reps_non_zero * 2;
+  *use_rle_for_zero = total_reps_zero > count_reps_zero * 2;
+}
+
+void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size,
+                      uint8_t* tree, uint8_t* extra_bits_data) {  // Appends at *tree_size; it is not reset here.
+  uint8_t previous_value = 8;  // NOTE(review): presumably the initial previous depth the decoder assumes; confirm.
+
+  // Throw away trailing zeros.
+  size_t new_length = length;
+  for (size_t i = 0; i < length; ++i) {
+    if (depth[length - i - 1] == 0) {
+      --new_length;
+    } else {
+      break;
+    }
+  }
+
+  // First gather statistics on if it is a good idea to do rle.
+  bool use_rle_for_non_zero = false;
+  bool use_rle_for_zero = false;
+  if (length > 50) {  // Note: tests the untrimmed length, not new_length.
+    // Find rle coding for longer codes.
+    // Shorter codes seem not to benefit from rle.
+    DecideOverRleUse(depth, new_length, &use_rle_for_non_zero,
+                     &use_rle_for_zero);
+  }
+
+  // Actual rle coding.
+  for (size_t i = 0; i < new_length;) {
+    const uint8_t value = depth[i];
+    size_t reps = 1;  // Stays 1 when rle is disabled for this kind of value.
+    if ((value != 0 && use_rle_for_non_zero) ||
+        (value == 0 && use_rle_for_zero)) {
+      for (size_t k = i + 1; k < new_length && depth[k] == value; ++k) {
+        ++reps;
+      }
+    }
+    if (value == 0) {
+      WriteHuffmanTreeRepetitionsZeros(reps, tree_size, tree, extra_bits_data);
+    } else {
+      WriteHuffmanTreeRepetitions(previous_value, value, reps, tree_size, tree,
+                                  extra_bits_data);
+      previous_value = value;  // Only nonzero values update previous_value.
+    }
+    i += reps;
+  }
+}
+
+namespace {
+
+uint16_t ReverseBits(int num_bits, uint16_t bits) {  // Reverses the low num_bits bits of `bits`.
+  static const size_t kLut[16] = {// Pre-reversed 4-bit values.
+                                  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+                                  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf};
+  size_t retval = kLut[bits & 0xf];  // The lowest nibble is always processed.
+  for (int i = 4; i < num_bits; i += 4) {  // One further nibble per iteration.
+    retval <<= 4;
+    bits = static_cast<uint16_t>(bits >> 4);
+    retval |= kLut[bits & 0xf];
+  }
+  retval >>= (-num_bits & 0x3);  // Drop the padding when num_bits is not a multiple of 4.
+  return static_cast<uint16_t>(retval);
+}
+
+}  // namespace
+
+void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len,
+                               uint16_t* bits) {  // bits[i] is written only where depth[i] != 0.
+  // In Brotli, all bit depths are [1..15]
+  // 0 bit depth means that the symbol does not exist.
+  const int kMaxBits = 16;  // 0..15 are values for bits
+  uint16_t bl_count[kMaxBits] = {0};  // bl_count[d]: number of symbols with depth d.
+  {
+    for (size_t i = 0; i < len; ++i) {
+      ++bl_count[depth[i]];  // Depths must be below kMaxBits to stay in bounds.
+    }
+    bl_count[0] = 0;  // Symbols that do not exist take no code space.
+  }
+  uint16_t next_code[kMaxBits];  // next_code[d]: next unassigned code of length d.
+  next_code[0] = 0;
+  {
+    int code = 0;
+    for (size_t i = 1; i < kMaxBits; ++i) {
+      code = (code + bl_count[i - 1]) << 1;
+      next_code[i] = static_cast<uint16_t>(code);
+    }
+  }
+  for (size_t i = 0; i < len; ++i) {
+    if (depth[i]) {
+      bits[i] = ReverseBits(depth[i], next_code[depth[i]]++);  // Codes are stored bit-reversed.
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h
new file mode 100644
index 0000000000..7d716cd3b5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Library for creating Huffman codes from population counts.
+
+#ifndef LIB_JXL_HUFFMAN_TREE_H_
+#define LIB_JXL_HUFFMAN_TREE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+// A node of a Huffman tree.
+struct HuffmanTree {
+  HuffmanTree(uint32_t count, int16_t left, int16_t right)
+      : total_count(count), index_left(left), index_right_or_value(right) {}
+  uint32_t total_count;  // Population count of the node.
+  int16_t index_left;  // Index of the left child node; negative for a leaf.
+  int16_t index_right_or_value;  // Index of the right child node, or the leaf value.
+};
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,  // p: subtree root; pool: node array.
+              uint8_t level);  // level: depth of p itself (0 for the root of the whole tree).
+
+// This function will create a Huffman tree.
+//
+// The (data,length) contains the population counts.
+// The tree_limit is the maximum bit depth of the Huffman codes.
+//
+// The depth receives the tree, i.e., how many bits are used for
+// each symbol; entries of symbols with a zero count are not written.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit,
+                       uint8_t* depth);
+
+// Write a Huffman tree from bit depths into the bitstream representation
+// of a Huffman tree. The generated Huffman tree is to be compressed once
+// more using a Huffman tree. Entries are appended starting at *tree_size.
+void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size,
+                      uint8_t* tree, uint8_t* extra_bits_data);
+
+// Get the actual bit values for a tree of bit depths (depth 0 is skipped).
+void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len,
+                               uint16_t* bits);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_HUFFMAN_TREE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
new file mode 100644
index 0000000000..a6782f6a45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
@@ -0,0 +1,406 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_icc_codec.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/icc_codec_common.h"
+
+namespace jxl {
+namespace {
+
+// Unshuffles or de-interleaves bytes, for example with width 2, turns
+// "AaBbCcDd" into "ABCDabcd", this for example de-interleaves UTF-16 bytes into
+// first all the high order bytes, then all the low order bytes.
+// Transposes a matrix of width columns and ceil(size / width) rows. There are
+// size elements, size may be < width * height, if so the
+// last elements of the bottom row are missing, the missing spots are
+// transposed along with the filled spots, and the result has the missing
+// elements at the bottom of the rightmost column. The input is the input matrix
+// in scanline order, the output is the result matrix in scanline order, with
+// missing elements skipped over (this may occur at multiple positions).
+void Unshuffle(uint8_t* data, size_t size, size_t width) {
+  size_t height = (size + width - 1) / width;  // number of rows of input
+  PaddedBytes result(size);  // Scratch copy; written back into data below.
+  // i = input index, j = output index, s = offset at which j restarts
+  size_t s = 0, j = 0;
+  for (size_t i = 0; i < size; i++) {
+    result[j] = data[i];
+    j += height;  // Consecutive inputs land `height` apart in the output.
+    if (j >= size) j = ++s;  // Ran past the end: restart one position further.
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    data[i] = result[i];
+  }
+}
+
+// This is performed by the encoder, the encoder must be able to encode any
+// random byte stream (not just byte streams that are a valid ICC profile), so
+// an error returned by this function is an implementation error.
+Status PredictAndShuffle(size_t stride, size_t width, int order, size_t num,
+                         const uint8_t* data, size_t size, size_t* pos,
+                         PaddedBytes* result) {  // Appends num residual bytes to result; advances *pos.
+  JXL_RETURN_IF_ERROR(CheckOutOfBounds(*pos, num, size));
+  // Required by the specification, see decoder. stride * 4 must be < *pos.
+  if (!*pos || ((*pos - 1u) >> 2u) < stride) {
+    return JXL_FAILURE("Invalid stride");
+  }
+  if (*pos < stride * 4) return JXL_FAILURE("Too large stride");  // Cannot trigger once the check above passed.
+  size_t start = result->size();  // Where the bytes appended by this call begin.
+  for (size_t i = 0; i < num; i++) {
+    uint8_t predicted =
+        LinearPredictICCValue(data, *pos, i, stride, width, order);
+    result->push_back(data[*pos + i] - predicted);  // Store the prediction residual.
+  }
+  *pos += num;  // The num input bytes are now consumed.
+  if (width > 1) Unshuffle(result->data() + start, num, width);  // Only the bytes appended here.
+  return true;
+}
+}  // namespace
+
+// Outputs a transformed form of the given icc profile. The result itself is
+// not particularly smaller than the input data in bytes, but it will be in a
+// form that is easier to compress (more zeroes, ...) and will compress better
+// with brotli.
+Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
+  PaddedBytes commands;
+  PaddedBytes data;
+
+  EncodeVarInt(size, result);
+
+  // Header
+  PaddedBytes header = ICCInitialHeaderPrediction();
+  EncodeUint32(0, size, &header);
+  for (size_t i = 0; i < kICCHeaderSize && i < size; i++) {
+    ICCPredictHeader(icc, size, header.data(), i);
+    data.push_back(icc[i] - header[i]);
+  }
+  if (size <= kICCHeaderSize) {
+    EncodeVarInt(0, result);  // 0 commands
+    for (size_t i = 0; i < data.size(); i++) {
+      result->push_back(data[i]);
+    }
+    return true;
+  }
+
+  std::vector<Tag> tags;
+  std::vector<size_t> tagstarts;
+  std::vector<size_t> tagsizes;
+  std::map<size_t, size_t> tagmap;
+
+  // Tag list
+  size_t pos = kICCHeaderSize;
+  if (pos + 4 <= size) {
+    uint64_t numtags = DecodeUint32(icc, size, pos);
+    pos += 4;
+    EncodeVarInt(numtags + 1, &commands);
+    uint64_t prevtagstart = kICCHeaderSize + numtags * 12;
+    uint32_t prevtagsize = 0;
+    for (size_t i = 0; i < numtags; i++) {
+      if (pos + 12 > size) break;
+
+      Tag tag = DecodeKeyword(icc, size, pos + 0);
+      uint32_t tagstart = DecodeUint32(icc, size, pos + 4);
+      uint32_t tagsize = DecodeUint32(icc, size, pos + 8);
+      pos += 12;
+
+      tags.push_back(tag);
+      tagstarts.push_back(tagstart);
+      tagsizes.push_back(tagsize);
+      tagmap[tagstart] = tags.size() - 1;
+
+      uint8_t tagcode = kCommandTagUnknown;
+      for (size_t j = 0; j < kNumTagStrings; j++) {
+        if (tag == *kTagStrings[j]) {
+          tagcode = j + kCommandTagStringFirst;
+          break;
+        }
+      }
+
+      if (tag == kRtrcTag && pos + 24 < size) {
+        bool ok = true;
+        ok &= DecodeKeyword(icc, size, pos + 0) == kGtrcTag;
+        ok &= DecodeKeyword(icc, size, pos + 12) == kBtrcTag;
+        if (ok) {
+          for (size_t kk = 0; kk < 8; kk++) {
+            if (icc[pos - 8 + kk] != icc[pos + 4 + kk]) ok = false;
+            if (icc[pos - 8 + kk] != icc[pos + 16 + kk]) ok = false;
+          }
+        }
+        if (ok) {
+          tagcode = kCommandTagTRC;
+          pos += 24;
+          i += 2;
+        }
+      }
+
+      if (tag == kRxyzTag && pos + 24 < size) {
+        bool ok = true;
+        ok &= DecodeKeyword(icc, size, pos + 0) == kGxyzTag;
+        ok &= DecodeKeyword(icc, size, pos + 12) == kBxyzTag;
+        uint32_t offsetr = tagstart;
+        uint32_t offsetg = DecodeUint32(icc, size, pos + 4);
+        uint32_t offsetb = DecodeUint32(icc, size, pos + 16);
+        uint32_t sizer = tagsize;
+        uint32_t sizeg = DecodeUint32(icc, size, pos + 8);
+        uint32_t sizeb = DecodeUint32(icc, size, pos + 20);
+        ok &= sizer == 20;
+        ok &= sizeg == 20;
+        ok &= sizeb == 20;
+        ok &= (offsetg == offsetr + 20);
+        ok &= (offsetb == offsetr + 40);
+        if (ok) {
+          tagcode = kCommandTagXYZ;
+          pos += 24;
+          i += 2;
+        }
+      }
+
+      uint8_t command = tagcode;
+      uint64_t predicted_tagstart = prevtagstart + prevtagsize;
+      if (predicted_tagstart != tagstart) command |= kFlagBitOffset;
+      size_t predicted_tagsize = prevtagsize;
+      if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag ||
+          tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag ||
+          tag == kLumiTag) {
+        predicted_tagsize = 20;
+      }
+      if (predicted_tagsize != tagsize) command |= kFlagBitSize;
+      commands.push_back(command);
+      if (tagcode == 1) {
+        AppendKeyword(tag, &data);
+      }
+      if (command & kFlagBitOffset) EncodeVarInt(tagstart, &commands);
+      if (command & kFlagBitSize) EncodeVarInt(tagsize, &commands);
+
+      prevtagstart = tagstart;
+      prevtagsize = tagsize;
+    }
+  }
+  // Indicate end of tag list or varint indicating there's none
+  commands.push_back(0);
+
+  // Main content
+  // The main content in a valid ICC profile contains tagged elements, with the
+  // tag types (4 letter names) given by the tag list above, and the tag list
+  // pointing to the start and indicating the size of each tagged element. It is
+  // allowed for tagged elements to overlap, e.g. the curve for R, G and B could
+  // all point to the same one.
+  Tag tag;
+  size_t tagstart = 0, tagsize = 0, clutstart = 0;
+
+  size_t last0 = pos;
+  // This loop appends commands to the output, processing some sub-section of a
+  // current tagged element each time. We need to keep track of the tagtype of
+  // the current element, and update it when we encounter the boundary of a
+  // next one.
+  // It is not required that the input data is a valid ICC profile, if the
+  // encoder does not recognize the data it will still be able to output bytes
+  // but will not predict as well.
+  while (pos <= size) {
+    size_t last1 = pos;
+    PaddedBytes commands_add;
+    PaddedBytes data_add;
+
+    // This means the loop brought the position beyond the tag end.
+    if (pos > tagstart + tagsize) {
+      tag = {{0, 0, 0, 0}};  // nonsensical value
+    }
+
+    if (commands_add.empty() && data_add.empty() && tagmap.count(pos) &&
+        pos + 4 <= size) {
+      size_t index = tagmap[pos];
+      tag = DecodeKeyword(icc, size, pos);
+      tagstart = tagstarts[index];
+      tagsize = tagsizes[index];
+
+      if (tag == kMlucTag && pos + tagsize <= size && tagsize > 8 &&
+          icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 &&
+          icc[pos + 7] == 0) {
+        size_t num = tagsize - 8;
+        commands_add.push_back(kCommandTypeStartFirst + 3);
+        pos += 8;
+        commands_add.push_back(kCommandShuffle2);
+        EncodeVarInt(num, &commands_add);
+        size_t start = data_add.size();
+        for (size_t i = 0; i < num; i++) {
+          data_add.push_back(icc[pos]);
+          pos++;
+        }
+        Unshuffle(data_add.data() + start, num, 2);
+      }
+
+      if (tag == kCurvTag && pos + tagsize <= size && tagsize > 8 &&
+          icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 &&
+          icc[pos + 7] == 0) {
+        size_t num = tagsize - 8;
+        if (num > 16 && num < (1 << 28) && pos + num <= size && pos > 0) {
+          commands_add.push_back(kCommandTypeStartFirst + 5);
+          pos += 8;
+          commands_add.push_back(kCommandPredict);
+          int order = 1, width = 2, stride = width;
+          commands_add.push_back((order << 2) | (width - 1));
+          EncodeVarInt(num, &commands_add);
+          JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                                size, &pos, &data_add));
+        }
+      }
+    }
+
+    if (tag == kMab_Tag || tag == kMba_Tag) {
+      Tag subTag = DecodeKeyword(icc, size, pos);
+      if (pos + 12 < size && (subTag == kCurvTag || subTag == kVcgtTag) &&
+          DecodeUint32(icc, size, pos + 4) == 0) {
+        uint32_t num = DecodeUint32(icc, size, pos + 8) * 2;
+        if (num > 16 && num < (1 << 28) && pos + 12 + num <= size) {
+          pos += 12;
+          last1 = pos;
+          commands_add.push_back(kCommandPredict);
+          int order = 1, width = 2, stride = width;
+          commands_add.push_back((order << 2) | (width - 1));
+          EncodeVarInt(num, &commands_add);
+          JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                                size, &pos, &data_add));
+        }
+      }
+
+      if (pos == tagstart + 24 && pos + 4 < size) {
+        // Note that this value can be remembered for next iterations of the
+        // loop, so the "pos == clutstart" if below can trigger during a later
+        // iteration.
+        clutstart = tagstart + DecodeUint32(icc, size, pos);
+      }
+
+      if (pos == clutstart && clutstart + 16 < size) {
+        size_t numi = icc[tagstart + 8];
+        size_t numo = icc[tagstart + 9];
+        size_t width = icc[clutstart + 16];
+        size_t stride = width * numo;
+        size_t num = width * numo;
+        for (size_t i = 0; i < numi && clutstart + i < size; i++) {
+          num *= icc[clutstart + i];
+        }
+        if ((width == 1 || width == 2) && num > 64 && num < (1 << 28) &&
+            pos + num <= size && pos > stride * 4) {
+          commands_add.push_back(kCommandPredict);
+          int order = 1;
+          uint8_t flags =
+              (order << 2) | (width - 1) | (stride == width ? 0 : 16);
+          commands_add.push_back(flags);
+          if (flags & 16) EncodeVarInt(stride, &commands_add);
+          EncodeVarInt(num, &commands_add);
+          JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                                size, &pos, &data_add));
+        }
+      }
+    }
+
+    if (commands_add.empty() && data_add.empty() && tag == kGbd_Tag &&
+        pos == tagstart + 8 && pos + tagsize - 8 <= size && pos > 16 &&
+        tagsize > 8) {
+      size_t width = 4, order = 0, stride = width;
+      size_t num = tagsize - 8;
+      uint8_t flags = (order << 2) | (width - 1) | (stride == width ? 0 : 16);
+      commands_add.push_back(kCommandPredict);
+      commands_add.push_back(flags);
+      if (flags & 16) EncodeVarInt(stride, &commands_add);
+      EncodeVarInt(num, &commands_add);
+      JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                            size, &pos, &data_add));
+    }
+
+    if (commands_add.empty() && data_add.empty() && pos + 20 <= size) {
+      Tag subTag = DecodeKeyword(icc, size, pos);
+      if (subTag == kXyz_Tag && DecodeUint32(icc, size, pos + 4) == 0) {
+        commands_add.push_back(kCommandXYZ);
+        pos += 8;
+        for (size_t j = 0; j < 12; j++) data_add.push_back(icc[pos++]);
+      }
+    }
+
+    if (commands_add.empty() && data_add.empty() && pos + 8 <= size) {
+      if (DecodeUint32(icc, size, pos + 4) == 0) {
+        Tag subTag = DecodeKeyword(icc, size, pos);
+        for (size_t i = 0; i < kNumTypeStrings; i++) {
+          if (subTag == *kTypeStrings[i]) {
+            commands_add.push_back(kCommandTypeStartFirst + i);
+            pos += 8;
+            break;
+          }
+        }
+      }
+    }
+
+    if (!(commands_add.empty() && data_add.empty()) || pos == size) {
+      if (last0 < last1) {
+        commands.push_back(kCommandInsert);
+        EncodeVarInt(last1 - last0, &commands);
+        while (last0 < last1) {
+          data.push_back(icc[last0++]);
+        }
+      }
+      for (size_t i = 0; i < commands_add.size(); i++) {
+        commands.push_back(commands_add[i]);
+      }
+      for (size_t i = 0; i < data_add.size(); i++) {
+        data.push_back(data_add[i]);
+      }
+      last0 = pos;
+    }
+    if (commands_add.empty() && data_add.empty()) {
+      pos++;
+    }
+  }
+
+  EncodeVarInt(commands.size(), result);
+  for (size_t i = 0; i < commands.size(); i++) {
+    result->push_back(commands[i]);
+  }
+  for (size_t i = 0; i < data.size(); i++) {
+    result->push_back(data[i]);
+  }
+
+  return true;
+}
+
+Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
+                size_t layer, AuxOut* JXL_RESTRICT aux_out) {
+  if (icc.empty()) return JXL_FAILURE("ICC must be non-empty");
+  PaddedBytes enc;  // Output of PredictICC; these bytes get entropy-coded.
+  JXL_RETURN_IF_ERROR(PredictICC(icc.data(), icc.size(), &enc));
+  const size_t enc_size = enc.size();
+  BitWriter::Allotment allotment(writer, 128);
+  JXL_RETURN_IF_ERROR(U64Coder::Write(enc_size, writer));
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  std::vector<std::vector<Token>> tokens(1);
+  for (size_t i = 0; i < enc_size; i++) {  // Context: index + 2 prior bytes.
+    const int prev1 = i > 0 ? enc[i - 1] : 0;
+    const int prev2 = i > 1 ? enc[i - 2] : 0;
+    tokens[0].emplace_back(ICCANSContext(i, prev1, prev2), enc[i]);
+  }
+  HistogramParams params;
+  params.force_huffman = true;
+  params.lz77_method = enc_size < 4096 ? HistogramParams::LZ77Method::kOptimal
+                                       : HistogramParams::LZ77Method::kLZ77;
+  EntropyEncodingData code;
+  std::vector<uint8_t> context_map;
+  BuildAndEncodeHistograms(params, kNumICCContexts, tokens, &code, &context_map,
+                           writer, layer, aux_out);
+  WriteTokens(tokens[0], code, context_map, writer, layer, aux_out);
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h
new file mode 100644
index 0000000000..c22cf5994e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ICC_CODEC_H_
+#define LIB_JXL_ENC_ICC_CODEC_H_
+
+// Compressed representation of ICC profiles.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Fails if `icc.empty()`; call this only when there is a profile to write.
+Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
+                size_t layer, AuxOut* JXL_RESTRICT aux_out);
+
+// Exposed only for testing
+Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ICC_CODEC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc
new file mode 100644
index 0000000000..a77d3e0743
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc
@@ -0,0 +1,154 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_image_bundle.h"
+
+#include <jxl/cms_interface.h>
+
+#include <atomic>
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+namespace {
+
+// Color-converts ib's pixels inside rect to c_desired, writing them to out.
+Status CopyToT(const ImageMetadata* metadata, const ImageBundle* ib,
+               const Rect& rect, const ColorEncoding& c_desired,
+               const JxlCmsInterface& cms, ThreadPool* pool, Image3F* out) {
+  PROFILER_FUNC;
+  ColorSpaceTransform c_transform(cms);
+  // Gray input must stay gray (and color stay color); a mismatch aborts here.
+  JXL_CHECK(ib->IsGray() == c_desired.IsGray());
+  bool is_gray = ib->IsGray();
+  if (out->xsize() < rect.xsize() || out->ysize() < rect.ysize()) {
+    *out = Image3F(rect.xsize(), rect.ysize());  // Too small: allocate anew.
+  } else {
+    out->ShrinkTo(rect.xsize(), rect.ysize());  // Large enough: reuse it.
+  }
+  std::atomic<bool> ok{true};  // Cleared by any row whose conversion fails.
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, rect.ysize(),
+      [&](const size_t num_threads) {  // Init callback: sets up the transform.
+        return c_transform.Init(ib->c_current(), c_desired,
+                                metadata->IntensityTarget(), rect.xsize(),
+                                num_threads);
+      },
+      [&](const uint32_t y, const size_t thread) {  // Converts row y.
+        float* mutable_src_buf = c_transform.BufSrc(thread);
+        const float* src_buf = mutable_src_buf;
+        // Interleave input.
+        if (is_gray) {
+          src_buf = rect.ConstPlaneRow(ib->color(), 0, y);  // Plane 0 as-is.
+        } else if (ib->c_current().IsCMYK()) {
+          if (!ib->HasBlack()) {  // CMYK needs the black channel as 4th input.
+            ok.store(false);
+            return;
+          }
+          const float* JXL_RESTRICT row_in0 =
+              rect.ConstPlaneRow(ib->color(), 0, y);
+          const float* JXL_RESTRICT row_in1 =
+              rect.ConstPlaneRow(ib->color(), 1, y);
+          const float* JXL_RESTRICT row_in2 =
+              rect.ConstPlaneRow(ib->color(), 2, y);
+          const float* JXL_RESTRICT row_in3 = rect.ConstRow(ib->black(), y);
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            // CMYK convention in JXL: 0 = max ink, 1 = white
+            mutable_src_buf[4 * x + 0] = row_in0[x];
+            mutable_src_buf[4 * x + 1] = row_in1[x];
+            mutable_src_buf[4 * x + 2] = row_in2[x];
+            mutable_src_buf[4 * x + 3] = row_in3[x];
+          }
+        } else {  // Three color planes: interleave them per pixel.
+          const float* JXL_RESTRICT row_in0 =
+              rect.ConstPlaneRow(ib->color(), 0, y);
+          const float* JXL_RESTRICT row_in1 =
+              rect.ConstPlaneRow(ib->color(), 1, y);
+          const float* JXL_RESTRICT row_in2 =
+              rect.ConstPlaneRow(ib->color(), 2, y);
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            mutable_src_buf[3 * x + 0] = row_in0[x];
+            mutable_src_buf[3 * x + 1] = row_in1[x];
+            mutable_src_buf[3 * x + 2] = row_in2[x];
+          }
+        }
+        float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread);
+        if (!c_transform.Run(thread, src_buf, dst_buf)) {
+          ok.store(false);  // Reported through the return value at the end.
+          return;
+        }
+        float* JXL_RESTRICT row_out0 = out->PlaneRow(0, y);
+        float* JXL_RESTRICT row_out1 = out->PlaneRow(1, y);
+        float* JXL_RESTRICT row_out2 = out->PlaneRow(2, y);
+        // De-interleave output and convert type.
+        if (is_gray) {  // One value per pixel, replicated into all planes.
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            row_out0[x] = dst_buf[x];
+            row_out1[x] = dst_buf[x];
+            row_out2[x] = dst_buf[x];
+          }
+        } else {  // Three interleaved values per pixel.
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            row_out0[x] = dst_buf[3 * x + 0];
+            row_out1[x] = dst_buf[3 * x + 1];
+            row_out2[x] = dst_buf[3 * x + 2];
+          }
+        }
+      },
+      "Colorspace transform"));
+  return ok.load();  // False if any row failed above.
+}
+
+}  // namespace
+
+Status ImageBundle::TransformTo(const ColorEncoding& c_desired,
+                                const JxlCmsInterface& cms, ThreadPool* pool) {
+  PROFILER_FUNC;  // color_ is passed below as both source and output.
+  JXL_RETURN_IF_ERROR(CopyTo(Rect(color_), c_desired, cms, &color_, pool));
+  c_current_ = c_desired;  // Only reached when the conversion succeeded.
+  return true;
+}
+Status ImageBundle::CopyTo(const Rect& rect, const ColorEncoding& c_desired,
+                           const JxlCmsInterface& cms, Image3F* out,
+                           ThreadPool* pool) const {  // Forwards to CopyToT.
+  return CopyToT(metadata_, this, rect, c_desired, cms, pool, out);
+}
+Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired,
+                         const JxlCmsInterface& cms, ThreadPool* pool,
+                         ImageBundle* store, const ImageBundle** out) {
+  if (in.c_current().SameColorEncoding(c_desired) && !in.HasBlack()) {
+    *out = &in;  // Nothing to convert: hand back the input, store untouched.
+    return true;
+  }
+  // TODO(janwas): avoid copying via createExternal+copyBackToIO
+  // instead of copy+createExternal+copyBackToIO
+  store->SetFromImage(CopyImage(in.color()), in.c_current());
+
+  // Must at least copy the alpha channel for use by external_image.
+  if (in.HasExtraChannels()) {
+    std::vector<ImageF> extra_channels;  // Copies of every extra channel.
+    for (const ImageF& extra_channel : in.extra_channels()) {
+      extra_channels.emplace_back(CopyImage(extra_channel));
+    }
+    store->SetExtraChannels(std::move(extra_channels));
+  }
+
+  if (!store->TransformTo(c_desired, cms, pool)) {
+    return false;  // *out is left unset when the transform fails.
+  }
+  *out = store;
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h
new file mode 100644
index 0000000000..85f8e14e1c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_IMAGE_BUNDLE_H_
+#define LIB_JXL_ENC_IMAGE_BUNDLE_H_
+
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Does color transformation from in.c_current() to c_desired if the color
+// encodings differ or `in` has a black channel; otherwise does nothing.
+// If color transformation is done, stores the transformed values into store and
+// sets the out pointer to store, else leaves store untouched and sets the out
+// pointer to &in.
+// Returns false if color transform fails.
+Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired,
+                         const JxlCmsInterface& cms, ThreadPool* pool,
+                         ImageBundle* store, const ImageBundle** out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_IMAGE_BUNDLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h b/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h
new file mode 100644
index 0000000000..3c364e883d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h
@@ -0,0 +1,54 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_JXL_SKCMS_H_
+#define LIB_JXL_ENC_JXL_SKCMS_H_
+
+// skcms wrapper to rename the skcms symbols to avoid conflicting names with
+// other projects using skcms as well. When using JPEGXL_BUNDLE_SKCMS the
+// bundled functions will be renamed from skcms_ to jxl_skcms_
+
+#ifdef SKCMS_API
+#error "Must include enc_jxl_skcms.h and not skcms.h directly"
+#endif  // SKCMS_API
+
+#if JPEGXL_BUNDLE_SKCMS
+
+#define skcms_252_random_bytes jxl_skcms_252_random_bytes
+#define skcms_AdaptToXYZD50 jxl_skcms_AdaptToXYZD50
+#define skcms_ApproximateCurve jxl_skcms_ApproximateCurve
+#define skcms_ApproximatelyEqualProfiles jxl_skcms_ApproximatelyEqualProfiles
+#define skcms_AreApproximateInverses jxl_skcms_AreApproximateInverses
+#define skcms_GetCHAD jxl_skcms_GetCHAD
+#define skcms_GetTagByIndex jxl_skcms_GetTagByIndex
+#define skcms_GetTagBySignature jxl_skcms_GetTagBySignature
+#define skcms_GetWTPT jxl_skcms_GetWTPT
+#define skcms_Identity_TransferFunction jxl_skcms_Identity_TransferFunction
+#define skcms_MakeUsableAsDestination jxl_skcms_MakeUsableAsDestination
+#define skcms_MakeUsableAsDestinationWithSingleCurve \
+  jxl_skcms_MakeUsableAsDestinationWithSingleCurve
+#define skcms_Matrix3x3_concat jxl_skcms_Matrix3x3_concat
+#define skcms_Matrix3x3_invert jxl_skcms_Matrix3x3_invert
+#define skcms_MaxRoundtripError jxl_skcms_MaxRoundtripError
+#define skcms_Parse jxl_skcms_Parse
+#define skcms_PrimariesToXYZD50 jxl_skcms_PrimariesToXYZD50
+#define skcms_sRGB_Inverse_TransferFunction \
+  jxl_skcms_sRGB_Inverse_TransferFunction
+#define skcms_sRGB_profile jxl_skcms_sRGB_profile
+#define skcms_sRGB_TransferFunction jxl_skcms_sRGB_TransferFunction
+#define skcms_TransferFunction_eval jxl_skcms_TransferFunction_eval
+#define skcms_TransferFunction_invert jxl_skcms_TransferFunction_invert
+#define skcms_TransferFunction_makeHLGish jxl_skcms_TransferFunction_makeHLGish
+#define skcms_TransferFunction_makePQish jxl_skcms_TransferFunction_makePQish
+#define skcms_Transform jxl_skcms_Transform
+#define skcms_TransformWithPalette jxl_skcms_TransformWithPalette
+#define skcms_TRCs_AreApproximateInverse jxl_skcms_TRCs_AreApproximateInverse
+#define skcms_XYZD50_profile jxl_skcms_XYZD50_profile
+
+#endif  // JPEGXL_BUNDLE_SKCMS
+
+#include "skcms.h"
+
+#endif  // LIB_JXL_ENC_JXL_SKCMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg.cc b/third_party/jpeg-xl/lib/jxl/enc_linalg.cc
new file mode 100644
index 0000000000..fe2090a909
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_linalg.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_linalg.h"
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+void ConvertToDiagonal(const ImageD& A, ImageD* const JXL_RESTRICT diag,
+                       ImageD* const JXL_RESTRICT U) {
+#if JXL_ENABLE_ASSERT
+  JXL_ASSERT(A.xsize() == 2);
+  JXL_ASSERT(A.ysize() == 2);
+  JXL_ASSERT(std::abs(A.Row(0)[1] - A.Row(1)[0]) < 1e-15);
+#endif
+  // Writes the eigenvalues of the 2x2 matrix A to diag and a rotation to U.
+  if (std::abs(A.ConstRow(0)[1]) < 1e-15) {
+    // Already diagonal.
+    diag->Row(0)[0] = A.ConstRow(0)[0];
+    diag->Row(0)[1] = A.ConstRow(1)[1];
+    U->Row(0)[0] = U->Row(1)[1] = 1.0;
+    U->Row(0)[1] = U->Row(1)[0] = 0.0;
+    return;
+  }
+  double b = -(A.Row(0)[0] + A.Row(1)[1]);
+  // (a00 - a11)^2 + 4 * a01^2 equals b^2 - 4c in exact arithmetic, but it
+  // cannot round to a negative value, which would make the sqrt below NaN.
+  double diff = A.Row(0)[0] - A.Row(1)[1];
+  double d = diff * diff + 4.0 * A.Row(0)[1] * A.Row(0)[1];
+  double sqd = std::sqrt(d);
+  double l1 = (-b - sqd) * 0.5;
+  double l2 = (-b + sqd) * 0.5;
+  // First column of A - l1 * I, scaled to unit length; it defines U below.
+  double v1[2] = {A.Row(0)[0] - l1, A.Row(1)[0]};
+  double v1n = 1.0 / std::hypot(v1[0], v1[1]);
+  v1[0] = v1[0] * v1n;
+  v1[1] = v1[1] * v1n;
+  diag->Row(0)[0] = l1;
+  diag->Row(0)[1] = l2;
+  U->Row(0)[0] = v1[1];
+  U->Row(0)[1] = -v1[0];
+  U->Row(1)[0] = v1[0];
+  U->Row(1)[1] = v1[1];
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg.h b/third_party/jpeg-xl/lib/jxl/enc_linalg.h
new file mode 100644
index 0000000000..791770d5d4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_linalg.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_LINALG_H_
+#define LIB_JXL_ENC_LINALG_H_
+
+// Linear algebra.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+using ImageD = Plane<double>;
+
+// A is symmetric 2x2, U is orthogonal, A = U * Diagonal(diag) * Transpose(U).
+void ConvertToDiagonal(const ImageD& A, ImageD* JXL_RESTRICT diag,
+                       ImageD* JXL_RESTRICT U);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_LINALG_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc b/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc
new file mode 100644
index 0000000000..967b9a3afb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc
@@ -0,0 +1,118 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_linalg.h"
+
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+ImageD Identity(const size_t N) {
+  ImageD eye(N, N);  // Ones on the main diagonal, zeros everywhere else.
+  for (size_t y = 0; y < N; ++y) {
+    double* JXL_RESTRICT row = eye.Row(y);
+    for (size_t x = 0; x < N; ++x) row[x] = 0.0;
+    row[y] = 1.0;
+  }
+  return eye;
+}
+
+ImageD Diagonal(const ImageD& d) {
+  JXL_ASSERT(d.ysize() == 1);
+  const size_t n = d.xsize();
+  ImageD result(n, n);  // Square matrix with d's single row on its diagonal.
+  const double* JXL_RESTRICT entries = d.Row(0);
+  for (size_t y = 0; y < n; ++y) {
+    double* JXL_RESTRICT row = result.Row(y);
+    for (size_t x = 0; x < n; ++x) row[x] = (x == y) ? entries[y] : 0.0;
+  }
+  return result;
+}
+
+ImageD MatMul(const ImageD& A, const ImageD& B) {
+  JXL_ASSERT(A.ysize() == B.xsize());
+  ImageD product(A.xsize(), B.ysize());
+  for (size_t y = 0; y < B.ysize(); ++y) {
+    const double* const JXL_RESTRICT b_row = B.Row(y);
+    double* const JXL_RESTRICT dst = product.Row(y);
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      // Entry (row y, column x) is the sum over k of A(k, x) * B(y, k).
+      double acc = 0.0;
+      for (size_t k = 0; k < B.xsize(); ++k) acc += A.Row(k)[x] * b_row[k];
+      dst[x] = acc;
+    }
+  }
+  return product;
+}
+
+ImageD Transpose(const ImageD& A) {
+  ImageD flipped(A.ysize(), A.xsize());  // Width and height are swapped.
+  for (size_t y = 0; y < A.ysize(); ++y) {
+    const double* const src = A.Row(y);
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      flipped.Row(x)[y] = src[x];
+    }
+  }
+  return flipped;
+}
+
+ImageD RandomSymmetricMatrix(const size_t N, Rng& rng, const double vmin,
+                             const double vmax) {
+  ImageD sym(N, N);
+  GenerateImage(rng, &sym, vmin, vmax);
+  for (size_t lo = 0; lo + 1 < N; ++lo) {  // Mirror (hi, lo) onto (lo, hi).
+    for (size_t hi = lo + 1; hi < N; ++hi) {
+      sym.Row(lo)[hi] = sym.Row(hi)[lo];
+    }
+  }
+  return sym;
+}
+
+void VerifyMatrixEqual(const ImageD& A, const ImageD& B, const double eps) {
+  ASSERT_EQ(A.xsize(), B.xsize());  // Dimensions must agree first.
+  ASSERT_EQ(A.ysize(), B.ysize());
+  for (size_t y = 0; y < A.ysize(); ++y) {
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      ASSERT_NEAR(A.Row(y)[x], B.Row(y)[x], eps);  // Element-wise, within eps.
+    }
+  }
+}
+
+void VerifyOrthogonal(const ImageD& A, const double eps) {
+  VerifyMatrixEqual(Identity(A.xsize()), MatMul(Transpose(A), A), eps);
+}  // Passes when the product of A and its transpose is the identity.
+
+TEST(LinAlgTest, ConvertToDiagonal) {
+  {  // Identity input: expects U == I and both diagonal entries equal to 1.
+    ImageD I = Identity(2);
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(I, &d, &U);
+    VerifyMatrixEqual(I, U, 1e-15);
+    for (size_t k = 0; k < 2; ++k) {
+      ASSERT_NEAR(d.Row(0)[k], 1.0, 1e-15);
+    }
+  }
+  {  // Fixed matrix with off-diagonal entries of 2: checks the general path.
+    ImageD A = Identity(2);
+    A.Row(0)[1] = A.Row(1)[0] = 2.0;
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(A, &d, &U);
+    VerifyOrthogonal(U, 1e-12);
+    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
+  }
+  Rng rng(0);  // Fixed seed, so the random cases below are reproducible.
+  for (size_t i = 0; i < 100; ++i) {
+    ImageD A = RandomSymmetricMatrix(2, rng, -1.0, 1.0);
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(A, &d, &U);
+    VerifyOrthogonal(U, 1e-12);  // U must be orthogonal ...
+    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
+  }  // ... and U * Diagonal(d) * Transpose(U) must reproduce A.
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.cc b/third_party/jpeg-xl/lib/jxl/enc_modular.cc
new file mode 100644
index 0000000000..0453b34654
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_modular.cc
@@ -0,0 +1,1762 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_modular.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <atomic>
+#include <limits>
+#include <queue>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cluster.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_debug_tree.h"
+#include "lib/jxl/modular/encoding/enc_encoding.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/enc_transform.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+
+namespace {
+constexpr bool kPrintTree = false;
+
+// Squeeze default quantization factors
+// these quantization factors are for -Q 50  (other qualities simply scale the
+// factors; things are rounded down and obviously cannot get below 1)
+static const float squeeze_quality_factor =
+    0.35;  // for easy tweaking of the quality range (decrease this number for
+           // higher quality)
+static const float squeeze_luma_factor =
+    1.1;  // for easy tweaking of the balance between luma (or anything
+          // non-chroma) and chroma (decrease this number for higher quality
+          // luma)
+static const float squeeze_quality_factor_xyb = 2.4f;
+static const float squeeze_xyb_qtable[3][16] = {
+    {163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, 0.64, 0.32, 0.16,
+     0.08, 0.04, 0.02, 0.01, 0.005},  // Y
+    {1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5,
+     0.5},  // X
+    {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5,
+     0.5},  // B-Y
+};
+
+static const float squeeze_luma_qtable[16] = {
+    163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28,
+    0.64,   0.32,  0.16,  0.08,  0.04,  0.02, 0.01, 0.005};
+// for 8-bit input, the range of YCoCg chroma is -255..255 so basically this
+// does 4:2:0 subsampling (two most fine grained layers get quantized away)
+static const float squeeze_chroma_qtable[16] = {
+    1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+// Builds a tree that bisects `cutoffs` on `property`; `cutoffs` must be sorted.
+Tree MakeFixedTree(int property, const std::vector<int32_t>& cutoffs,
+                   Predictor pred, size_t num_pixels) {
+  size_t log_px = CeilLog2Nonzero(num_pixels);
+  size_t min_gap = 0;  // Ranges of at most this many cutoffs stay leaves.
+  // Reduce fixed tree height when encoding small images.
+  if (log_px < 14) {
+    min_gap = 8 * (14 - log_px);
+  }
+  Tree tree;
+  struct NodeInfo {
+    size_t begin, end, pos;  // Cutoff range [begin, end) and its node index.
+  };
+  std::queue<NodeInfo> q;  // Pending ranges, processed breadth-first.
+  // Leaf IDs will be set by roundtrip decoding the tree.
+  tree.push_back(PropertyDecisionNode::Leaf(pred));
+  q.push(NodeInfo{0, cutoffs.size(), 0});
+  while (!q.empty()) {
+    NodeInfo info = q.front();
+    q.pop();
+    if (info.begin + min_gap >= info.end) continue;  // Keep the leaf as is.
+    uint32_t split = (info.begin + info.end) / 2;  // Middle cutoff of range.
+    tree[info.pos] =
+        PropertyDecisionNode::Split(property, cutoffs[split], tree.size());
+    q.push(NodeInfo{split + 1, info.end, tree.size()});  // Cutoffs above split.
+    tree.push_back(PropertyDecisionNode::Leaf(pred));
+    q.push(NodeInfo{info.begin, split, tree.size()});  // Cutoffs below split.
+    tree.push_back(PropertyDecisionNode::Leaf(pred));
+  }
+  return tree;
+}
+
+Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) {
+  if (tree_kind == ModularOptions::TreeKind::kJpegTranscodeACMeta ||
+      tree_kind == ModularOptions::TreeKind::kTrivialTreeNoPredictor) {
+    // All the data is 0, so no need for a fancy tree.
+    return {PropertyDecisionNode::Leaf(Predictor::Zero)};
+  }
+  if (tree_kind == ModularOptions::TreeKind::kFalconACMeta) {
+    // All the data is 0 except the quant field. TODO(veluca): make that 0 too.
+    return {PropertyDecisionNode::Leaf(Predictor::Left)};
+  }
+  if (tree_kind == ModularOptions::TreeKind::kACMeta) {
+    // Small image: a single Left-predicting leaf replaces the tree below.
+    if (total_pixels < 1024) {
+      return {PropertyDecisionNode::Leaf(Predictor::Left)};
+    }
+    Tree tree;  // Hand-built; the numbers in comments are node indices.
+    // 0: c > 1
+    tree.push_back(PropertyDecisionNode::Split(0, 1, 1));
+    // 1: c > 2
+    tree.push_back(PropertyDecisionNode::Split(0, 2, 3));
+    // 2: c > 0
+    tree.push_back(PropertyDecisionNode::Split(0, 0, 5));
+    // 3: EPF control field (all 0 or 4), top > 0
+    tree.push_back(PropertyDecisionNode::Split(6, 0, 21));
+    // 4: ACS+QF, y > 0
+    tree.push_back(PropertyDecisionNode::Split(2, 0, 7));
+    // 5: CfL x
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
+    // 6: CfL b
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
+    // 7: QF: split according to the left quant value.
+    tree.push_back(PropertyDecisionNode::Split(7, 5, 9));
+    // 8: ACS: split in 4 segments (8x8 from 0 to 3, large square 4-5, large
+    // rectangular 6-11, 8x8 12+), according to previous ACS value.
+    tree.push_back(PropertyDecisionNode::Split(7, 5, 15));
+    // QF
+    tree.push_back(PropertyDecisionNode::Split(7, 11, 11));  // 9
+    tree.push_back(PropertyDecisionNode::Split(7, 3, 13));   // 10
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));  // 11
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));  // 12
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));  // 13
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));  // 14
+    // ACS
+    tree.push_back(PropertyDecisionNode::Split(7, 11, 17));  // 15
+    tree.push_back(PropertyDecisionNode::Split(7, 3, 19));   // 16
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 17
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 18
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 19
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 20
+    // EPF, left > 0
+    tree.push_back(PropertyDecisionNode::Split(7, 0, 23));  // 21
+    tree.push_back(PropertyDecisionNode::Split(7, 0, 25));  // 22
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 23
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 24
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 25
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));  // 26
+    return tree;
+  }
+  if (tree_kind == ModularOptions::TreeKind::kWPFixedDC) {
+    std::vector<int32_t> cutoffs = {
+        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
+        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
+        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
+    return MakeFixedTree(kWPProp, cutoffs, Predictor::Weighted, total_pixels);
+  }
+  if (tree_kind == ModularOptions::TreeKind::kGradientFixedDC) {
+    std::vector<int32_t> cutoffs = {  // Same values as for kWPFixedDC above.
+        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
+        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
+        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
+    return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient,
+                         total_pixels);
+  }
+  JXL_ABORT("Unreachable");  // No other tree_kind is handled here.
+  return {};
+}
+
+// Merges trees[begin, end) into `tree` using nodes that decide on stream_id,
+// as defined by `tree_splits`; recurses by halving the range.
+void MergeTrees(const std::vector<Tree>& trees,
+                const std::vector<size_t>& tree_splits, size_t begin,
+                size_t end, Tree* tree) {
+  JXL_ASSERT(trees.size() + 1 == tree_splits.size());
+  JXL_ASSERT(end > begin);
+  JXL_ASSERT(end <= trees.size());
+  if (end == begin + 1) {
+    // Insert the tree, adding the appropriate offset to all child nodes.
+    // This will make the leaf IDs wrong, but subsequent roundtripping will fix
+    // them.
+    size_t sz = tree->size();  // Index at which the inserted tree starts.
+    tree->insert(tree->end(), trees[begin].begin(), trees[begin].end());
+    for (size_t i = sz; i < tree->size(); i++) {
+      (*tree)[i].lchild += sz;
+      (*tree)[i].rchild += sz;
+    }
+    return;
+  }
+  size_t mid = (begin + end) / 2;
+  size_t splitval = tree_splits[mid] - 1;  // Threshold on the stream id.
+  size_t cur = tree->size();  // Index of the decision node added next.
+  tree->emplace_back(1 /*stream_id*/, splitval, 0, 0, Predictor::Zero, 0, 1);
+  (*tree)[cur].lchild = tree->size();  // Upper half of the range goes here.
+  MergeTrees(trees, tree_splits, mid, end, tree);
+  (*tree)[cur].rchild = tree->size();  // Lower half of the range goes here.
+  MergeTrees(trees, tree_splits, begin, mid, tree);
+}
+
+void QuantizeChannel(Channel& ch, const int q) {
+  if (q == 1) return;  // Every value is already a multiple of 1.
+  const int half = q / 2;
+  const size_t width = ch.plane.xsize();
+  for (size_t y = 0; y < ch.plane.ysize(); y++) {
+    pixel_type* samples = ch.plane.Row(y);
+    for (size_t x = 0; x < width; x++) {
+      // Round the magnitude to a multiple of q, then restore the sign.
+      const pixel_type v = samples[x];
+      samples[x] = v < 0 ? -((-v + half) / q) * q : ((v + half) / q) * q;
+    }
+  }
+}
+
+// convert binary32 float that corresponds to custom [bits]-bit float (with
+// [exp_bits] exponent bits) to a [bits]-bit integer representation that should
+// fit in pixel_type
+Status float_to_int(const float* const row_in, pixel_type* const row_out,
+                    size_t xsize, unsigned int bits, unsigned int exp_bits,
+                    bool fp, double dfactor) {
+  JXL_ASSERT(sizeof(pixel_type) * 8 >= bits);
+  if (!fp) {  // Integer samples: scale by dfactor, round half away from zero.
+    if (bits > 22) {  // Scale in double precision for the wider bit depths.
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = row_in[x] * dfactor + (row_in[x] < 0 ? -0.5 : 0.5);
+      }
+    } else {
+      float factor = dfactor;
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = row_in[x] * factor + (row_in[x] < 0 ? -0.5f : 0.5f);
+      }
+    }
+    return true;
+  }
+  if (bits == 32 && fp) {  // Already binary32: copy the bit patterns verbatim.
+    JXL_ASSERT(exp_bits == 8);
+    memcpy((void*)row_out, (const void*)row_in, 4 * xsize);
+    return true;
+  }
+
+  int exp_bias = (1 << (exp_bits - 1)) - 1;
+  int max_exp = (1 << exp_bits) - 1;
+  uint32_t sign = (1u << (bits - 1));  // Sign bit of the target format.
+  int mant_bits = bits - exp_bits - 1;
+  int mant_shift = 23 - mant_bits;  // binary32 mantissa bits to be dropped.
+  for (size_t x = 0; x < xsize; ++x) {
+    uint32_t f;
+    memcpy(&f, &row_in[x], 4);
+    int signbit = (f >> 31);
+    f &= 0x7fffffff;
+    if (f == 0) {  // +0 or -0: only the sign bit is emitted.
+      row_out[x] = (signbit ? sign : 0);
+      continue;
+    }
+    int exp = (f >> 23) - 127;
+    if (exp == 128) return JXL_FAILURE("Inf/NaN not allowed");
+    int mantissa = (f & 0x007fffff);
+    // broke up the binary32 into its parts, now reassemble into
+    // arbitrary float
+    exp += exp_bias;
+    if (exp <= 0) {  // will become a subnormal number (exponent field 0)
+      // add implicit leading 1 to mantissa
+      mantissa |= 0x00800000;
+      if (exp < -mant_bits) {
+        return JXL_FAILURE(
+            "Invalid float number: %g cannot be represented with %i "
+            "exp_bits and %i mant_bits (exp %i)",
+            row_in[x], exp_bits, mant_bits, exp);
+      }
+      mantissa >>= 1 - exp;
+      exp = 0;
+    }
+    // exp should be representable in exp_bits, otherwise input was
+    // invalid
+    if (exp > max_exp) return JXL_FAILURE("Invalid float exponent");
+    if (mantissa & ((1 << mant_shift) - 1)) {  // Low bits would be dropped.
+      return JXL_FAILURE("%g is losing precision (mant: %x)", row_in[x],
+                         mantissa);
+    }
+    mantissa >>= mant_shift;
+    f = (signbit ? sign : 0);  // Reassemble: sign | exponent | mantissa.
+    f |= (exp << mant_bits);
+    f |= mantissa;
+    row_out[x] = (pixel_type)f;
+  }
+  return true;
+}
+}  // namespace
+
+ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
+                                         const CompressParams& cparams_orig)
+    : frame_dim_(frame_header.ToFrameDimensions()), cparams_(cparams_orig) {
+  size_t num_streams =
+      ModularStreamId::Num(frame_dim_, frame_header.passes.num_passes);
+  if (cparams_.ModularPartIsLossless()) {  // decoding_speed_tier presets
+    switch (cparams_.decoding_speed_tier) {
+      case 0:  // options are left as given
+        break;
+      case 1:
+        cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kWPOnly;
+        break;
+      case 2: {
+        cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kGradientOnly;
+        cparams_.options.predictor = Predictor::Gradient;
+        break;
+      }
+      case 3: {  // LZ77, no Gradient.
+        cparams_.options.nb_repeats = 0;
+        cparams_.options.predictor = Predictor::Gradient;
+        break;
+      }
+      default: {  // LZ77, no predictor.
+        cparams_.options.nb_repeats = 0;
+        cparams_.options.predictor = Predictor::Zero;
+        break;
+      }
+    }
+  }
+  if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive &&
+      cparams_.ModularPartIsLossless()) {
+    cparams_.options.tree_kind =
+        ModularOptions::TreeKind::kTrivialTreeNoPredictor;
+    cparams_.options.nb_repeats = 0;
+  }
+  stream_images_.resize(num_streams);
+  // NOTE(review): above, an unset (negative) `responsive` still counts as true.
+  // use a sensible default if nothing explicit is specified:
+  // Squeeze for lossy, no squeeze for lossless
+  if (cparams_.responsive < 0) {
+    if (cparams_.ModularPartIsLossless()) {
+      cparams_.responsive = 0;
+    } else {
+      cparams_.responsive = 1;
+    }
+  }
+
+  if (cparams_.speed_tier > SpeedTier::kWombat) {
+    cparams_.options.splitting_heuristics_node_threshold = 192;
+  } else {
+    cparams_.options.splitting_heuristics_node_threshold = 96;
+  }
+  {
+    // Choose which properties the splitting heuristics may use.
+    std::vector<uint32_t> prop_order;
+    if (cparams_.responsive) {
+      // Properties in order of their likelihood of being useful for Squeeze
+      // residuals.
+      prop_order = {0, 1, 4, 5, 6, 7, 8, 15, 9, 10, 11, 12, 13, 14, 2, 3};
+    } else {
+      // Same, but for the non-Squeeze case.
+      prop_order = {0, 1, 15, 9, 10, 11, 12, 13, 14, 2, 3, 4, 5, 6, 7, 8};
+    }
+    switch (cparams_.speed_tier) {  // how many leading entries of prop_order
+      case SpeedTier::kSquirrel:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 8);
+        cparams_.options.max_property_values = 32;
+        break;
+      case SpeedTier::kKitten:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 10);
+        cparams_.options.max_property_values = 64;
+        break;
+      case SpeedTier::kTortoise:
+        cparams_.options.splitting_heuristics_properties = prop_order;
+        cparams_.options.max_property_values = 256;
+        break;
+      default:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 6);
+        cparams_.options.max_property_values = 16;
+        break;
+    }
+    if (cparams_.speed_tier > SpeedTier::kTortoise) {
+      // Gradient in previous channels.
+      for (int i = 0; i < cparams_.options.max_properties; i++) {
+        cparams_.options.splitting_heuristics_properties.push_back(
+            kNumNonrefProperties + i * 4 + 3);
+      }
+    } else {
+      // All the extra properties in Tortoise mode.
+      for (int i = 0; i < cparams_.options.max_properties * 4; i++) {
+        cparams_.options.splitting_heuristics_properties.push_back(
+            kNumNonrefProperties + i);
+      }
+    }
+  }
+  // A predictor equal to static_cast<Predictor>(-1) means "none chosen yet".
+  if (cparams_.options.predictor == static_cast<Predictor>(-1)) {
+    // no explicit predictor(s) given, set a good default
+    if ((cparams_.speed_tier <= SpeedTier::kTortoise ||
+         cparams_.modular_mode == false) &&
+        cparams_.IsLossless() && cparams_.responsive == false) {
+      // TODO(veluca): allow all predictors that don't break residual
+      // multipliers in lossy mode.
+      cparams_.options.predictor = Predictor::Variable;
+    } else if (cparams_.responsive || cparams_.lossy_palette) {
+      // zero predictor for Squeeze residues and lossy palette
+      cparams_.options.predictor = Predictor::Zero;
+    } else if (!cparams_.IsLossless()) {
+      // If not responsive and lossy. TODO(veluca): use near_lossless instead?
+      cparams_.options.predictor = Predictor::Gradient;
+    } else if (cparams_.speed_tier < SpeedTier::kFalcon) {
+      // try median and weighted predictor for anything else
+      cparams_.options.predictor = Predictor::Best;
+    } else if (cparams_.speed_tier == SpeedTier::kFalcon) {
+      // just weighted predictor in falcon mode
+      cparams_.options.predictor = Predictor::Weighted;
+    } else if (cparams_.speed_tier > SpeedTier::kFalcon) {
+      // just gradient predictor in thunder mode
+      cparams_.options.predictor = Predictor::Gradient;
+    }
+  } else {
+    delta_pred_ = cparams_.options.predictor;  // kept for lossy palette use
+    if (cparams_.lossy_palette) cparams_.options.predictor = Predictor::Zero;
+  }
+  if (!cparams_.ModularPartIsLossless()) {  // lossy: these fall back to Zero
+    if (cparams_.options.predictor == Predictor::Weighted ||
+        cparams_.options.predictor == Predictor::Variable ||
+        cparams_.options.predictor == Predictor::Best)
+      cparams_.options.predictor = Predictor::Zero;
+  }
+  tree_splits_.push_back(0);  // stream-id boundaries, starting at stream 0
+  if (cparams_.modular_mode == false) {
+    cparams_.options.fast_decode_multiplier = 1.0f;
+    tree_splits_.push_back(ModularStreamId::VarDCTDC(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::ModularDC(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::ACMetadata(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::QuantTable(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::ModularAC(0, 0).ID(frame_dim_));
+    ac_metadata_size.resize(frame_dim_.num_dc_groups);
+    extra_dc_precision.resize(frame_dim_.num_dc_groups);
+  }
+  tree_splits_.push_back(num_streams);  // final boundary: one past the end
+  cparams_.options.max_chan_size = frame_dim_.group_dim;
+  cparams_.options.group_dim = frame_dim_.group_dim;
+
+  // TODO(veluca): figure out how to use different predictor sets per channel.
+  stream_options_.resize(num_streams, cparams_.options);
+}
+
+bool do_transform(Image& image, const Transform& tr,
+                  const weighted::Header& wp_header,
+                  jxl::ThreadPool* pool = nullptr, bool force_jxlart = false) {
+  Transform applied = tr;  // work on a copy; `tr` itself is never modified
+  if (force_jxlart) {
+    // Only MetaApply is invoked in this mode; TransformForward is skipped.
+    if (!applied.MetaApply(image)) return false;
+  } else if (!TransformForward(applied, image, wp_header, pool)) {
+    return false;
+  }
+  image.transform.push_back(applied);  // record only transforms that applied
+  return true;
+}
+
+Status ModularFrameEncoder::ComputeEncodingData(
+    const FrameHeader& frame_header, const ImageMetadata& metadata,
+    Image3F* JXL_RESTRICT color, const std::vector<ImageF>& extra_channels,
+    PassesEncoderState* JXL_RESTRICT enc_state, const JxlCmsInterface& cms,
+    ThreadPool* pool, AuxOut* aux_out, bool do_color) {
+  JXL_DEBUG_V(6, "Computing modular encoding data for frame %s",
+              frame_header.DebugString().c_str());
+  // Order: inverse Gaborish, patches, float->int, transforms, quantize, groups.
+  if (do_color && frame_header.loop_filter.gab) {
+    float w = 0.9908511000000001f;
+    float weights[3] = {w, w, w};
+    GaborishInverse(color, weights, pool);
+  }
+  // Build the patch dictionary and subtract the patches from `color`.
+  if (do_color && metadata.bit_depth.bits_per_sample <= 16 &&
+      cparams_.speed_tier < SpeedTier::kCheetah &&
+      cparams_.decoding_speed_tier < 2) {
+    FindBestPatchDictionary(*color, enc_state, cms, nullptr, aux_out,
+                            cparams_.color_transform == ColorTransform::kXYB);
+    PatchDictionaryEncoder::SubtractFrom(
+        enc_state->shared.image_features.patches, color);
+  }
+
+  // Convert ImageBundle to modular Image object
+  const size_t xsize = frame_dim_.xsize;
+  const size_t ysize = frame_dim_.ysize;
+
+  int nb_chans = 3;
+  if (metadata.color_encoding.IsGray() &&
+      cparams_.color_transform == ColorTransform::kNone) {
+    nb_chans = 1;
+  }
+  if (!do_color) nb_chans = 0;
+
+  nb_chans += extra_channels.size();
+
+  bool fp = metadata.bit_depth.floating_point_sample &&
+            cparams_.color_transform != ColorTransform::kXYB;
+
+  // bits_per_sample is just metadata for XYB images.
+  if (metadata.bit_depth.bits_per_sample >= 32 && do_color &&
+      cparams_.color_transform != ColorTransform::kXYB) {
+    if (metadata.bit_depth.bits_per_sample == 32 && fp == false) {
+      return JXL_FAILURE("uint32_t not supported in enc_modular");
+    } else if (metadata.bit_depth.bits_per_sample > 32) {
+      return JXL_FAILURE("bits_per_sample > 32 not supported");
+    }
+  }
+
+  // in the non-float case, there is an implicit 0 sign bit
+  int max_bitdepth =
+      do_color ? metadata.bit_depth.bits_per_sample + (fp ? 0 : 1) : 0;
+  Image& gi = stream_images_[0];
+  gi = Image(xsize, ysize, metadata.bit_depth.bits_per_sample, nb_chans);
+  int c = 0;
+  if (cparams_.color_transform == ColorTransform::kXYB &&
+      cparams_.modular_mode == true) {
+    float enc_factors[3] = {32768.0f, 2048.0f, 2048.0f};
+    if (cparams_.butteraugli_distance > 0 && !cparams_.responsive) {
+      // quantize XYB here and then treat it as a lossless image
+      enc_factors[0] *= 1.f / (1.f + 23.f * cparams_.butteraugli_distance);
+      enc_factors[1] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance);
+      enc_factors[2] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance);
+      cparams_.butteraugli_distance = 0;
+    }
+    if (cparams_.manual_xyb_factors.size() == 3) {
+      DequantMatricesSetCustomDC(&enc_state->shared.matrices,
+                                 cparams_.manual_xyb_factors.data());
+      // TODO(jon): update max_bitdepth in this case
+    } else {
+      DequantMatricesSetCustomDC(&enc_state->shared.matrices, enc_factors);
+      max_bitdepth = 12;
+    }
+  }
+  pixel_type maxval = gi.bitdepth < 32 ? (1u << gi.bitdepth) - 1 : 0;
+  if (do_color) {
+    for (; c < 3; c++) {
+      if (metadata.color_encoding.IsGray() &&
+          cparams_.color_transform == ColorTransform::kNone &&
+          c != (cparams_.color_transform == ColorTransform::kXYB ? 1 : 0))
+        continue;  // gray + kNone keeps only c == 0 (the kXYB arm can't hold)
+      int c_out = c;
+      // XYB is encoded as YX(B-Y)
+      if (cparams_.color_transform == ColorTransform::kXYB && c < 2)
+        c_out = 1 - c_out;
+      double factor = maxval;
+      if (cparams_.color_transform == ColorTransform::kXYB)
+        factor = enc_state->shared.matrices.InvDCQuant(c);
+      if (c == 2 && cparams_.color_transform == ColorTransform::kXYB) {
+        JXL_ASSERT(!fp);
+        for (size_t y = 0; y < ysize; ++y) {
+          const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y);
+          pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y);
+          pixel_type* const JXL_RESTRICT row_Y = gi.channel[0].Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[x] = row_in[x] * factor + 0.5f;
+            row_out[x] -= row_Y[x];
+            // zero the lsb of B
+            row_out[x] = row_out[x] / 2 * 2;
+          }
+        }
+      } else {
+        int bits = metadata.bit_depth.bits_per_sample;
+        int exp_bits = metadata.bit_depth.exponent_bits_per_sample;
+        gi.channel[c_out].hshift =
+            enc_state->shared.frame_header.chroma_subsampling.HShift(c);
+        gi.channel[c_out].vshift =
+            enc_state->shared.frame_header.chroma_subsampling.VShift(c);
+        size_t xsize_shifted = DivCeil(xsize, 1 << gi.channel[c_out].hshift);
+        size_t ysize_shifted = DivCeil(ysize, 1 << gi.channel[c_out].vshift);
+        gi.channel[c_out].shrink(xsize_shifted, ysize_shifted);
+        std::atomic<bool> has_error{false};  // set by the row tasks below
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const int task, const int thread) {
+              const size_t y = task;
+              const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y);
+              pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y);
+              if (!float_to_int(row_in, row_out, xsize_shifted, bits, exp_bits,
+                                fp, factor)) {
+                has_error = true;
+              };
+            },
+            "float2int"));
+        if (has_error) {
+          return JXL_FAILURE("Error in float to integer conversion");
+        }
+      }
+    }
+    if (metadata.color_encoding.IsGray() &&
+        cparams_.color_transform == ColorTransform::kNone)
+      c = 1;
+  }
+  // Extra channels are stored after the color channels (c keeps counting).
+  for (size_t ec = 0; ec < extra_channels.size(); ec++, c++) {
+    const ExtraChannelInfo& eci = metadata.extra_channel_info[ec];
+    size_t ecups = frame_header.extra_channel_upsampling[ec];
+    gi.channel[c].shrink(DivCeil(frame_dim_.xsize_upsampled, ecups),
+                         DivCeil(frame_dim_.ysize_upsampled, ecups));
+    gi.channel[c].hshift = gi.channel[c].vshift =
+        CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling);
+
+    int bits = eci.bit_depth.bits_per_sample;
+    int exp_bits = eci.bit_depth.exponent_bits_per_sample;
+    bool fp = eci.bit_depth.floating_point_sample;
+    double factor = (fp ? 1 : ((1u << eci.bit_depth.bits_per_sample) - 1));
+    if (bits + (fp ? 0 : 1) > max_bitdepth) max_bitdepth = bits + (fp ? 0 : 1);
+    std::atomic<bool> has_error{false};  // set by the row tasks below
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, gi.channel[c].plane.ysize(), ThreadPool::NoInit,
+        [&](const int task, const int thread) {
+          const size_t y = task;
+          const float* const JXL_RESTRICT row_in = extra_channels[ec].Row(y);
+          pixel_type* const JXL_RESTRICT row_out = gi.channel[c].Row(y);
+          if (!float_to_int(row_in, row_out, gi.channel[c].plane.xsize(), bits,
+                            exp_bits, fp, factor)) {
+            has_error = true;
+          };
+        },
+        "float2int"));
+    if (has_error) return JXL_FAILURE("Error in float to integer conversion");
+  }
+  JXL_ASSERT(c == nb_chans);
+
+  int level_max_bitdepth = (cparams_.level == 5 ? 16 : 32);
+  if (max_bitdepth > level_max_bitdepth)
+    return JXL_FAILURE(
+        "Bitdepth too high for level %i (need %i bits, have only %i in this "
+        "level)",
+        cparams_.level, max_bitdepth, level_max_bitdepth);
+
+  // Set options and apply transformations
+  if (!cparams_.ModularPartIsLossless()) {
+    if (cparams_.palette_colors != 0) {
+      JXL_DEBUG_V(3, "Lossy encode, not doing palette transforms");
+    }
+    if (cparams_.color_transform == ColorTransform::kXYB) {
+      cparams_.channel_colors_pre_transform_percent = 0;
+    }
+    cparams_.channel_colors_percent = 0;
+    cparams_.palette_colors = 0;
+    cparams_.lossy_palette = false;
+  }
+
+  // if few colors, do all-channel palette before trying channel palette
+  // Logic is as follows:
+  // - if you can make a palette with few colors (arbitrary threshold: 200),
+  //   then you can also make channel palettes, but they will just be extra
+  //   signaling cost for almost no benefit
+  // - if the palette needs more colors, then channel palette might help to
+  //   reduce palette signaling cost
+  if (cparams_.palette_colors != 0 &&
+      cparams_.speed_tier < SpeedTier::kFalcon) {
+    // all-channel palette (e.g. RGBA)
+    if (gi.channel.size() > 1) {
+      Transform maybe_palette(TransformId::kPalette);
+      maybe_palette.begin_c = gi.nb_meta_channels;
+      maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
+      maybe_palette.nb_colors =
+          std::min(std::min(200, (int)(xsize * ysize / 8)),
+                   std::abs(cparams_.palette_colors) / 16);
+      maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette.lossy_palette = false;
+      do_transform(gi, maybe_palette, weighted::Header(), pool);
+    }
+  }
+
+  // Global channel palette
+  if (cparams_.channel_colors_pre_transform_percent > 0 &&
+      !cparams_.lossy_palette &&
+      (cparams_.speed_tier <= SpeedTier::kThunder ||
+       (do_color && metadata.bit_depth.bits_per_sample > 8))) {
+    // single channel palette (like FLIF's ChannelCompact)
+    size_t nb_channels = gi.channel.size() - gi.nb_meta_channels;
+    int orig_bitdepth = max_bitdepth;
+    max_bitdepth = 0;
+    for (size_t i = 0; i < nb_channels; i++) {
+      int32_t min, max;
+      compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
+      int64_t colors = (int64_t)max - min + 1;
+      JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
+      Transform maybe_palette_1(TransformId::kPalette);
+      maybe_palette_1.begin_c = i + gi.nb_meta_channels;
+      maybe_palette_1.num_c = 1;
+      // simple heuristic: if less than X percent of the values in the range
+      // actually occur, it is probably worth it to do a compaction
+      // (but only if the channel palette is less than 6% the size of the
+      // image itself)
+      maybe_palette_1.nb_colors = std::min(
+          (int)(xsize * ysize / 16),
+          (int)(cparams_.channel_colors_pre_transform_percent / 100. * colors));
+      if (do_transform(gi, maybe_palette_1, weighted::Header(), pool)) {
+        // effective bit depth is lower, adjust quantization accordingly
+        compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
+        if (max < maxval) maxval = max;
+        int ch_bitdepth =
+            (max > 0 ? CeilLog2Nonzero(static_cast<uint32_t>(max)) : 0);
+        if (ch_bitdepth > max_bitdepth) max_bitdepth = ch_bitdepth;
+      } else
+        max_bitdepth = orig_bitdepth;  // palette not applied to this channel
+    }
+  }
+
+  // Global palette
+  if ((cparams_.palette_colors != 0 || cparams_.lossy_palette) &&
+      cparams_.speed_tier < SpeedTier::kFalcon) {
+    // all-channel palette (e.g. RGBA)
+    if (gi.channel.size() - gi.nb_meta_channels > 1) {
+      Transform maybe_palette(TransformId::kPalette);
+      maybe_palette.begin_c = gi.nb_meta_channels;
+      maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
+      maybe_palette.nb_colors =
+          std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors));
+      maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette.lossy_palette =
+          (cparams_.lossy_palette && maybe_palette.num_c == 3);
+      if (maybe_palette.lossy_palette) {
+        maybe_palette.predictor = delta_pred_;
+      }
+      // TODO(veluca): use a custom weighted header if using the weighted
+      // predictor.
+      do_transform(gi, maybe_palette, weighted::Header(), pool,
+                   cparams_.options.zero_tokens);  // passed as force_jxlart
+    }
+    // all-minus-one-channel palette (RGB with separate alpha, or CMY with
+    // separate K)
+    if (gi.channel.size() - gi.nb_meta_channels > 3) {
+      Transform maybe_palette_3(TransformId::kPalette);
+      maybe_palette_3.begin_c = gi.nb_meta_channels;
+      maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
+      maybe_palette_3.nb_colors =
+          std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors));
+      maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette_3.lossy_palette = cparams_.lossy_palette;
+      if (maybe_palette_3.lossy_palette) {
+        maybe_palette_3.predictor = delta_pred_;
+      }
+      do_transform(gi, maybe_palette_3, weighted::Header(), pool,
+                   cparams_.options.zero_tokens);  // passed as force_jxlart
+    }
+  }
+
+  // don't do an RCT if we're short on bits
+  if (cparams_.color_transform == ColorTransform::kNone && do_color &&
+      gi.channel.size() - gi.nb_meta_channels >= 3 &&
+      max_bitdepth + 1 < level_max_bitdepth) {
+    if (cparams_.colorspace < 0 && (!cparams_.ModularPartIsLossless() ||
+                                    cparams_.speed_tier > SpeedTier::kHare)) {
+      Transform ycocg{TransformId::kRCT};
+      ycocg.rct_type = 6;
+      ycocg.begin_c = gi.nb_meta_channels;
+      do_transform(gi, ycocg, weighted::Header(), pool);
+      max_bitdepth++;
+    } else if (cparams_.colorspace > 0) {
+      Transform sg(TransformId::kRCT);
+      sg.begin_c = gi.nb_meta_channels;
+      sg.rct_type = cparams_.colorspace;
+      do_transform(gi, sg, weighted::Header(), pool);
+      max_bitdepth++;
+    }
+  }
+
+  // don't do squeeze if we don't have some spare bits
+  if (cparams_.responsive && !gi.channel.empty() &&
+      max_bitdepth + 2 < level_max_bitdepth) {
+    Transform t(TransformId::kSqueeze);
+    t.squeezes = cparams_.squeezes;
+    do_transform(gi, t, weighted::Header(), pool);
+    max_bitdepth += 2;
+  }
+
+  if (max_bitdepth + 1 > level_max_bitdepth) {
+    // force no group RCTs if we don't have a spare bit
+    cparams_.colorspace = 0;
+  }
+  JXL_ASSERT(max_bitdepth <= level_max_bitdepth);
+
+  std::vector<uint32_t> quants;  // per-channel step; stays empty if lossless
+
+  if (!cparams_.ModularPartIsLossless()) {
+    quants.resize(gi.channel.size(), 1);
+    float quantizer = 0.25f;
+    if (!cparams_.responsive) {
+      JXL_DEBUG_V(1,
+                  "Warning: lossy compression without Squeeze "
+                  "transform is just color quantization.");
+      quantizer *= 0.1f;
+    }
+    float bitdepth_correction = 1.f;
+    if (cparams_.color_transform != ColorTransform::kXYB) {
+      bitdepth_correction = maxval / 255.f;
+    }
+    std::vector<float> quantizers;  // indices 0..2: color; 3+: extra channels
+    float dist = cparams_.butteraugli_distance;
+    for (size_t i = 0; i < 3; i++) {
+      quantizers.push_back(quantizer * dist * bitdepth_correction);
+    }
+    for (size_t i = 0; i < extra_channels.size(); i++) {
+      int ec_bitdepth =
+          metadata.extra_channel_info[i].bit_depth.bits_per_sample;
+      pixel_type ec_maxval = ec_bitdepth < 32 ? (1u << ec_bitdepth) - 1 : 0;
+      bitdepth_correction = ec_maxval / 255.f;
+      if (i < cparams_.ec_distance.size()) dist = cparams_.ec_distance[i];
+      if (dist < 0) dist = cparams_.butteraugli_distance;
+      quantizers.push_back(quantizer * dist * bitdepth_correction);
+    }
+    if (cparams_.options.nb_repeats == 0) {
+      return JXL_FAILURE("nb_repeats = 0 not supported with modular lossy!");
+    }
+    for (uint32_t i = gi.nb_meta_channels; i < gi.channel.size(); i++) {
+      Channel& ch = gi.channel[i];
+      int shift = ch.hshift + ch.vshift;  // number of pixel halvings
+      if (shift > 16) shift = 16;
+      if (shift > 0) shift--;
+      int q;
+      // assuming default Squeeze here
+      int component =
+          (do_color ? 0 : 3) + ((i - gi.nb_meta_channels) % nb_chans);
+      // last 4 channels are final chroma residuals
+      if (nb_chans > 2 && i >= gi.channel.size() - 4 && cparams_.responsive) {
+        component = 1;
+      }
+      if (cparams_.color_transform == ColorTransform::kXYB && component < 3) {
+        q = quantizers[component] * squeeze_quality_factor_xyb *
+            squeeze_xyb_qtable[component][shift];
+      } else {
+        if (cparams_.colorspace != 0 && component > 0 && component < 3) {
+          q = quantizers[component] * squeeze_quality_factor *
+              squeeze_chroma_qtable[shift];
+        } else {
+          q = quantizers[component] * squeeze_quality_factor *
+              squeeze_luma_factor * squeeze_luma_qtable[shift];
+        }
+      }
+      if (q < 1) q = 1;
+      QuantizeChannel(gi.channel[i], q);
+      quants[i] = q;
+    }
+  }
+
+  // Fill other groups.
+  struct GroupParams {
+    Rect rect;
+    int minShift;
+    int maxShift;
+    ModularStreamId id;
+  };
+  std::vector<GroupParams> stream_params;
+
+  stream_options_[0] = cparams_.options;
+
+  // DC
+  for (size_t group_id = 0; group_id < frame_dim_.num_dc_groups; group_id++) {
+    const size_t gx = group_id % frame_dim_.xsize_dc_groups;
+    const size_t gy = group_id / frame_dim_.xsize_dc_groups;
+    const Rect rect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim,
+                    frame_dim_.dc_group_dim, frame_dim_.dc_group_dim);
+    // minShift==3 because (frame_dim.dc_group_dim >> 3) == frame_dim.group_dim
+    // maxShift==1000 is infinity
+    stream_params.push_back(
+        GroupParams{rect, 3, 1000, ModularStreamId::ModularDC(group_id)});
+  }
+  // AC global -> nothing.
+  // AC
+  for (size_t group_id = 0; group_id < frame_dim_.num_groups; group_id++) {
+    const size_t gx = group_id % frame_dim_.xsize_groups;
+    const size_t gy = group_id / frame_dim_.xsize_groups;
+    const Rect mrect(gx * frame_dim_.group_dim, gy * frame_dim_.group_dim,
+                     frame_dim_.group_dim, frame_dim_.group_dim);
+    for (size_t i = 0; i < enc_state->progressive_splitter.GetNumPasses();
+         i++) {
+      int maxShift, minShift;
+      frame_header.passes.GetDownsamplingBracket(i, minShift, maxShift);
+      stream_params.push_back(GroupParams{
+          mrect, minShift, maxShift, ModularStreamId::ModularAC(group_id, i)});
+    }
+  }
+  // if there's only one group, everything ends up in GlobalModular
+  // in that case, also try RCTs/WP params for the one group
+  if (stream_params.size() == 2) {
+    stream_params.push_back(GroupParams{Rect(0, 0, xsize, ysize), 0, 1000,
+                                        ModularStreamId::Global()});
+  }
+  gi_channel_.resize(stream_images_.size());
+
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, stream_params.size(), ThreadPool::NoInit,
+      [&](const uint32_t i, size_t /* thread */) {
+        stream_options_[stream_params[i].id.ID(frame_dim_)] = cparams_.options;
+        JXL_CHECK(PrepareStreamParams(
+            stream_params[i].rect, cparams_, stream_params[i].minShift,
+            stream_params[i].maxShift, stream_params[i].id, do_color));
+        // NOTE(review): the result goes to JXL_CHECK, not back to the caller.
+      },
+      "ChooseParams"));
+  {
+    // Clear out channels that have been copied to groups.
+    Image& full_image = stream_images_[0];
+    size_t c = full_image.nb_meta_channels;
+    for (; c < full_image.channel.size(); c++) {
+      Channel& fc = full_image.channel[c];
+      if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break;
+    }
+    for (; c < full_image.channel.size(); c++) {
+      full_image.channel[c].plane = ImageI();
+    }
+  }
+  if (!quants.empty()) {
+    for (uint32_t stream_id = 0; stream_id < stream_images_.size();
+         stream_id++) {
+      // skip non-modular stream_ids
+      if (stream_id > 0 && gi_channel_[stream_id].empty()) continue;
+      const Image& image = stream_images_[stream_id];
+      const ModularOptions& options = stream_options_[stream_id];
+      for (uint32_t i = image.nb_meta_channels; i < image.channel.size(); i++) {
+        if (i >= image.nb_meta_channels &&  // always true given the loop start
+            (image.channel[i].w > options.max_chan_size ||
+             image.channel[i].h > options.max_chan_size)) {
+          continue;
+        }
+        if (stream_id > 0 && gi_channel_[stream_id].empty()) continue;  // dup
+        size_t ch_id = stream_id == 0
+                           ? i
+                           : gi_channel_[stream_id][i - image.nb_meta_channels];
+        uint32_t q = quants[ch_id];
+        // Inform the tree splitting heuristics that each channel in each group
+        // used this quantization factor. This will produce a tree with the
+        // given multipliers.
+        if (multiplier_info_.empty() ||
+            multiplier_info_.back().range[1][0] != stream_id ||
+            multiplier_info_.back().multiplier != q) {
+          StaticPropRange range;
+          range[0] = {{i, i + 1}};
+          range[1] = {{stream_id, stream_id + 1}};
+          multiplier_info_.push_back({range, (uint32_t)q});
+        } else {
+          // Previous channel in the same group had the same quantization
+          // factor. Don't provide two different ranges, as that creates
+          // unnecessary nodes.
+          multiplier_info_.back().range[0][1] = i + 1;
+        }
+      }
+    }
+    // Merge group+channel settings that have the same channels and quantization
+    // factors, to avoid unnecessary nodes.
+    std::sort(multiplier_info_.begin(), multiplier_info_.end(),
+              [](ModularMultiplierInfo a, ModularMultiplierInfo b) {
+                return std::make_tuple(a.range, a.multiplier) <
+                       std::make_tuple(b.range, b.multiplier);
+              });
+    size_t new_num = 1;  // NOTE(review): an empty vector would grow to 1 below
+    for (size_t i = 1; i < multiplier_info_.size(); i++) {
+      ModularMultiplierInfo& prev = multiplier_info_[new_num - 1];
+      ModularMultiplierInfo& cur = multiplier_info_[i];
+      if (prev.range[0] == cur.range[0] && prev.multiplier == cur.multiplier &&
+          prev.range[1][1] == cur.range[1][0]) {
+        prev.range[1][1] = cur.range[1][1];
+      } else {
+        multiplier_info_[new_num++] = multiplier_info_[i];
+      }
+    }
+    multiplier_info_.resize(new_num);
+  }
+
+  JXL_RETURN_IF_ERROR(ValidateChannelDimensions(gi, stream_options_[0]));
+
+  return PrepareEncoding(frame_header, pool, enc_state->heuristics.get(),
+                         aux_out);
+}
+
+Status ModularFrameEncoder::PrepareEncoding(const FrameHeader& frame_header,  // Chooses or learns tree_, tokenizes it, then tokenizes every stream image against it. frame_header is only read for dc_level when naming the debug tree dump.
+                                            ThreadPool* pool,  // Forwarded to RunOnPool for the per-chunk and per-stream tasks.
+                                            EncoderHeuristics* heuristics,  // Asked first whether it supplies a custom fixed tree.
+                                            AuxOut* aux_out) {  // May be null: the token pass checks it before copying fields from it.
+  if (!tree_.empty()) return true;  // A tree is already set: skip everything below.
+
+  // Compute tree.
+  size_t num_streams = stream_images_.size();
+  stream_headers_.resize(num_streams);
+  tokens_.resize(num_streams);
+
+  if (heuristics->CustomFixedTreeLossless(frame_dim_, &tree_)) {
+    // Using a fixed tree.
+  } else if (cparams_.speed_tier < SpeedTier::kFalcon ||  // One tree per chunk of streams (learned or predefined, depending on the chunk's tree_kind), merged afterwards.
+             !cparams_.modular_mode) {
+    // Avoid creating a tree with leaves that don't correspond to any pixels.
+    std::vector<size_t> useful_splits;
+    useful_splits.reserve(tree_splits_.size());
+    for (size_t chunk = 0; chunk < tree_splits_.size() - 1; chunk++) {  // NOTE(review): size() - 1 wraps around if tree_splits_ is empty; assumes it never is - confirm.
+      bool has_pixels = false;
+      size_t start = tree_splits_[chunk];
+      size_t stop = tree_splits_[chunk + 1];
+      for (size_t i = start; i < stop; i++) {
+        if (!stream_images_[i].empty()) has_pixels = true;
+      }
+      if (has_pixels) {
+        useful_splits.push_back(tree_splits_[chunk]);  // Only the start index of a chunk with at least one non-empty image is kept.
+      }
+    }
+    // Don't do anything if modular mode does not have any pixels in this image
+    if (useful_splits.empty()) return true;
+    useful_splits.push_back(tree_splits_.back());  // Terminating boundary: useful_splits[k]..useful_splits[k + 1] delimits chunk k.
+
+    std::atomic_flag invalid_force_wp = ATOMIC_FLAG_INIT;  // Raised by a task when SetPredictor/SetProperties fails; inspected after the pool run.
+
+    std::vector<Tree> trees(useful_splits.size() - 1);  // One output slot per chunk; task `chunk` writes only trees[chunk].
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, useful_splits.size() - 1, ThreadPool::NoInit,
+        [&](const uint32_t chunk, size_t /* thread */) {
+          // TODO(veluca): parallelize more.
+          size_t total_pixels = 0;
+          uint32_t start = useful_splits[chunk];
+          uint32_t stop = useful_splits[chunk + 1];
+          while (start < stop && stream_images_[start].empty()) ++start;  // Skip leading empty images of the chunk ...
+          while (start < stop && stream_images_[stop - 1].empty()) --stop;  // ... and trailing ones.
+          uint32_t max_c = 0;
+          if (stream_options_[start].tree_kind !=  // The options of the first non-empty stream decide the tree kind for the whole chunk.
+              ModularOptions::TreeKind::kLearn) {
+            for (size_t i = start; i < stop; i++) {
+              for (const Channel& ch : stream_images_[i].channel) {
+                total_pixels += ch.w * ch.h;
+              }
+            }
+            trees[chunk] =
+                PredefinedTree(stream_options_[start].tree_kind, total_pixels);
+            return;
+          }
+          TreeSamples tree_samples;
+          if (!tree_samples.SetPredictor(stream_options_[start].predictor,
+                                         stream_options_[start].wp_tree_mode)) {
+            invalid_force_wp.test_and_set(std::memory_order_acq_rel);
+            return;
+          }
+          if (!tree_samples.SetProperties(
+                  stream_options_[start].splitting_heuristics_properties,
+                  stream_options_[start].wp_tree_mode)) {
+            invalid_force_wp.test_and_set(std::memory_order_acq_rel);
+            return;
+          }
+          std::vector<pixel_type> pixel_samples;
+          std::vector<pixel_type> diff_samples;
+          std::vector<uint32_t> group_pixel_count;
+          std::vector<uint32_t> channel_pixel_count;
+          for (size_t i = start; i < stop; i++) {
+            max_c = std::max<uint32_t>(stream_images_[i].channel.size(), max_c);  // Largest channel count among the chunk's images.
+            CollectPixelSamples(stream_images_[i], stream_options_[i], i,
+                                group_pixel_count, channel_pixel_count,
+                                pixel_samples, diff_samples);
+          }
+          StaticPropRange range;
+          range[0] = {{0, max_c}};  // Range over channel indices seen in this chunk.
+          range[1] = {{start, stop}};  // Range over the stream indices of this chunk.
+          auto local_multiplier_info = multiplier_info_;  // Task-local copy handed to PreQuantizeProperties and LearnTree.
+
+          tree_samples.PreQuantizeProperties(
+              range, local_multiplier_info, group_pixel_count,
+              channel_pixel_count, pixel_samples, diff_samples,
+              stream_options_[start].max_property_values);
+          for (size_t i = start; i < stop; i++) {
+            JXL_CHECK(ModularGenericCompress(  // No writer; receives tree_samples and total_pixels (presumably a sample-collection pass - confirm in ModularGenericCompress).
+                stream_images_[i], stream_options_[i], /*writer=*/nullptr,
+                /*aux_out=*/nullptr, 0, i, &tree_samples, &total_pixels));
+          }
+
+          // TODO(veluca): parallelize more.
+          trees[chunk] =
+              LearnTree(std::move(tree_samples), total_pixels,
+                        stream_options_[start], local_multiplier_info, range);
+        },
+        "LearnTrees"));
+    if (invalid_force_wp.test_and_set(std::memory_order_acq_rel)) {  // test_and_set returns the prior state: true only if some task raised the flag.
+      return JXL_FAILURE("PrepareEncoding: force_no_wp with {Weighted}");
+    }
+    tree_.clear();
+    MergeTrees(trees, useful_splits, 0, useful_splits.size() - 1, &tree_);  // Combines the per-chunk trees into the single tree_.
+  } else {  // Reached only when speed_tier >= kFalcon and modular_mode is set (and no custom tree was supplied).
+    // Fixed tree.
+    size_t total_pixels = 0;
+    for (const Image& img : stream_images_) {
+      for (const Channel& ch : img.channel) {
+        total_pixels += ch.w * ch.h;
+      }
+    }
+    if (cparams_.speed_tier <= SpeedTier::kFalcon) {  // Combined with the branch condition above this means speed_tier == kFalcon.
+      tree_ =
+          PredefinedTree(ModularOptions::TreeKind::kWPFixedDC, total_pixels);
+    } else if (cparams_.speed_tier <= SpeedTier::kThunder) {
+      tree_ = PredefinedTree(ModularOptions::TreeKind::kGradientFixedDC,
+                             total_pixels);
+    } else {
+      tree_ = {PropertyDecisionNode::Leaf(Predictor::Gradient)};  // Single-leaf tree: Gradient predictor everywhere.
+    }
+  }
+  tree_tokens_.resize(1);
+  tree_tokens_[0].clear();
+  Tree decoded_tree;
+  TokenizeTree(tree_, &tree_tokens_[0], &decoded_tree);  // Produces the tree's tokens and a second tree object of the same size.
+  JXL_ASSERT(tree_.size() == decoded_tree.size());
+  tree_ = std::move(decoded_tree);  // From here on tree_ is the tree TokenizeTree returned, not the one built above.
+
+  if (kPrintTree && WantDebugOutput(aux_out)) {
+    if (frame_header.dc_level > 0) {
+      PrintTree(tree_, aux_out->debug_prefix + "/dc_frame_level" +
+                           std::to_string(frame_header.dc_level) + "_tree");
+    } else {
+      PrintTree(tree_, aux_out->debug_prefix + "/global_tree");
+    }
+  }
+
+  image_widths_.resize(num_streams);
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, num_streams, ThreadPool::NoInit,
+      [&](const uint32_t stream_id, size_t /* thread */) {
+        AuxOut my_aux_out;  // Task-local AuxOut; nothing from it is copied back into aux_out in this function.
+        if (aux_out) {
+          my_aux_out.dump_image = aux_out->dump_image;
+          my_aux_out.debug_prefix = aux_out->debug_prefix;
+        }
+        tokens_[stream_id].clear();
+        JXL_CHECK(ModularGenericCompress(  // No writer: output goes to the header, tokens and widths slots of this stream.
+            stream_images_[stream_id], stream_options_[stream_id],
+            /*writer=*/nullptr, &my_aux_out, 0, stream_id,
+            /*tree_samples=*/nullptr,
+            /*total_pixels=*/nullptr,
+            /*tree=*/&tree_, /*header=*/&stream_headers_[stream_id],
+            /*tokens=*/&tokens_[stream_id],
+            /*widths=*/&image_widths_[stream_id]));
+      },
+      "ComputeTokens"));
+  return true;
+}
+
+Status ModularFrameEncoder::EncodeGlobalInfo(BitWriter* writer,  // Writes a 1-bit flag, then (when the flag is 1) the tree tokens and the histograms built from all stream tokens.
+                                             AuxOut* aux_out) {
+  BitWriter::Allotment allotment(writer, 1);  // Allotment for the single flag bit written below.
+  // If we are using brotli, or not using modular mode.
+  if (tree_tokens_.empty() || tree_tokens_[0].empty()) {
+    writer->Write(1, 0);  // Flag 0: no tree tokens to write, nothing else is emitted.
+    allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
+    return true;
+  }
+  writer->Write(1, 1);  // Flag 1: tree and histograms follow.
+  allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
+
+  // Write tree
+  HistogramParams params;  // Fields not assigned below keep whatever HistogramParams defaults to.
+  if (cparams_.speed_tier > SpeedTier::kKitten) {
+    params.clustering = HistogramParams::ClusteringType::kFast;
+    params.ans_histogram_strategy =
+        cparams_.speed_tier > SpeedTier::kThunder
+            ? HistogramParams::ANSHistogramStrategy::kFast
+            : HistogramParams::ANSHistogramStrategy::kApproximate;
+    params.lz77_method =
+        cparams_.decoding_speed_tier >= 3 && cparams_.modular_mode
+            ? (cparams_.speed_tier >= SpeedTier::kFalcon
+                   ? HistogramParams::LZ77Method::kRLE
+                   : HistogramParams::LZ77Method::kLZ77)
+            : HistogramParams::LZ77Method::kNone;
+    // Near-lossless DC, as well as modular mode, require choosing hybrid uint
+    // more carefully.
+    if ((!extra_dc_precision.empty() && extra_dc_precision[0] != 0) ||  // Only the first entry of extra_dc_precision is consulted here.
+        (cparams_.modular_mode && cparams_.speed_tier < SpeedTier::kCheetah)) {
+      params.uint_method = HistogramParams::HybridUintMethod::kFast;
+    } else {
+      params.uint_method = HistogramParams::HybridUintMethod::kNone;
+    }
+  } else if (cparams_.speed_tier <= SpeedTier::kTortoise) {
+    params.lz77_method = HistogramParams::LZ77Method::kOptimal;
+  } else {
+    params.lz77_method = HistogramParams::LZ77Method::kLZ77;
+  }
+  if (cparams_.decoding_speed_tier >= 1) {
+    params.max_histograms = 12;
+  }
+  if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive) {  // Overrides the lz77_method chosen above.
+    params.lz77_method = cparams_.speed_tier >= SpeedTier::kCheetah
+                             ? HistogramParams::LZ77Method::kRLE
+                         : cparams_.speed_tier >= SpeedTier::kKitten
+                             ? HistogramParams::LZ77Method::kLZ77
+                             : HistogramParams::LZ77Method::kOptimal;
+  }
+  if (cparams_.decoding_speed_tier >= 2 && cparams_.responsive) {  // Overrides the uint_method chosen above and forces Huffman.
+    params.uint_method = HistogramParams::HybridUintMethod::k000;
+    params.force_huffman = true;
+  }
+  BuildAndEncodeHistograms(params, kNumTreeContexts, tree_tokens_, &code_,  // First use of code_/context_map_: for the tree tokens written right below.
+                           &context_map_, writer, kLayerModularTree, aux_out);
+  WriteTokens(tree_tokens_[0], code_, context_map_, writer, kLayerModularTree,
+              aux_out);
+  params.image_widths = image_widths_;  // Set only for the second histogram pass.
+  // Write histograms.
+  BuildAndEncodeHistograms(params, (tree_.size() + 1) / 2, tokens_, &code_,  // Second use: the same code_/context_map_ members are passed again, now for the stream tokens that EncodeStream writes.
+                           &context_map_, writer, kLayerModularGlobal, aux_out);
+  return true;
+}
+
+Status ModularFrameEncoder::EncodeStream(BitWriter* writer, AuxOut* aux_out,  // Writes one stream: its header bundle followed by its tokens.
+                                         size_t layer,  // Passed to both Bundle::Write and WriteTokens.
+                                         const ModularStreamId& stream) {
+  size_t stream_id = stream.ID(frame_dim_);  // Index into stream_images_ / stream_headers_ / tokens_.
+  if (stream_images_[stream_id].channel.empty()) {
+    return true;  // Image with no channels, header never gets decoded.
+  }
+  JXL_RETURN_IF_ERROR(
+      Bundle::Write(stream_headers_[stream_id], writer, layer, aux_out));
+  WriteTokens(tokens_[stream_id], code_, context_map_, writer, layer, aux_out);  // Uses the code_/context_map_ members, so they must already have been filled in.
+  return true;
+}
+
+namespace {
+float EstimateWPCost(const Image& img, size_t i) {  // Cost estimate for img under weighted-predictor mode i: per-context histogram entropies plus the count of raw extra bits.
+  size_t extra_bits = 0;
+  float histo_cost = 0;
+  HybridUintConfig config;  // Default-constructed config used to split each residual into token + extra bits.
+  int32_t cutoffs[] = {-500, -392, -255, -191, -127, -95, -63, -47, -31,
+                       -23,  -15,  -11,  -7,   -4,   -3,  -1,  0,   1,
+                       3,    5,    7,    11,   15,   23,  31,  47,  63,
+                       95,   127,  191,  255,  392,  500};
+  constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;  // One more context than there are cutoffs (ctx ranges over 0..number of cutoffs).
+  Histogram histo[nc] = {};
+  weighted::Header wp_header;
+  PredictorMode(i, &wp_header);  // Fills wp_header for mode i.
+  for (const Channel& ch : img.channel) {
+    const intptr_t onerow = ch.plane.PixelsPerRow();  // Row stride in pixels, used to reach the rows above.
+    weighted::State wp_state(wp_header, ch.w, ch.h);  // Fresh predictor state per channel.
+    Properties properties(1);
+    for (size_t y = 0; y < ch.h; y++) {
+      const pixel_type* JXL_RESTRICT r = ch.Row(y);
+      for (size_t x = 0; x < ch.w; x++) {
+        size_t offset = 0;
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);  // First column falls back to the pixel above, or 0 at the origin.
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        pixel_type_w topright =
+            (x + 1 < ch.w && y ? *(r + x + 1 - onerow) : top);
+        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        pixel_type guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, ch.w, top, left, topright, topleft, toptop, &properties,
+            offset);
+        size_t ctx = 0;
+        for (int c : cutoffs) {
+          ctx += c >= properties[0];  // ctx = number of cutoffs that are >= properties[0].
+        }
+        pixel_type res = r[x] - guess;
+        uint32_t token, nbits, bits;
+        config.Encode(PackSigned(res), &token, &nbits, &bits);
+        histo[ctx].Add(token);
+        extra_bits += nbits;  // Only the bit count matters here; `bits` itself is not used.
+        wp_state.UpdateErrors(r[x], x, y, ch.w);  // Feed the true pixel value back into the predictor state.
+      }
+    }
+    for (size_t h = 0; h < nc; h++) {  // Histograms are accumulated and cleared per channel; extra_bits keeps accumulating.
+      histo_cost += histo[h].ShannonEntropy();
+      histo[h].Clear();
+    }
+  }
+  return histo_cost + extra_bits;
+}
+
+float EstimateCost(const Image& img) {  // Cost estimate for img using the ClampedGradient predictor: per-context histogram entropies plus the count of raw extra bits.
+  // TODO(veluca): consider SIMDfication of this code.
+  size_t extra_bits = 0;
+  float histo_cost = 0;
+  HybridUintConfig config;  // Default-constructed config used to split each residual into token + extra bits.
+  uint32_t cutoffs[] = {0,  1,  3,  5,   7,   11,  15,  23, 31,
+                        47, 63, 95, 127, 191, 255, 392, 500};
+  constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;  // One more context than there are cutoffs.
+  Histogram histo[nc] = {};
+  for (const Channel& ch : img.channel) {
+    const intptr_t onerow = ch.plane.PixelsPerRow();  // Row stride in pixels, used to reach the row above.
+    for (size_t y = 0; y < ch.h; y++) {
+      const pixel_type* JXL_RESTRICT r = ch.Row(y);
+      for (size_t x = 0; x < ch.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);  // First column falls back to the pixel above, or 0 at the origin.
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        size_t maxdiff = std::max(std::max(left, top), topleft) -  // Spread (max - min) of the three neighbours.
+                         std::min(std::min(left, top), topleft);
+        size_t ctx = 0;
+        for (uint32_t c : cutoffs) {
+          ctx += c > maxdiff;  // ctx = number of cutoffs strictly greater than maxdiff.
+        }
+        pixel_type res = r[x] - ClampedGradient(top, left, topleft);
+        uint32_t token, nbits, bits;
+        config.Encode(PackSigned(res), &token, &nbits, &bits);
+        histo[ctx].Add(token);
+        extra_bits += nbits;  // Only the bit count matters here; `bits` itself is not used.
+      }
+    }
+    for (size_t h = 0; h < nc; h++) {  // Histograms are accumulated and cleared per channel; extra_bits keeps accumulating.
+      histo_cost += histo[h].ShannonEntropy();
+      histo[h].Clear();
+    }
+  }
+  return histo_cost + extra_bits;
+}
+
+}  // namespace
+
+Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect,  // Prepares one stream: for stream_id > 0 copies `rect` out of stream_images_[0] and tries local palettes; then optionally picks an RCT and a wp_mode by estimated cost.
+                                                const CompressParams& cparams_,  // NOTE: this parameter has the same name as the cparams_ member used by other methods, so it hides the member inside this function.
+                                                int minShift, int maxShift,  // Inclusive bounds on min(hshift, vshift) for channels copied into the group image.
+                                                const ModularStreamId& stream,
+                                                bool do_color) {  // One of the conditions for running the RCT search below.
+  size_t stream_id = stream.ID(frame_dim_);
+  Image& full_image = stream_images_[0];
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  Image& gi = stream_images_[stream_id];  // For stream_id == 0 this aliases full_image.
+  if (stream_id > 0) {
+    gi = Image(xsize, ysize, full_image.bitdepth, 0);  // Starts without channels; they are appended below.
+    // start at the first bigger-than-frame_dim.group_dim non-metachannel
+    size_t c = full_image.nb_meta_channels;
+    for (; c < full_image.channel.size(); c++) {
+      Channel& fc = full_image.channel[c];
+      if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break;
+    }
+    for (; c < full_image.channel.size(); c++) {  // Continues from the channel found by the loop above.
+      Channel& fc = full_image.channel[c];
+      int shift = std::min(fc.hshift, fc.vshift);
+      if (shift > maxShift) continue;
+      if (shift < minShift) continue;
+      Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,  // rect scaled down by the channel's shifts; fc.w/fc.h are passed as the enclosing size.
+             rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
+      if (r.xsize() == 0 || r.ysize() == 0) continue;
+      gi_channel_[stream_id].push_back(c);  // Remember which full-image channel this group channel came from.
+      Channel gc(r.xsize(), r.ysize());
+      gc.hshift = fc.hshift;
+      gc.vshift = fc.vshift;
+      for (size_t y = 0; y < r.ysize(); ++y) {
+        memcpy(gc.Row(y), r.ConstRow(fc.plane, y),
+               r.xsize() * sizeof(pixel_type));
+      }
+      gi.channel.emplace_back(std::move(gc));
+    }
+
+    if (gi.channel.empty()) return true;  // Nothing was copied: no transforms or cost estimates to run.
+    // Do some per-group transforms
+
+    // Local palette
+    // TODO(veluca): make this work with quantize-after-prediction in lossy
+    // mode.
+    if (cparams_.butteraugli_distance == 0.f && cparams_.palette_colors != 0 &&
+        cparams_.speed_tier < SpeedTier::kCheetah) {
+      // all-channel palette (e.g. RGBA)
+      if (gi.channel.size() - gi.nb_meta_channels > 1) {
+        Transform maybe_palette(TransformId::kPalette);
+        maybe_palette.begin_c = gi.nb_meta_channels;
+        maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
+        maybe_palette.nb_colors = std::abs(cparams_.palette_colors);  // The sign of palette_colors only selects ordered_palette; its magnitude is the colour limit.
+        maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
+        do_transform(gi, maybe_palette, weighted::Header());  // Result ignored: the palette is only attempted.
+      }
+      // all-minus-one-channel palette (RGB with separate alpha, or CMY with
+      // separate K)
+      if (gi.channel.size() - gi.nb_meta_channels > 3) {  // Re-evaluated after the attempt above, which may have changed gi.
+        Transform maybe_palette_3(TransformId::kPalette);
+        maybe_palette_3.begin_c = gi.nb_meta_channels;
+        maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
+        maybe_palette_3.nb_colors = std::abs(cparams_.palette_colors);
+        maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
+        maybe_palette_3.lossy_palette = cparams_.lossy_palette;
+        if (maybe_palette_3.lossy_palette) {
+          maybe_palette_3.predictor = Predictor::Weighted;
+        }
+        do_transform(gi, maybe_palette_3, weighted::Header());  // Result ignored: the palette is only attempted.
+      }
+    }
+
+    // Local channel palette
+    if (cparams_.channel_colors_percent > 0 &&
+        cparams_.butteraugli_distance == 0.f && !cparams_.lossy_palette &&
+        cparams_.speed_tier < SpeedTier::kCheetah &&
+        !(cparams_.responsive && cparams_.decoding_speed_tier >= 1)) {
+      // single channel palette (like FLIF's ChannelCompact)
+      size_t nb_channels = gi.channel.size() - gi.nb_meta_channels;  // Fixed before the loop, even if a transform below changes gi.
+      for (size_t i = 0; i < nb_channels; i++) {
+        int32_t min, max;
+        compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
+        int64_t colors = (int64_t)max - min + 1;  // Widened to 64 bits before subtracting.
+        JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
+        Transform maybe_palette_1(TransformId::kPalette);
+        maybe_palette_1.begin_c = i + gi.nb_meta_channels;
+        maybe_palette_1.num_c = 1;
+        // simple heuristic: if less than X percent of the values in the range
+        // actually occur, it is probably worth it to do a compaction
+        // (but only if the channel palette is less than 80% the size of the
+        // image itself)
+        maybe_palette_1.nb_colors =
+            std::min((int)(xsize * ysize * 0.8),  // Cap: 80% of the rect's pixel count.
+                     (int)(cparams_.channel_colors_percent / 100. * colors));
+        do_transform(gi, maybe_palette_1, weighted::Header());  // Result ignored: the palette is only attempted.
+      }
+    }
+  }
+
+  // lossless and no specific color transform specified: try Nothing, YCoCg,
+  // and 17 RCTs
+  if (cparams_.color_transform == ColorTransform::kNone &&
+      cparams_.IsLossless() && cparams_.colorspace < 0 &&
+      gi.channel.size() - gi.nb_meta_channels >= 3 &&
+      cparams_.responsive == false && do_color &&
+      cparams_.speed_tier <= SpeedTier::kHare) {
+    Transform sg(TransformId::kRCT);
+    sg.begin_c = gi.nb_meta_channels;
+    size_t nb_rcts_to_try = 0;
+    switch (cparams_.speed_tier) {  // NOTE(review): the enclosing condition already requires speed_tier <= kHare, so some cases may be unreachable depending on the enum order - confirm.
+      case SpeedTier::kLightning:
+      case SpeedTier::kThunder:
+      case SpeedTier::kFalcon:
+      case SpeedTier::kCheetah:
+        nb_rcts_to_try = 0;  // Just do global YCoCg
+        break;
+      case SpeedTier::kHare:
+        nb_rcts_to_try = 4;
+        break;
+      case SpeedTier::kWombat:
+        nb_rcts_to_try = 5;
+        break;
+      case SpeedTier::kSquirrel:
+        nb_rcts_to_try = 7;
+        break;
+      case SpeedTier::kKitten:
+        nb_rcts_to_try = 9;
+        break;
+      case SpeedTier::kGlacier:
+      case SpeedTier::kTortoise:
+        nb_rcts_to_try = 19;
+        break;
+    }
+    float best_cost = std::numeric_limits<float>::max();
+    size_t best_rct = 0;
+    // These should be 19 actually different transforms; the remaining ones
+    // are equivalent to one of these (note that the first two are do-nothing
+    // and YCoCg) modulo channel reordering (which only matters in the case of
+    // MA-with-prev-channels-properties) and/or sign (e.g. RmG vs GmR)
+    for (int i : {0 * 7 + 0, 0 * 7 + 6, 0 * 7 + 5, 1 * 7 + 3, 3 * 7 + 5,
+                  5 * 7 + 5, 1 * 7 + 5, 2 * 7 + 5, 1 * 7 + 1, 0 * 7 + 4,
+                  1 * 7 + 2, 2 * 7 + 1, 2 * 7 + 2, 2 * 7 + 3, 4 * 7 + 4,
+                  4 * 7 + 5, 0 * 7 + 2, 0 * 7 + 1, 0 * 7 + 3}) {
+      if (nb_rcts_to_try == 0) break;  // Only the first nb_rcts_to_try candidates of the list are evaluated.
+      sg.rct_type = i;
+      nb_rcts_to_try--;
+      if (do_transform(gi, sg, weighted::Header())) {
+        float cost = EstimateCost(gi);  // Cost of the image with this candidate applied.
+        if (cost < best_cost) {
+          best_rct = i;
+          best_cost = cost;
+        }
+        Transform t = gi.transform.back();  // Copy of the transform just applied, used to undo it.
+        JXL_RETURN_IF_ERROR(t.Inverse(gi, weighted::Header(), nullptr));
+        gi.transform.pop_back();  // gi is back to its pre-trial state for the next candidate.
+      }
+    }
+    // Apply the best RCT to the image for future encoding.
+    sg.rct_type = best_rct;  // Stays 0 when no candidate was tried or none succeeded.
+    do_transform(gi, sg, weighted::Header());  // Result ignored.
+  } else {
+    // No need to try anything, just use the default options.
+  }
+  size_t nb_wp_modes = 1;
+  if (cparams_.speed_tier <= SpeedTier::kTortoise) {
+    nb_wp_modes = 5;
+  } else if (cparams_.speed_tier <= SpeedTier::kKitten) {
+    nb_wp_modes = 2;
+  }
+  if (nb_wp_modes > 1 &&  // Search only when more than one mode is allowed and the stream's predictor is Weighted, Best or Variable.
+      (stream_options_[stream_id].predictor == Predictor::Weighted ||
+       stream_options_[stream_id].predictor == Predictor::Best ||
+       stream_options_[stream_id].predictor == Predictor::Variable)) {
+    float best_cost = std::numeric_limits<float>::max();
+    stream_options_[stream_id].wp_mode = 0;
+    for (size_t i = 0; i < nb_wp_modes; i++) {
+      float cost = EstimateWPCost(gi, i);
+      if (cost < best_cost) {  // Strict comparison: on ties the lowest mode index wins.
+        best_cost = cost;
+        stream_options_[stream_id].wp_mode = i;
+      }
+    }
+  }
+  return true;
+}
+
+constexpr float q_deadzone = 0.62f;  // Scaled residuals strictly inside (-q_deadzone, q_deadzone) are snapped to 0 by the two Quantize* helpers below.
+int QuantizeWP(const int32_t* qrow, size_t onerow, size_t c, size_t x, size_t y,  // Quantizes `value` relative to the Weighted prediction at (x, y); returns prediction + quantized residual. `c` is not used in the body.
+               size_t w, weighted::State* wp_state, float value,
+               float inv_factor) {
+  float svalue = value * inv_factor;  // Value in quantized units.
+  PredictionResult pred =
+      PredictNoTreeWP(w, qrow + x, onerow, x, y, Predictor::Weighted, wp_state);  // qrow + x points at the current pixel; onerow is the row stride.
+  svalue -= pred.guess;  // Residual against the prediction.
+  if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0;
+  int residual = roundf(svalue);
+  if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2;  // Residuals beyond +-2 are re-rounded to a multiple of 2.
+  return residual + pred.guess;
+}
+
+int QuantizeGradient(const int32_t* qrow, size_t onerow, size_t c, size_t x,  // Same scheme as QuantizeWP but against the Gradient prediction; `c` is not used in the body.
+                     size_t y, size_t w, float value, float inv_factor) {
+  float svalue = value * inv_factor;  // Value in quantized units.
+  PredictionResult pred =
+      PredictNoTreeNoWP(w, qrow + x, onerow, x, y, Predictor::Gradient);  // qrow + x points at the current pixel; onerow is the row stride.
+  svalue -= pred.guess;  // Residual against the prediction.
+  if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0;  // Dead zone: small residuals become 0.
+  int residual = roundf(svalue);
+  if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2;  // Residuals beyond +-2 are re-rounded to a multiple of 2.
+  return residual + pred.guess;
+}
+
+void ModularFrameEncoder::AddVarDCTDC(const Image3F& dc, size_t group_index,  // Quantizes the DC of one group into a 3-channel stream image, sets that stream's options, then calls DequantDC on the result.
+                                      bool nl_dc, PassesEncoderState* enc_state,  // nl_dc selects the prediction-aware quantizers (QuantizeGradient / QuantizeWP) and one extra bit of precision.
+                                      bool jpeg_transcode) {  // Not referenced in this body.
+  const Rect r = enc_state->shared.DCGroupRect(group_index);
+  extra_dc_precision[group_index] = nl_dc ? 1 : 0;
+  float mul = 1 << extra_dc_precision[group_index];  // 2 when nl_dc, otherwise 1.
+
+  size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim_);
+  stream_options_[stream_id].max_chan_size = 0xFFFFFF;
+  stream_options_[stream_id].predictor = Predictor::Weighted;
+  stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kWPOnly;
+  if (cparams_.speed_tier >= SpeedTier::kSquirrel) {
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kWPFixedDC;
+  }
+  if (cparams_.speed_tier < SpeedTier::kSquirrel && !nl_dc) {  // Overrides the predictor / tree mode defaults set just above.
+    stream_options_[stream_id].predictor =
+        (cparams_.speed_tier < SpeedTier::kKitten ? Predictor::Variable
+                                                  : Predictor::Best);
+    stream_options_[stream_id].wp_tree_mode =
+        ModularOptions::TreeMode::kDefault;
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn;
+  }
+  if (cparams_.decoding_speed_tier >= 1) {  // Takes precedence over any tree_kind chosen above.
+    stream_options_[stream_id].tree_kind =
+        ModularOptions::TreeKind::kGradientFixedDC;
+  }
+
+  stream_images_[stream_id] = Image(r.xsize(), r.ysize(), 8, 3);
+  if (nl_dc && stream_options_[stream_id].tree_kind ==  // Branch 1: nl_dc with a gradient fixed tree -> quantize against the Gradient prediction.
+                   ModularOptions::TreeKind::kGradientFixedDC) {
+    JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444());
+    for (size_t c : {1, 0, 2}) {  // Input channel 1 is handled first because the other two subtract a multiple of its quantized values.
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
+      float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
+      for (size_t y = 0; y < r.ysize(); y++) {
+        int32_t* quant_row =
+            stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);  // Index swap 0 <-> 1: input channel 1 is stored in image channel 0.
+        size_t stride = stream_images_[stream_id]
+                            .channel[c < 2 ? c ^ 1 : c]
+                            .plane.PixelsPerRow();
+        const float* row = r.ConstPlaneRow(dc, c, y);
+        if (c == 1) {
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeGradient(quant_row, stride, c, x, y,
+                                            r.xsize(), row[x], inv_factor);
+          }
+        } else {
+          int32_t* quant_row_y =
+              stream_images_[stream_id].channel[0].plane.Row(y);  // Already-quantized values of input channel 1.
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeGradient(
+                quant_row, stride, c, x, y, r.xsize(),
+                row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor);
+          }
+        }
+      }
+    }
+  } else if (nl_dc) {  // Branch 2: nl_dc otherwise -> quantize against the Weighted prediction.
+    JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444());
+    for (size_t c : {1, 0, 2}) {
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
+      float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
+      weighted::Header header;
+      weighted::State wp_state(header, r.xsize(), r.ysize());  // Fresh predictor state per channel, default header.
+      for (size_t y = 0; y < r.ysize(); y++) {
+        int32_t* quant_row =
+            stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);  // Same 0 <-> 1 index swap as in branch 1.
+        size_t stride = stream_images_[stream_id]
+                            .channel[c < 2 ? c ^ 1 : c]
+                            .plane.PixelsPerRow();
+        const float* row = r.ConstPlaneRow(dc, c, y);
+        if (c == 1) {
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeWP(quant_row, stride, c, x, y, r.xsize(),
+                                      &wp_state, row[x], inv_factor);
+            wp_state.UpdateErrors(quant_row[x], x, y, r.xsize());  // Update the predictor state with the value actually stored.
+          }
+        } else {
+          int32_t* quant_row_y =
+              stream_images_[stream_id].channel[0].plane.Row(y);  // Already-quantized values of input channel 1.
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeWP(
+                quant_row, stride, c, x, y, r.xsize(), &wp_state,
+                row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor);
+            wp_state.UpdateErrors(quant_row[x], x, y, r.xsize());
+          }
+        }
+      }
+    }
+  } else if (enc_state->shared.frame_header.chroma_subsampling.Is444()) {  // Branch 3: plain rounding, 4:4:4, still subtracting the scaled channel-1 values for the other two channels.
+    for (size_t c : {1, 0, 2}) {
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
+      float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
+      for (size_t y = 0; y < r.ysize(); y++) {
+        int32_t* quant_row =
+            stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
+        const float* row = r.ConstPlaneRow(dc, c, y);
+        if (c == 1) {
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = roundf(row[x] * inv_factor);
+          }
+        } else {
+          int32_t* quant_row_y =
+              stream_images_[stream_id].channel[0].plane.Row(y);
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] =
+                roundf((row[x] - quant_row_y[x] * (y_factor * cfl_factor)) *
+                       inv_factor);
+          }
+        }
+      }
+    }
+  } else {  // Branch 4: subsampled chroma -> each channel is rounded independently on its own (shifted) rectangle.
+    for (size_t c : {1, 0, 2}) {
+      Rect rect(
+          r.x0() >> enc_state->shared.frame_header.chroma_subsampling.HShift(c),
+          r.y0() >> enc_state->shared.frame_header.chroma_subsampling.VShift(c),
+          r.xsize() >>
+              enc_state->shared.frame_header.chroma_subsampling.HShift(c),
+          r.ysize() >>
+              enc_state->shared.frame_header.chroma_subsampling.VShift(c));
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      size_t ys = rect.ysize();
+      size_t xs = rect.xsize();
+      Channel& ch = stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c];
+      ch.w = xs;  // Channel dimensions are set to the subsampled size before shrink() is called.
+      ch.h = ys;
+      ch.shrink();
+      for (size_t y = 0; y < ys; y++) {
+        int32_t* quant_row = ch.plane.Row(y);
+        const float* row = rect.ConstPlaneRow(dc, c, y);
+        for (size_t x = 0; x < xs; x++) {
+          quant_row[x] = roundf(row[x] * inv_factor);
+        }
+      }
+    }
+  }
+
+  DequantDC(r, &enc_state->shared.dc_storage, &enc_state->shared.quant_dc,  // Runs on the just-quantized stream image; 1.0 / mul is passed to account for the extra precision.
+            stream_images_[stream_id], enc_state->shared.quantizer.MulDC(),
+            1.0 / mul, enc_state->shared.cmap.DCFactors(),
+            enc_state->shared.frame_header.chroma_subsampling,
+            enc_state->shared.block_ctx_map);
+}
+
+void ModularFrameEncoder::AddACMetadata(size_t group_index, bool jpeg_transcode,  // Builds the 4-channel AC metadata stream image of one DC group and records how many strategy/quant entries it holds.
+                                        PassesEncoderState* enc_state) {
+  const Rect r = enc_state->shared.DCGroupRect(group_index);
+  size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim_);
+  stream_options_[stream_id].max_chan_size = 0xFFFFFF;
+  stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kNoWP;
+  if (jpeg_transcode) {
+    stream_options_[stream_id].tree_kind =
+        ModularOptions::TreeKind::kJpegTranscodeACMeta;
+  } else if (cparams_.speed_tier >= SpeedTier::kFalcon) {
+    stream_options_[stream_id].tree_kind =
+        ModularOptions::TreeKind::kFalconACMeta;
+  } else if (cparams_.speed_tier > SpeedTier::kKitten) {  // No final else: for the remaining tiers tree_kind keeps its previous value.
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kACMeta;
+  }
+  // If we are using a non-constant CfL field, and are in a slow enough mode,
+  // re-enable tree computation for it.
+  if (cparams_.speed_tier < SpeedTier::kSquirrel &&
+      cparams_.force_cfl_jpeg_recompression) {
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn;
+  }
+  // YToX, YToB, ACS + QF, EPF
+  Image& image = stream_images_[stream_id];
+  image = Image(r.xsize(), r.ysize(), 8, 4);  // Channels 0..2 are replaced below; channel 3 keeps the r.xsize() x r.ysize() size.
+  static_assert(kColorTileDimInBlocks == 8, "Color tile size changed");
+  Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3);  // r in units of 8 (the asserted tile size), with the extent rounded up.
+  image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[2] = Channel(r.xsize() * r.ysize(), 2, 0, 0);  // Two rows sized for the worst case of one entry per position in r.
+  ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytox_map,
+                       Rect(image.channel[0].plane), &image.channel[0].plane);
+  ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytob_map,
+                       Rect(image.channel[1].plane), &image.channel[1].plane);
+  size_t num = 0;  // Number of entries written so far into the two rows of channel 2.
+  for (size_t y = 0; y < r.ysize(); y++) {
+    AcStrategyRow row_acs = enc_state->shared.ac_strategy.ConstRow(r, y);
+    const int32_t* row_qf = r.ConstRow(enc_state->shared.raw_quant_field, y);
+    const uint8_t* row_epf = r.ConstRow(enc_state->shared.epf_sharpness, y);
+    int32_t* out_acs = image.channel[2].plane.Row(0);  // Row 0 of channel 2: raw strategy values (same pointer on every iteration of y).
+    int32_t* out_qf = image.channel[2].plane.Row(1);  // Row 1 of channel 2: quant field values minus 1.
+    int32_t* row_out_epf = image.channel[3].plane.Row(y);
+    for (size_t x = 0; x < r.xsize(); x++) {
+      row_out_epf[x] = row_epf[x];  // EPF sharpness is copied for every position.
+      if (!row_acs[x].IsFirstBlock()) continue;  // Strategy and quant are recorded only where IsFirstBlock() is true.
+      out_acs[num] = row_acs[x].RawStrategy();
+      out_qf[num] = row_qf[x] - 1;
+      num++;
+    }
+  }
+  image.channel[2].w = num;  // Width is reduced to the number of entries actually written.
+  ac_metadata_size[group_index] = num;
+}
+
+void ModularFrameEncoder::EncodeQuantTable(  // Writes qtable_den, then the table itself: either via an existing encoder's QuantTable stream or by compressing a freshly built image.
+    size_t size_x, size_t size_y, BitWriter* writer,
+    const QuantEncoding& encoding, size_t idx,
+    ModularFrameEncoder* modular_frame_encoder) {  // When non-null, the table data is taken from this encoder's stream for `idx` instead of from `encoding`.
+  JXL_ASSERT(encoding.qraw.qtable != nullptr);
+  JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size());  // The table holds three planes of size_x * size_y values.
+  JXL_CHECK(F16Coder::Write(encoding.qraw.qtable_den, writer));
+  if (modular_frame_encoder) {
+    JXL_CHECK(modular_frame_encoder->EncodeStream(  // Called with a null aux_out and layer 0.
+        writer, nullptr, 0, ModularStreamId::QuantTable(idx)));
+    return;
+  }
+  Image image(size_x, size_y, 8, 3);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < size_y; y++) {
+      int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
+      for (size_t x = 0; x < size_x; x++) {
+        row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x];  // qtable layout: plane c, then row y, then column x.
+      }
+    }
+  }
+  ModularOptions cfopts;  // Default options.
+  JXL_CHECK(ModularGenericCompress(image, cfopts, writer));
+}
+
+void ModularFrameEncoder::AddQuantTable(size_t size_x, size_t size_y,  // Copies a raw quant table into the stream image reserved for QuantTable(idx).
+                                        const QuantEncoding& encoding,
+                                        size_t idx) {
+  size_t stream_id = ModularStreamId::QuantTable(idx).ID(frame_dim_);
+  JXL_ASSERT(encoding.qraw.qtable != nullptr);
+  JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size());  // The table holds three planes of size_x * size_y values.
+  Image& image = stream_images_[stream_id];
+  image = Image(size_x, size_y, 8, 3);  // Replaces whatever image the slot held before.
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < size_y; y++) {
+      int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
+      for (size_t x = 0; x < size_x; x++) {
+        row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x];  // qtable layout: plane c, then row y, then column x.
+      }
+    }
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.h b/third_party/jpeg-xl/lib/jxl/enc_modular.h
new file mode 100644
index 0000000000..2af66e951f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_modular.h
@@ -0,0 +1,92 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_MODULAR_H_
+#define LIB_JXL_ENC_MODULAR_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+class ModularFrameEncoder {
+ public:
+  ModularFrameEncoder(const FrameHeader& frame_header,
+                      const CompressParams& cparams_orig);
+  Status ComputeEncodingData(const FrameHeader& frame_header,
+                             const ImageMetadata& metadata,
+                             Image3F* JXL_RESTRICT color,
+                             const std::vector<ImageF>& extra_channels,
+                             PassesEncoderState* JXL_RESTRICT enc_state,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             AuxOut* aux_out, bool do_color);
+  // Encodes global info (tree + histograms) in the `writer`.
+  Status EncodeGlobalInfo(BitWriter* writer, AuxOut* aux_out);
+  // Encodes a specific modular image (identified by `stream`) in the `writer`,
+  // assigning bits to the provided `layer`.
+  Status EncodeStream(BitWriter* writer, AuxOut* aux_out, size_t layer,
+                      const ModularStreamId& stream);
+  // Creates a modular image for a given DC group of VarDCT mode. `dc` is the
+  // input DC image, not quantized; the group is specified by `group_index`, and
+  // `nl_dc` decides whether to apply a near-lossless processing to the DC or
+  // not.
+  void AddVarDCTDC(const Image3F& dc, size_t group_index, bool nl_dc,
+                   PassesEncoderState* enc_state, bool jpeg_transcode);
+  // Creates a modular image for the AC metadata of the given group
+  // (`group_index`).
+  void AddACMetadata(size_t group_index, bool jpeg_transcode,
+                     PassesEncoderState* enc_state);
+  // Writes `encoding.qraw.qtable_den`, then a RAW quantization table. If
+  // `modular_frame_encoder` is null, the `size_x x size_y` table in `encoding`
+  // is compressed directly; otherwise the stream stored for table `idx` in
+  // `modular_frame_encoder` is written. Asserts 3*size_x*size_y table entries.
+  static void EncodeQuantTable(size_t size_x, size_t size_y, BitWriter* writer,
+                               const QuantEncoding& encoding, size_t idx,
+                               ModularFrameEncoder* modular_frame_encoder);
+  // Stores a quantization table for future usage with `EncodeQuantTable`.
+  void AddQuantTable(size_t size_x, size_t size_y,
+                     const QuantEncoding& encoding, size_t idx);
+
+  std::vector<size_t> ac_metadata_size;  // written per group_index
+  std::vector<uint8_t> extra_dc_precision;
+
+ private:
+  Status PrepareEncoding(const FrameHeader& frame_header, ThreadPool* pool,
+                         EncoderHeuristics* heuristics,
+                         AuxOut* aux_out = nullptr);
+  Status PrepareStreamParams(const Rect& rect, const CompressParams& cparams,
+                             int minShift, int maxShift,
+                             const ModularStreamId& stream, bool do_color);
+  std::vector<Image> stream_images_;  // indexed by ModularStreamId::ID()
+  std::vector<ModularOptions> stream_options_;
+
+  Tree tree_;
+  std::vector<std::vector<Token>> tree_tokens_;
+  std::vector<GroupHeader> stream_headers_;
+  std::vector<std::vector<Token>> tokens_;
+  EntropyEncodingData code_;
+  std::vector<uint8_t> context_map_;
+  FrameDimensions frame_dim_;
+  CompressParams cparams_;
+  std::vector<size_t> tree_splits_;
+  std::vector<ModularMultiplierInfo> multiplier_info_;
+  std::vector<std::vector<uint32_t>> gi_channel_;
+  std::vector<size_t> image_widths_;
+  Predictor delta_pred_ = Predictor::Average4;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_MODULAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_noise.cc b/third_party/jpeg-xl/lib/jxl/enc_noise.cc
new file mode 100644
index 0000000000..54bb4482e8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_noise.cc
@@ -0,0 +1,374 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_noise.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_optimize.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+namespace {
+
+using OptimizeArray = optimize::Array<double, NoiseParams::kNumNoisePoints>;
+
+float GetScoreSumsOfAbsoluteDifferences(const Image3F& opsin, const int x,
+                                        const int y, const int block_size) {
+  const int small_bl_size_x = 3;  // patch width
+  const int small_bl_size_y = 4;  // patch height
+  const int kNumSAD =  // NOTE(review): assumes block_size > 4 — confirm
+      (block_size - small_bl_size_x) * (block_size - small_bl_size_y);
+  // One SAD value per patch position inside the block_size x block_size tile
+  int counter = 0;
+  const int offset = 2;  // top-left corner of the reference ("center") patch
+
+  std::vector<float> sad(kNumSAD, 0);
+  for (int y_bl = 0; y_bl + small_bl_size_y < block_size; ++y_bl) {
+    for (int x_bl = 0; x_bl + small_bl_size_x < block_size; ++x_bl) {
+      float sad_sum = 0;
+      // Compare the patch at (x_bl, y_bl) with the center patch, pixel by
+      // pixel, on the 0.5 * (plane 1 + plane 0) channel.
+      for (int cy = 0; cy < small_bl_size_y; ++cy) {
+        for (int cx = 0; cx < small_bl_size_x; ++cx) {
+          float wnd = 0.5f * (opsin.PlaneRow(1, y + y_bl + cy)[x + x_bl + cx] +
+                              opsin.PlaneRow(0, y + y_bl + cy)[x + x_bl + cx]);
+          float center =
+              0.5f * (opsin.PlaneRow(1, y + offset + cy)[x + offset + cx] +
+                      opsin.PlaneRow(0, y + offset + cy)[x + offset + cx]);
+          sad_sum += std::abs(center - wnd);
+        }
+      }
+      sad[counter++] = sad_sum;  // the loops visit exactly kNumSAD positions
+    }
+  }
+  const int kSamples = (kNumSAD) / 2;
+  // As with ROAD (rank order absolute distance), we keep the smallest half of
+  // the values in SAD (we use here the more robust patch SAD instead of
+  // absolute single-pixel differences).
+  std::sort(sad.begin(), sad.end());
+  const float total_sad_sum =
+      std::accumulate(sad.begin(), sad.begin() + kSamples, 0.0f);
+  return total_sad_sum / kSamples;  // mean of the kSamples smallest SADs
+}
+
+class NoiseHistogram {  // fixed-size histogram of kBins integer-valued bins
+ public:
+  static constexpr int kBins = 256;
+
+  NoiseHistogram() { std::fill(bins, bins + kBins, 0); }
+
+  void Increment(const float x) { bins[Index(x)] += 1; }  // x is clamped
+  int Get(const float x) const { return bins[Index(x)]; }
+  int Bin(const size_t bin) const { return bins[bin]; }  // no bounds check
+
+  int Mode() const {  // index of the fullest bin; the lowest index wins ties
+    size_t max_idx = 0;
+    for (size_t i = 0; i < kBins; i++) {
+      if (bins[i] > bins[max_idx]) max_idx = i;
+    }
+    return max_idx;
+  }
+
+  double Quantile(double q01) const {
+    // NOTE(review): the sum starts at 1, so total is count + 1 — confirm.
+    const int64_t total = std::accumulate(bins, bins + kBins, int64_t{1});
+    const int64_t target = static_cast<int64_t>(q01 * total);
+    // Until sum >= target:
+    int64_t sum = 0;
+    size_t i = 0;
+    for (; i < kBins; ++i) {
+      sum += bins[i];
+      // Exact match: assume middle of bin i
+      if (sum == target) {
+        return i + 0.5;
+      }
+      if (sum > target) break;
+    }
+
+    // Next non-empty bin (in case histogram is sparsely filled)
+    size_t next = i + 1;
+    while (next < kBins && bins[next] == 0) {
+      ++next;
+    }
+
+    // Linear interpolation according to how far into next we went
+    // NOTE(review): after the break sum > target, so excess is negative and
+    // weight_next may fall outside [0, 1]; the result is clamped — confirm.
+    const double excess = target - sum;
+    const double weight_next = bins[Index(next)] / excess;
+    return ClampX(next * weight_next + i * (1.0 - weight_next));
+  }
+
+  // Inter-quartile range
+  double IQR() const { return Quantile(0.75) - Quantile(0.25); }
+
+ private:
+  template <typename T>
+  T ClampX(const T x) const {  // clamps x to [0, kBins - 1]
+    return std::min(std::max(T(0), x), T(kBins - 1));
+  }
+  // Truncates x to int, then clamps it to a valid bin index.
+  size_t Index(const float x) const { return ClampX(static_cast<int>(x)); }
+
+  uint32_t bins[kBins];
+};
+
+std::vector<float> GetSADScoresForPatches(const Image3F& opsin,
+                                          const size_t block_s,
+                                          const size_t num_bin,
+                                          NoiseHistogram* sad_histogram) {
+  // One score per whole block_s x block_s tile; partial edge tiles are skipped.
+  const size_t tiles_y = opsin.ysize() / block_s;
+  const size_t tiles_x = opsin.xsize() / block_s;
+  std::vector<float> sad_scores(tiles_y * tiles_x, 0.0f);
+  int block_index = 0;
+  for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) {
+    for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) {
+      const float sad = GetScoreSumsOfAbsoluteDifferences(opsin, x, y, block_s);
+      sad_scores[block_index++] = sad;
+      sad_histogram->Increment(sad * num_bin);  // binned after scaling
+    }
+  }
+  return sad_scores;
+}
+
+float GetSADThreshold(const NoiseHistogram& histogram, const int num_bin) {
+  // The threshold is the modal SAD bin converted back to score units, i.e. we
+  // assume the most common SAD value comes from "flat" patches. Scores were
+  // scaled by `num_bin` when binned, so the same factor undoes the scaling.
+  // TODO(user) handle bimodal and heavy-tailed cases (e.g. regular textures).
+  const int mode = histogram.Mode();
+  return static_cast<float>(mode) / num_bin;
+}
+
+// loss = sum asym * (F(x) - nl)^2 + kReg * nl.size() * sum (w[i] - w[i+1])^2
+// where asym = 1 if F(x) <= nl, kAsym if F(x) > nl; F interpolates the LUT w.
+struct LossFunction {
+  explicit LossFunction(std::vector<NoiseLevel> nl0) : nl(std::move(nl0)) {}
+
+  double Compute(const OptimizeArray& w, OptimizeArray* df,
+                 bool skip_regularization = false) const {
+    constexpr double kReg = 0.005;
+    constexpr double kAsym = 1.1;
+    double loss_function = 0;
+    for (size_t i = 0; i < w.size(); i++) {
+      (*df)[i] = 0;  // *df is fully overwritten on every call
+    }
+    for (auto ind : nl) {
+      std::pair<int, float> pos = IndexAndFrac(ind.intensity);
+      JXL_DASSERT(pos.first >= 0 && static_cast<size_t>(pos.first) <
+                                        NoiseParams::kNumNoisePoints - 1);
+      double low = w[pos.first];
+      double hi = w[pos.first + 1];
+      double val = low * (1.0f - pos.second) + hi * pos.second;  // F(x)
+      double dist = val - ind.noise_level;
+      if (dist > 0) {  // LUT above the sample: weighted by kAsym
+        loss_function += kAsym * dist * dist;
+        (*df)[pos.first] -= kAsym * (1.0f - pos.second) * dist;
+        (*df)[pos.first + 1] -= kAsym * pos.second * dist;
+      } else {
+        loss_function += dist * dist;
+        (*df)[pos.first] -= (1.0f - pos.second) * dist;  // descent direction,
+        (*df)[pos.first + 1] -= pos.second * dist;  // without the factor 2
+      }
+    }
+    if (skip_regularization) return loss_function;  // data term only
+    for (size_t i = 0; i + 1 < w.size(); i++) {
+      double diff = w[i] - w[i + 1];  // penalizes steps between neighbours
+      loss_function += kReg * nl.size() * diff * diff;
+      (*df)[i] -= kReg * diff * nl.size();
+      (*df)[i + 1] += kReg * diff * nl.size();
+    }
+    return loss_function;
+  }
+
+  std::vector<NoiseLevel> nl;  // samples the LUT is fitted to
+};
+
+// Fits `noise_params->lut` to `noise_level` by minimizing LossFunction from a
+// flat start at the mean level; clears it if there is no data or a poor fit.
+void OptimizeNoiseParameters(const std::vector<NoiseLevel>& noise_level,
+                             NoiseParams* noise_params) {
+  constexpr double kMaxError = 1e-3;
+  static const double kPrecision = 1e-8;
+  static const int kMaxIter = 40;
+
+  // Without samples the mean below is 0/0 = NaN, which would end up in lut.
+  if (noise_level.empty()) {
+    noise_params->Clear();
+    return;
+  }
+  float avg = 0;
+  for (const NoiseLevel& nl : noise_level) {
+    avg += nl.noise_level;
+  }
+  avg /= noise_level.size();
+
+  LossFunction loss_function(noise_level);
+  OptimizeArray parameter_vector(avg);
+  parameter_vector = optimize::OptimizeWithScaledConjugateGradientMethod(
+      loss_function, parameter_vector, kPrecision, kMaxIter);
+  OptimizeArray df = parameter_vector;
+  float loss = loss_function.Compute(parameter_vector, &df,
+                                     /*skip_regularization=*/true) /
+               noise_level.size();
+  // Approximation went too badly: escape with no noise at all.
+  if (loss > kMaxError) {
+    noise_params->Clear();
+    return;
+  }
+  for (size_t i = 0; i < parameter_vector.size(); i++) {
+    noise_params->lut[i] = std::max(parameter_vector[i], 0.0);
+  }
+}
+
+std::vector<NoiseLevel> GetNoiseLevel(
+    const Image3F& opsin, const std::vector<float>& texture_strength,
+    const float threshold, const size_t block_s) {
+  std::vector<NoiseLevel> noise_level_per_intensity;
+
+  const int filt_size = 1;  // kernel radius: the filter below is 3x3
+  static const float kLaplFilter[filt_size * 2 + 1][filt_size * 2 + 1] = {
+      {-0.25f, -1.0f, -0.25f},
+      {-1.0f, 5.0f, -1.0f},
+      {-0.25f, -1.0f, -0.25f},
+  };
+
+  // The noise model is built based on channel 0.5 * (X+Y) as we notice that it
+  // is similar to the model 0.5 * (Y-X)
+  size_t patch_index = 0;  // index into texture_strength, one per whole tile
+
+  for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) {
+    for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) {
+      if (texture_strength[patch_index] <= threshold) {
+        // Calculate mean value
+        float mean_int = 0;
+        for (size_t y_bl = 0; y_bl < block_s; ++y_bl) {
+          for (size_t x_bl = 0; x_bl < block_s; ++x_bl) {
+            mean_int += 0.5f * (opsin.PlaneRow(1, y + y_bl)[x + x_bl] +
+                                opsin.PlaneRow(0, y + y_bl)[x + x_bl]);
+          }
+        }
+        mean_int /= block_s * block_s;
+
+        // Calculate Noise level
+        float noise_level = 0;
+        size_t count = 0;
+        for (size_t y_bl = 0; y_bl < block_s; ++y_bl) {
+          for (size_t x_bl = 0; x_bl < block_s; ++x_bl) {
+            float filtered_value = 0;
+            for (int y_f = -1 * filt_size; y_f <= filt_size; ++y_f) {
+              if ((static_cast<ssize_t>(y_bl) + y_f) >= 0 &&
+                  (y_bl + y_f) < block_s) {  // row tap stays inside the block
+                for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) {
+                  if ((static_cast<ssize_t>(x_bl) + x_f) >= 0 &&
+                      (x_bl + x_f) < block_s) {
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl + x_f] +
+                         opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl + x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  } else {  // column tap outside the block: mirror it (-x_f)
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl - x_f] +
+                         opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl - x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  }
+                }
+              } else {  // row tap outside the block: mirror it (-y_f)
+                for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) {
+                  if ((static_cast<ssize_t>(x_bl) + x_f) >= 0 &&
+                      (x_bl + x_f) < block_s) {
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl + x_f] +
+                         opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl + x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  } else {  // both taps outside the block: mirror both
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl - x_f] +
+                         opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl - x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  }
+                }
+              }
+            }
+            noise_level += std::abs(filtered_value);
+            ++count;
+          }
+        }
+        noise_level /= count;  // mean absolute filter response over the patch
+        NoiseLevel nl;
+        nl.intensity = mean_int;
+        nl.noise_level = noise_level;
+        noise_level_per_intensity.push_back(nl);
+      }
+      ++patch_index;  // advances for every tile, whether it was used or not
+    }
+  }
+  return noise_level_per_intensity;
+}
+
+void EncodeFloatParam(float val, float precision, BitWriter* writer) {
+  JXL_ASSERT(val >= 0);
+  const int quantized = static_cast<int>(val * precision + 0.5f);  // rounds
+  JXL_ASSERT(quantized < (1 << 10));  // must fit the 10-bit field below
+  writer->Write(10, quantized);
+}
+
+}  // namespace
+
+Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params,
+                         float quality_coef) {
+  // The encoder's patch size need not match the decoder's. It has to be large
+  // enough to estimate the noise level, yet small enough that a single
+  // intensity value still describes the whole patch.
+  const size_t block_s = 8;
+  const size_t kNumBin = 256;
+  NoiseHistogram sad_histogram;
+  const std::vector<float> sad_scores =
+      GetSADScoresForPatches(opsin, block_s, kNumBin, &sad_histogram);
+  const float sad_threshold = GetSADThreshold(sad_histogram, kNumBin);
+  // A large threshold means the image has a strong pattern, which fools the
+  // model into adding too much noise; a zero threshold is rejected as well.
+  // In both cases no noise is added.
+  if (sad_threshold > 0.15f || sad_threshold <= 0.0f) {
+    noise_params->Clear();
+    return false;
+  }
+  const std::vector<NoiseLevel> levels =
+      GetNoiseLevel(opsin, sad_scores, sad_threshold, block_s);
+  OptimizeNoiseParameters(levels, noise_params);
+
+  // Scale the fitted LUT; the product is evaluated in double precision.
+  const double gain = quality_coef * 1.4;
+  for (float& entry : noise_params->lut) {
+    entry *= gain;
+  }
+  return noise_params->HasAny();
+}
+
+void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out) {
+  JXL_ASSERT(noise_params.HasAny());
+  // The allotment is sized at 16 bits per noise point.
+  BitWriter::Allotment allotment(writer, NoiseParams::kNumNoisePoints * 16);
+  for (const float lut_value : noise_params.lut) {
+    EncodeFloatParam(lut_value, kNoisePrecision, writer);
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_noise.h b/third_party/jpeg-xl/lib/jxl/enc_noise.h
new file mode 100644
index 0000000000..851fdd12db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_noise.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_NOISE_H_
+#define LIB_JXL_ENC_NOISE_H_
+
+// Noise parameter estimation.
+
+#include <stddef.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Get parameters of the noise for NoiseParams model
+// Returns whether a valid noise model (with HasAny()) is set.
+Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params,
+                         float quality_coef);
+
+// `noise_params` must not be empty (the implementation asserts HasAny()).
+// The caller must set FrameHeader.flags.kNoise.
+void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize.cc b/third_party/jpeg-xl/lib/jxl/enc_optimize.cc
new file mode 100644
index 0000000000..6865ff67df
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_optimize.cc
@@ -0,0 +1,163 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_optimize.h"
+
+#include <algorithm>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+namespace optimize {
+
+namespace {
+
+// Averages the coordinates (slots [1..dim]) of every vertex except the last.
+// The simplex must be sorted by objective value (slot [0]) beforehand, so that
+// the excluded vertex is the worst one. Slot [0] of the result stays 0.
+std::vector<double> Midpoint(const std::vector<std::vector<double>>& simplex) {
+  JXL_CHECK(!simplex.empty());
+  JXL_CHECK(simplex.size() == simplex[0].size());
+  const int dim = simplex.size() - 1;
+  std::vector<double> centroid(dim + 1, 0);
+  for (int coord = 1; coord <= dim; coord++) {
+    for (int v = 0; v < dim; v++) centroid[coord] += simplex[v][coord];
+    centroid[coord] /= dim;
+  }
+  return centroid;
+}
+
+// Coordinate-wise a - b over slots [1..]; slot [0] (reserved for the objective
+// value) is set to 0 rather than subtracted.
+std::vector<double> Subtract(const std::vector<double>& a,
+                             const std::vector<double>& b) {
+  JXL_CHECK(a.size() == b.size());
+  const size_t len = a.size();
+  std::vector<double> out(len);
+  out[0] = 0;
+  for (size_t pos = 1; pos < len; pos++) out[pos] = a[pos] - b[pos];
+  return out;
+}
+
+// Coordinate-wise a + b over slots [1..]; slot [0] (reserved for the objective
+// value) is set to 0 rather than summed.
+std::vector<double> Add(const std::vector<double>& a,
+                        const std::vector<double>& b) {
+  JXL_CHECK(a.size() == b.size());
+  const size_t len = a.size();
+  std::vector<double> out(len);
+  out[0] = 0;
+  for (size_t pos = 1; pos < len; pos++) out[pos] = a[pos] + b[pos];
+  return out;
+}
+
+// Coordinate-wise mean of a and b over slots [1..]; slot [0] (reserved for the
+// objective value) is set to 0 rather than averaged.
+std::vector<double> Average(const std::vector<double>& a,
+                            const std::vector<double>& b) {
+  JXL_CHECK(a.size() == b.size());
+  const size_t len = a.size();
+  std::vector<double> out(len);
+  out[0] = 0;
+  for (size_t pos = 1; pos < len; pos++) out[pos] = 0.5 * (a[pos] + b[pos]);
+  return out;
+}
+
+// Evaluates `fun` at the position stored in (*vec)[1:] and writes the
+// resulting objective value into (*vec)[0].
+// fun: the objective function, called with a copy of the position.
+void Eval(std::vector<double>* vec,
+          const std::function<double(const std::vector<double>&)>& fun) {
+  const std::vector<double> position(vec->begin() + 1, vec->end());
+  vec->front() = fun(position);
+}
+
+void Sort(std::vector<std::vector<double>>* simplex) {
+  std::sort(std::begin(*simplex), std::end(*simplex));  // slot [0] dominates
+}
+
+// One Nelder-Mead-like step: sorts, then replaces the worst (last) vertex.
+void Reflect(std::vector<std::vector<double>>* simplex,
+             const std::function<double(const std::vector<double>&)>& fun) {
+  Sort(simplex);
+  const std::vector<double>& last = simplex->back();  // worst vertex
+  std::vector<double> mid = Midpoint(*simplex);
+  std::vector<double> diff = Subtract(mid, last);
+  std::vector<double> mirrored = Add(mid, diff);  // last mirrored through mid
+  Eval(&mirrored, fun);
+  if (mirrored[0] > (*simplex)[simplex->size() - 2][0]) {
+    // Still the worst, shrink towards the best.
+    std::vector<double> shrinking = Average(simplex->back(), (*simplex)[0]);
+    Eval(&shrinking, fun);
+    simplex->back() = shrinking;
+  } else if (mirrored[0] < (*simplex)[0][0]) {
+    // New best: also try one more step of `diff` and keep the better point.
+    std::vector<double> even_further = Add(mirrored, diff);
+    Eval(&even_further, fun);
+    if (even_further[0] < mirrored[0]) {
+      mirrored = even_further;
+    }
+    simplex->back() = mirrored;  // stays at the back until the next Sort
+  } else {
+    // not a best, not a worst point
+    simplex->back() = mirrored;
+  }
+}
+
+// Builds dim+1 vertices: `init`, plus one `amount` step per axis from the best.
+std::vector<std::vector<double>> InitialSimplex(
+    int dim, double amount, const std::vector<double>& init,
+    const std::function<double(const std::vector<double>&)>& fun) {
+  std::vector<double> best(1 + dim, 0);  // slot [0] is the objective value
+  std::copy(init.begin(), init.end(), best.begin() + 1);  // needs size <= dim
+  Eval(&best, fun);
+  std::vector<std::vector<double>> result{best};
+  for (int i = 0; i < dim; i++) {
+    best = result[0];  // current best vertex (result is kept sorted)
+    best[i + 1] += amount;
+    Eval(&best, fun);
+    result.push_back(best);
+    Sort(&result);
+  }
+  return result;
+}
+
+// For comparing the same with the python tool
+/*void RunSimplexExternal(
+    int dim, double amount, int max_iterations,
+    const std::function<double((const vector<double>&))>& fun) {
+  vector<double> vars;
+  for (int i = 0; i < dim; i++) {
+    vars.push_back(atof(getenv(StrCat("VAR", i).c_str())));
+  }
+  double result = fun(vars);
+  std::cout << "Result=" << result;
+}*/
+
+}  // namespace
+
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations, const std::vector<double>& init,
+    const std::function<double(const std::vector<double>&)>& fun) {
+  auto simplex = InitialSimplex(dim, amount, init, fun);
+  // Reflect() sorts the simplex itself before it replaces the worst vertex.
+  for (int i = 0; i < max_iterations; i++) Reflect(&simplex, fun);
+  // The vertex written by the last Reflect() may be the new best but still
+  // sits at the back, so sort once more before returning the front.
+  Sort(&simplex);
+  return simplex[0];
+}
+
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations,
+    const std::function<double(const std::vector<double>&)>& fun) {
+  const std::vector<double> origin(dim, 0.0);  // default start: all zeros
+  return RunSimplex(dim, amount, max_iterations, origin, fun);
+}
+
+}  // namespace optimize
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize.h b/third_party/jpeg-xl/lib/jxl/enc_optimize.h
new file mode 100644
index 0000000000..0a60198214
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_optimize.h
@@ -0,0 +1,218 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Utility functions for optimizing multi-dimensional nonlinear functions.
+
+#ifndef LIB_JXL_OPTIMIZE_H_
+#define LIB_JXL_OPTIMIZE_H_
+
+#include <stdio.h>
+
+#include <cmath>
+#include <cstdio>
+#include <functional>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace optimize {
+
+// A fixed-size array of N numeric values. The free operators declared after
+// this class add element-wise +, -, scalar scaling and a dot product.
+template <typename T, size_t N>
+class Array {
+ public:
+  Array() = default;  // `Array<T, N> a;` leaves arithmetic elements unset
+  explicit Array(T v) {  // fills every element with v
+    for (size_t i = 0; i < N; i++) v_[i] = v;
+  }
+
+  size_t size() const { return N; }
+
+  T& operator[](size_t index) {
+    JXL_DASSERT(index < N);
+    return v_[index];
+  }
+  T operator[](size_t index) const {  // returns a copy, not a reference
+    JXL_DASSERT(index < N);
+    return v_[index];
+  }
+
+ private:
+  // The values used by this Array.
+  T v_[N];
+};
+
+template <typename T, size_t N>
+Array<T, N> operator+(const Array<T, N>& x, const Array<T, N>& y) {
+  Array<T, N> sum;  // every element is assigned below
+  for (size_t idx = 0; idx != N; ++idx) {
+    sum[idx] = x[idx] + y[idx];
+  }
+  return sum;
+}
+
+template <typename T, size_t N>
+Array<T, N> operator-(const Array<T, N>& x, const Array<T, N>& y) {
+  Array<T, N> difference;  // every element is assigned below
+  for (size_t idx = 0; idx != N; ++idx) {
+    difference[idx] = x[idx] - y[idx];
+  }
+  return difference;
+}
+
+template <typename T, size_t N>
+Array<T, N> operator*(T v, const Array<T, N>& x) {
+  Array<T, N> scaled;  // v times each element of x
+  for (size_t idx = 0; idx != N; ++idx) {
+    scaled[idx] = v * x[idx];
+  }
+  return scaled;
+}
+
+template <typename T, size_t N>
+T operator*(const Array<T, N>& x, const Array<T, N>& y) {
+  T dot = 0.0;  // dot product, accumulated in index order
+  for (size_t idx = 0; idx != N; ++idx) {
+    dot += x[idx] * y[idx];
+  }
+  return dot;
+}
+
+// Runs a Nelder-Mead-like optimization for max_iterations iterations.
+// fun gets called with a vector of size dim as argument, and returns the score
+// based on those parameters (lower is better). Returns a vector of dim+1
+// dimensions, where the first value is the best function value found and
+// the rest is the corresponding argument. Use init to pass an initial guess
+// of where the optimum is; without init the search starts at the origin.
+//
+// Usage example:
+//
+// RunSimplex(2, 0.1, 100, [](const std::vector<double>& v) {
+//   return (v[0] - 5) * (v[0] - 5) + (v[1] - 7) * (v[1] - 7);
+// });
+//
+// Returns approximately (0.0, 5, 7)
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations,
+    const std::function<double(const std::vector<double>&)>& fun);
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations, const std::vector<double>& init,
+    const std::function<double(const std::vector<double>&)>& fun);
+
+// Implementation of the Scaled Conjugate Gradient method described in the
+// following paper:
+//   Moller, M. "A Scaled Conjugate Gradient Algorithm for Fast Supervised
+//   Learning", Neural Networks, Vol. 6. pp. 525-533, 1993
+//   http://sci2s.ugr.es/keel/pdf/algorithm/articulo/moller1990.pdf
+//
+// The Function template parameter is a class that has the following method:
+//
+//   // Returns the value of the function at point w and sets *df to be the
+//   // negative gradient vector of the function at point w.
+//   double Compute(const optimize::Array<T, N>& w,
+//                  optimize::Array<T, N>* df) const;
+//
+// Returns a vector w, such that |df(w)| < grad_norm_threshold.
+template <typename T, size_t N, typename Function>
+Array<T, N> OptimizeWithScaledConjugateGradientMethod(
+    const Function& f, const Array<T, N>& w0, const T grad_norm_threshold,
+    size_t max_iters) {
+  const size_t n = w0.size();
+  const T rsq_threshold = grad_norm_threshold * grad_norm_threshold;
+  const T sigma0 = static_cast<T>(0.0001);
+  const T l_min = static_cast<T>(1.0e-15);
+  const T l_max = static_cast<T>(1.0e15);
+
+  Array<T, N> w = w0;
+  Array<T, N> wp;
+  Array<T, N> r;
+  Array<T, N> rt;
+  Array<T, N> e;
+  Array<T, N> p;
+  T psq;
+  T fp;
+  T D;
+  T d;
+  T m;
+  T a;
+  T b;
+  T s;
+  T t;
+
+  T fw = f.Compute(w, &r);  // r: negative gradient at w (Compute contract)
+  T rsq = r * r;
+  // w0 already meets the threshold. With an exactly zero gradient the loop
+  // would divide by sqrt(p * p) == 0 and never accept a step, so stop here.
+  if (rsq <= rsq_threshold) return w;
+  e = r;
+  p = r;
+  T l = static_cast<T>(1.0);
+  bool success = true;
+  size_t n_success = 0;
+  size_t k = 0;
+
+  while (k++ < max_iters) {
+    if (success) {  // only re-probe along p after an accepted step
+      m = -(p * r);
+      if (m >= 0) {
+        p = r;
+        m = -(p * r);
+      }
+      psq = p * p;
+      s = sigma0 / std::sqrt(psq);
+      f.Compute(w + (s * p), &rt);
+      t = (p * (r - rt)) / s;
+    }
+    d = t + l * psq;
+    if (d <= 0) {
+      d = l * psq;
+      l = l - t / psq;
+    }
+
+    a = -m / d;
+    wp = w + a * p;  // candidate point
+    fp = f.Compute(wp, &rt);
+    D = 2.0 * (fp - fw) / (a * m);
+    if (D >= 0.0) {
+      success = true;
+      n_success++;
+      w = wp;
+    } else {
+      success = false;
+    }
+    // Accepted step: adopt the new gradient and test for convergence.
+    if (success) {
+      e = r;
+      r = rt;
+      rsq = r * r;
+      fw = fp;
+      if (rsq <= rsq_threshold) {
+        break;
+      }
+    }
+    // Adapt the damping factor l from the comparison ratio D.
+    if (D < 0.25) {
+      l = std::min(4.0 * l, l_max);
+    } else if (D > 0.75) {
+      l = std::max(0.25 * l, l_min);
+    }
+    // Restart along r whenever n_success is a multiple of n, else update p.
+    if ((n_success % n) == 0) {
+      p = r;
+      l = 1.0;
+    } else if (success) {
+      b = ((e - r) * r) / m;
+      p = b * p + r;
+    }
+  }
+  return w;
+}
+
+}  // namespace optimize
+}  // namespace jxl
+
+#endif  // LIB_JXL_OPTIMIZE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc b/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc
new file mode 100644
index 0000000000..1c6699f99e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc
@@ -0,0 +1,109 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_optimize.h"
+
+#include <stdio.h>
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace optimize {
+namespace {
+
+// Iteration cap handed to the conjugate-gradient optimizer by the tests below.
+constexpr size_t kMaxTestIter = 100000;
+
+// Test objective F(w) = |w - w_min|^2; *df receives -2 * (w - w_min).
+struct SimpleQuadraticFunction {
+  using ArrayType = Array<double, 2>;
+  explicit SimpleQuadraticFunction(const ArrayType& w0) : w_min(w0) {}
+
+  double Compute(const ArrayType& w, ArrayType* df) const {
+    const ArrayType offset = w - w_min;
+    *df = -2.0 * offset;
+    return offset * offset;
+  }
+
+  ArrayType w_min;
+};
+
+// F(w | x, y) = \sum_i (y_i - (w[0] * x_i ^ w[1] + w[2]))^2 over x_i != 0.
+struct PowerFunction {
+  explicit PowerFunction(const std::vector<double>& x0,
+                         const std::vector<double>& y0)
+      : x(x0), y(y0) {}
+
+  typedef Array<double, 3> ArrayType;
+  // Returns the loss at w; *df receives the negated gradient of the loss.
+  double Compute(const ArrayType& w, ArrayType* df) const {
+    double loss_function = 0;
+    (*df)[0] = (*df)[1] = (*df)[2] = 0;
+    for (size_t ind = 0; ind < y.size(); ++ind) {
+      if (x[ind] == 0) continue;  // Presumably because log(0) is not finite.
+      // Evaluated once; used by the loss and by two of the gradient terms.
+      const double x_pow = pow(x[ind], w[1]);
+      const double l_f = y[ind] - (w[0] * x_pow + w[2]);
+      (*df)[0] += 2.0 * l_f * x_pow;
+      (*df)[1] += 2.0 * l_f * w[0] * x_pow * log(x[ind]);
+      (*df)[2] += 2.0 * l_f * 1;
+      loss_function += l_f * l_f;
+    }
+    return loss_function;
+  }
+
+  std::vector<double> x;
+  std::vector<double> y;
+};
+
+TEST(OptimizeTest, SimpleQuadraticFunction) {
+  using ArrayType = SimpleQuadraticFunction::ArrayType;
+  constexpr double kPrecision = 1e-8;
+  ArrayType target;  // Location of the minimum.
+  target[0] = 1.0;
+  target[1] = 2.0;
+  const SimpleQuadraticFunction objective(target);
+  const ArrayType solution = OptimizeWithScaledConjugateGradientMethod(
+      objective, ArrayType(0.), kPrecision, kMaxTestIter);
+  EXPECT_NEAR(solution[0], 1.0, kPrecision);
+  EXPECT_NEAR(solution[1], 2.0, kPrecision);
+}
+
+TEST(OptimizeTest, PowerFunction) {
+  // Noise-free samples of y = 2 * x^3 + 5 at x = 0, 1, ..., 9.
+  std::vector<double> x(10);
+  std::vector<double> y(10);
+  for (int i = 0; i < 10; ++i) {
+    x[i] = 1. * i;
+    y[i] = 2. * pow(x[i], 3) + 5.;
+  }
+  const PowerFunction objective(x, y);
+  using ArrayType = PowerFunction::ArrayType;
+  constexpr double kPrecision = 0.01;
+  const ArrayType fit = OptimizeWithScaledConjugateGradientMethod(
+      objective, ArrayType(0.), kPrecision, kMaxTestIter);
+  EXPECT_NEAR(fit[0], 2.0, kPrecision);
+  EXPECT_NEAR(fit[1], 3.0, kPrecision);
+  EXPECT_NEAR(fit[2], 5.0, kPrecision);
+}
+
+TEST(OptimizeTest, SimplexOptTest) {
+  // Paraboloid with minimum value 2.0, attained at (1.0, -1.5).
+  auto f = [](const std::vector<double>& x) -> double {
+    double t1 = x[0] - 1.0;
+    double t2 = x[1] + 1.5;
+    return 2.0 + t1 * t1 + t2 * t2;
+  };
+  auto opt = RunSimplex(2, 0.01, 100, f);
+  ASSERT_EQ(opt.size(), 3u);  // Fatal: opt[0..2] are read below.
+  static const double kPrecision = 0.01;
+  EXPECT_NEAR(opt[0], 2.0, kPrecision);
+  EXPECT_NEAR(opt[1], 1.0, kPrecision);
+  EXPECT_NEAR(opt[2], -1.5, kPrecision);
+}
+
+}  // namespace
+}  // namespace optimize
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_params.h b/third_party/jpeg-xl/lib/jxl/enc_params.h
new file mode 100644
index 0000000000..737a951362
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_params.h
@@ -0,0 +1,225 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_PARAMS_H_
+#define LIB_JXL_ENC_PARAMS_H_
+
+// Parameters and flags that govern JXL compression.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+enum class SpeedTier {
+  // Like kTortoise, but additionally tries multiple combinations of Tortoise
+  // flags for modular mode.
+  kGlacier = 0,
+  // Enables the FindBestQuantizationHQ loop. Equivalent to "guetzli" mode.
+  kTortoise = 1,
+  // Enables the FindBestQuantization butteraugli loop.
+  kKitten = 2,
+  // Default tier. Enables dots, patches and spline detection by default, plus
+  // full context clustering.
+  kSquirrel = 3,
+  // Enables error diffusion and full AC strategy heuristics. Equivalent to
+  // "fast" mode.
+  kWombat = 4,
+  // Enables gaborish by default, non-default cmap, initial quant field.
+  kHare = 5,
+  // Enables simple heuristics for AC strategy, quant field and clustering, as
+  // well as coefficient reordering.
+  kCheetah = 6,
+  // Disables most encoder features; still does context clustering.
+  // Modular: fixed tree with the Weighted predictor.
+  kFalcon = 7,
+  // Currently the fastest possible setting for VarDCT.
+  // Modular: fixed tree with the Gradient predictor.
+  kThunder = 8,
+  // VarDCT: same as kThunder.
+  // Modular: no tree, Gradient predictor, fast histograms.
+  kLightning = 9,
+};
+
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct CompressParams {
+  float butteraugli_distance = 1.0f;
+
+  // explicit distances for extra channels (defaults to butteraugli_distance
+  // when not set; value of -1 can be used to represent 'default')
+  std::vector<float> ec_distance;
+  size_t target_size = 0;
+  float target_bitrate = 0.0f;
+
+  // 0.0 means search for the adaptive quantization map that matches the
+  // butteraugli distance, positive values mean quantize everywhere with that
+  // value.
+  float uniform_quant = 0.0f;
+  float quant_border_bias = 0.0f;
+
+  // Try to achieve a maximum pixel-by-pixel error on each channel.
+  bool max_error_mode = false;
+  float max_error[3] = {0.0, 0.0, 0.0};  // One bound per channel.
+
+  SpeedTier speed_tier = SpeedTier::kSquirrel;
+  int brotli_effort = -1;
+
+  // 0 = default.
+  // 1 = slightly worse quality.
+  // 4 = fastest speed, lowest quality
+  size_t decoding_speed_tier = 0;
+
+  int max_butteraugli_iters = 4;
+
+  int max_butteraugli_iters_guetzli_mode = 100;
+
+  ColorTransform color_transform = ColorTransform::kXYB;
+  YCbCrChromaSubsampling chroma_subsampling;
+
+  // If true, the "modular mode options" members below are used.
+  bool modular_mode = false;
+
+  // Change group size in modular mode (0=128, 1=256, 2=512, 3=1024).
+  size_t modular_group_size_shift = 1;
+
+  Override preview = Override::kDefault;
+  Override noise = Override::kDefault;
+  Override dots = Override::kDefault;
+  Override patches = Override::kDefault;
+  Override gaborish = Override::kDefault;
+  int epf = -1;
+
+  // Progressive mode.
+  bool progressive_mode = false;
+
+  // Quantized-progressive mode.
+  bool qprogressive_mode = false;
+
+  // Put center groups first in the bitstream.
+  bool centerfirst = false;
+
+  // Pixel coordinates of the center. First group will contain that center.
+  size_t center_x = static_cast<size_t>(-1);  // Defaults to the max size_t.
+  size_t center_y = static_cast<size_t>(-1);  // Defaults to the max size_t.
+
+  int progressive_dc = -1;
+
+  // If on: preserve color of invisible pixels (if off: don't care)
+  // Default: on for lossless, off for lossy
+  Override keep_invisible = Override::kDefault;
+
+  // Currently unused as of 2020-01.
+  bool clear_metadata = false;
+
+  // Prints extra information during/after encoding.
+  bool verbose = false;
+  bool log_search_state = false;
+
+  ButteraugliParams ba_params;
+
+  // Force usage of CfL when doing JPEG recompression. This can have unexpected
+  // effects on the decoded pixels, while still being JPEG-compliant and
+  // allowing reconstruction of the original JPEG.
+  bool force_cfl_jpeg_recompression = true;
+
+  // Use brotli compression for any boxes derived from a JPEG frame.
+  bool jpeg_compress_boxes = true;
+
+  // Set the noise to what it would approximately be if shooting at the nominal
+  // exposure for a given ISO setting on a 35mm camera.
+  float photon_noise_iso = 0;
+
+  // modular mode options below
+  ModularOptions options;
+  int responsive = -1;
+  // empty for default squeeze
+  std::vector<SqueezeParams> squeezes;
+  int colorspace = -1;
+  // Use Global channel palette if #colors < this percentage of range
+  float channel_colors_pre_transform_percent = 95.f;
+  // Use Local channel palette if #colors < this percentage of range
+  float channel_colors_percent = 80.f;
+  int palette_colors = 1 << 10;  // up to 10-bit palette is probably worthwhile
+  bool lossy_palette = false;
+
+  // True iff modular_mode is set and ModularPartIsLossless() holds.
+  bool IsLossless() const { return modular_mode && ModularPartIsLossless(); }
+  // Returns whether all modular-coded channels are encoded at distance 0.
+  bool ModularPartIsLossless() const {
+    if (modular_mode) {
+      // YCbCr is also considered lossless here since it's intended for
+      // source material that is already YCbCr (we don't do the fwd transform)
+      if (butteraugli_distance != 0 ||
+          color_transform == jxl::ColorTransform::kXYB)
+        return false;
+    }
+    for (float f : ec_distance) {
+      if (f > 0) return false;  // Explicitly lossy extra channel.
+      if (f < 0 && butteraugli_distance != 0) return false;  // Lossy default.
+    }
+    // if no explicit ec_distance given, and using vardct, then the modular part
+    // is empty or not lossless
+    if (!modular_mode && ec_distance.empty()) return false;
+    // all modular channels are encoded at distance 0
+    return true;
+  }
+
+  // Sets the parameters required to make the codec lossless.
+  void SetLossless() {
+    modular_mode = true;
+    butteraugli_distance = 0.0f;
+    for (float &f : ec_distance) f = 0.0f;  // Extra channels lossless as well.
+    color_transform = jxl::ColorTransform::kNone;
+  }
+
+  // Down/upsample the image before encoding / after decoding by this factor.
+  // The resampling value can also be set to <= 0 to automatically choose based
+  // on distance, however EncodeFrame doesn't support this, so it is
+  // required to call PostInit() to set a valid positive resampling
+  // value and altered butteraugli score if this is used.
+  int resampling = -1;
+  int ec_resampling = -1;
+  // Skip the downsampling before encoding if this is true.
+  bool already_downsampled = false;
+  // Butteraugli target distance on the original full size image, this can be
+  // different from butteraugli_distance if resampling was used.
+  float original_butteraugli_distance = -1.0f;
+
+  float quant_ac_rescale = 1.0;
+
+  // Codestream level to conform to.
+  // -1: don't care
+  int level = -1;
+
+  std::vector<float> manual_noise;
+  std::vector<float> manual_xyb_factors;
+};
+
+constexpr float kMinButteraugliForDynamicAR = 0.5f;
+constexpr float kMinButteraugliForDots = 3.0f;
+constexpr float kMinButteraugliToSubtractOriginalPatches = 3.0f;
+
+// Always off.
+constexpr float kMinButteraugliForNoise = 99.0f;
+
+// Smallest butteraugli distance accepted by the encoder.
+constexpr float kMinButteraugliDistance = 0.001f;
+
+// Tile size used for encoder-side processing, and the same size in blocks. In
+// the current implementation it must be equal to the color tile dim.
+constexpr size_t kEncTileDim = 64;
+constexpr size_t kEncTileDimInBlocks = kEncTileDim / kBlockDim;
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
new file mode 100644
index 0000000000..157e18c3a8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
@@ -0,0 +1,813 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_patch_dictionary.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <atomic>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_dot_dictionary.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/patch_dictionary_internal.h"
+
+namespace jxl {
+
+constexpr size_t kPatchFrameReferenceId = 3;  // TODO confirm: patch frame slot?
+
+// static. Tokenizes `pdic` and writes the resulting tokens to `writer`.
+void PatchDictionaryEncoder::Encode(const PatchDictionary& pdic,
+                                    BitWriter* writer, size_t layer,
+                                    AuxOut* aux_out) {
+  JXL_ASSERT(pdic.HasAny());
+  std::vector<std::vector<Token>> tokens(1);
+  size_t num_ec = pdic.shared_->metadata->m.num_extra_channels;
+  // Appends one (context, value) token to the single token stream.
+  auto add_num = [&](int context, size_t num) {
+    tokens[0].emplace_back(context, num);
+  };
+  size_t num_ref_patch = 0;  // Runs of consecutive equal ref_pos_idx.
+  for (size_t i = 0; i < pdic.positions_.size();) {
+    size_t ref_pos_idx = pdic.positions_[i].ref_pos_idx;
+    while (i < pdic.positions_.size() &&
+           pdic.positions_[i].ref_pos_idx == ref_pos_idx) {
+      i++;
+    }
+    num_ref_patch++;
+  }
+  add_num(kNumRefPatchContext, num_ref_patch);
+  size_t blend_pos = 0;  // Running index into pdic.blendings_.
+  for (size_t i = 0; i < pdic.positions_.size();) {
+    size_t i_start = i;
+    size_t ref_pos_idx = pdic.positions_[i].ref_pos_idx;
+    const auto& ref_pos = pdic.ref_positions_[ref_pos_idx];
+    while (i < pdic.positions_.size() &&
+           pdic.positions_[i].ref_pos_idx == ref_pos_idx) {
+      i++;
+    }
+    size_t num = i - i_start;  // Positions [i_start, i) share ref_pos.
+    JXL_ASSERT(num > 0);
+    add_num(kReferenceFrameContext, ref_pos.ref);
+    add_num(kPatchReferencePositionContext, ref_pos.x0);
+    add_num(kPatchReferencePositionContext, ref_pos.y0);
+    add_num(kPatchSizeContext, ref_pos.xsize - 1);
+    add_num(kPatchSizeContext, ref_pos.ysize - 1);
+    add_num(kPatchCountContext, num - 1);
+    for (size_t j = i_start; j < i; j++) {
+      const PatchPosition& pos = pdic.positions_[j];
+      if (j == i_start) {  // First position of the run: absolute coordinates.
+        add_num(kPatchPositionContext, pos.x);
+        add_num(kPatchPositionContext, pos.y);
+      } else {  // Later positions: signed delta from the previous position.
+        add_num(kPatchOffsetContext,
+                PackSigned(pos.x - pdic.positions_[j - 1].x));
+        add_num(kPatchOffsetContext,
+                PackSigned(pos.y - pdic.positions_[j - 1].y));
+      }
+      for (size_t ec = 0; ec < num_ec + 1; ++ec, ++blend_pos) {
+        const PatchBlending& info = pdic.blendings_[blend_pos];
+        add_num(kPatchBlendModeContext, static_cast<uint32_t>(info.mode));
+        if (UsesAlpha(info.mode) &&
+            pdic.shared_->metadata->m.extra_channel_info.size() > 1) {
+          add_num(kPatchAlphaChannelContext, info.alpha_channel);
+        }
+        if (UsesClamp(info.mode)) {
+          add_num(kPatchClampContext, info.clamp);
+        }
+      }
+    }
+  }
+
+  EntropyEncodingData codes;
+  std::vector<uint8_t> context_map;
+  BuildAndEncodeHistograms(HistogramParams(), kNumPatchDictionaryContexts,
+                           tokens, &codes, &context_map, writer, layer,
+                           aux_out);
+  WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out);
+}
+
+// static. Subtracts kAdd patches from `opsin` and zeroes kReplace patches.
+void PatchDictionaryEncoder::SubtractFrom(const PatchDictionary& pdic,
+                                          Image3F* opsin) {
+  const size_t blendings_per_patch =
+      pdic.shared_->metadata->m.num_extra_channels + 1;
+  // TODO(veluca): this can likely be optimized knowing it runs on full images.
+  for (size_t y = 0; y < opsin->ysize(); y++) {
+    float* JXL_RESTRICT rows[3] = {
+        opsin->PlaneRow(0, y),
+        opsin->PlaneRow(1, y),
+        opsin->PlaneRow(2, y),
+    };
+    for (size_t pos_idx : pdic.GetPatchesForRow(y)) {
+      const PatchPosition& pos = pdic.positions_[pos_idx];
+      const PatchReferencePosition& ref_pos =
+          pdic.ref_positions_[pos.ref_pos_idx];
+      // The mode is taken from the first blending entry of this position.
+      const PatchBlendMode mode =
+          pdic.blendings_[pos_idx * blendings_per_patch].mode;
+      const size_t by = pos.y;
+      const size_t bx = pos.x;
+      const size_t xsize = ref_pos.xsize;
+      JXL_DASSERT(y >= by);
+      JXL_DASSERT(y < by + ref_pos.ysize);
+      const size_t ref_y = ref_pos.y0 + (y - by);  // Row inside the reference.
+      const auto& ref_color =
+          pdic.shared_->reference_frames[ref_pos.ref].frame.color();
+      const float* JXL_RESTRICT ref_rows[3] = {
+          ref_color.ConstPlaneRow(0, ref_y) + ref_pos.x0,
+          ref_color.ConstPlaneRow(1, ref_y) + ref_pos.x0,
+          ref_color.ConstPlaneRow(2, ref_y) + ref_pos.x0,
+      };
+      for (size_t ix = 0; ix < xsize; ix++) {
+        for (size_t c = 0; c < 3; c++) {
+          switch (mode) {
+            case PatchBlendMode::kAdd:
+              rows[c][bx + ix] -= ref_rows[c][ix];
+              break;
+            case PatchBlendMode::kReplace:
+              rows[c][bx + ix] = 0;
+              break;
+            case PatchBlendMode::kNone:
+              break;  // Nothing to do.
+            default:
+              JXL_ABORT("Blending mode %u not yet implemented", (uint32_t)mode);
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace {
+
+struct PatchColorspaceInfo {
+  float kChannelDequant[3];
+  float kChannelWeights[3];
+  // Selects per-channel quantization steps and distance weights (XYB or not).
+  explicit PatchColorspaceInfo(bool is_xyb) {
+    if (is_xyb) {
+      kChannelDequant[0] = 0.01615;
+      kChannelDequant[1] = 0.08875;
+      kChannelDequant[2] = 0.1922;
+      kChannelWeights[0] = 30.0;
+      kChannelWeights[1] = 3.0;
+      kChannelWeights[2] = 1.0;
+    } else {
+      kChannelDequant[0] = 20.0f / 255;
+      kChannelDequant[1] = 22.0f / 255;
+      kChannelDequant[2] = 20.0f / 255;
+      kChannelWeights[0] = 0.017 * 255;
+      kChannelWeights[1] = 0.02 * 255;
+      kChannelWeights[2] = 0.017 * 255;
+    }
+  }
+  // Returns `val` in units of channel c's quantization step.
+  float ScaleForQuantization(float val, size_t c) const {
+    return val / kChannelDequant[c];
+  }
+  // Quantizes `val` for channel c, rounding towards zero.
+  int Quantize(float val, size_t c) const {
+    return static_cast<int>(truncf(ScaleForQuantization(val, c)));
+  }
+  bool is_similar_v(const float v1[3], const float v2[3],
+                    float threshold) const {
+    float distance = 0;  // Channel-weighted L1 distance between v1 and v2.
+    for (size_t c = 0; c < 3; c++) {
+      distance += std::fabs(v1[c] - v2[c]) * kChannelWeights[c];
+    }
+    return distance <= threshold;
+  }
+};
+// Finds repeated, text-like patches around uniform ("screenshot-like") areas.
+std::vector<PatchInfo> FindTextLikePatches(
+    const Image3F& opsin, const PassesEncoderState* JXL_RESTRICT state,
+    ThreadPool* pool, AuxOut* aux_out, bool is_xyb) {
+  if (state->cparams.patches == Override::kOff) return {};
+
+  PatchColorspaceInfo pci(is_xyb);
+  float kSimilarThreshold = 0.8f;
+  // Compares two (x, y) pixels of a 3-plane image via pci.is_similar_v().
+  auto is_similar_impl = [&pci](std::pair<uint32_t, uint32_t> p1,
+                                std::pair<uint32_t, uint32_t> p2,
+                                const float* JXL_RESTRICT rows[3],
+                                size_t stride, float threshold) {
+    float v1[3], v2[3];
+    for (size_t c = 0; c < 3; c++) {
+      v1[c] = rows[c][p1.second * stride + p1.first];
+      v2[c] = rows[c][p2.second * stride + p2.first];
+    }
+    return pci.is_similar_v(v1, v2, threshold);
+  };
+
+  std::atomic<bool> has_screenshot_areas{false};
+  const size_t opsin_stride = opsin.PixelsPerRow();
+  const float* JXL_RESTRICT opsin_rows[3] = {opsin.ConstPlaneRow(0, 0),
+                                             opsin.ConstPlaneRow(1, 0),
+                                             opsin.ConstPlaneRow(2, 0)};
+  // True if every channel of the two opsin pixels differs by at most 1e-4.
+  auto is_same = [&opsin_rows, opsin_stride](std::pair<uint32_t, uint32_t> p1,
+                                             std::pair<uint32_t, uint32_t> p2) {
+    for (size_t c = 0; c < 3; c++) {
+      float v1 = opsin_rows[c][p1.second * opsin_stride + p1.first];
+      float v2 = opsin_rows[c][p2.second * opsin_stride + p2.first];
+      if (std::fabs(v1 - v2) > 1e-4) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  auto is_similar = [&](std::pair<uint32_t, uint32_t> p1,
+                        std::pair<uint32_t, uint32_t> p2) {
+    return is_similar_impl(p1, p2, opsin_rows, opsin_stride, kSimilarThreshold);
+  };
+
+  constexpr int64_t kPatchSide = 4;
+  constexpr int64_t kExtraSide = 4;
+
+  // Look for kPatchSide size squares, naturally aligned, that all have the same
+  // pixel values.
+  ImageB is_screenshot_like(DivCeil(opsin.xsize(), kPatchSide),
+                            DivCeil(opsin.ysize(), kPatchSide));
+  ZeroFillImage(&is_screenshot_like);
+  uint8_t* JXL_RESTRICT screenshot_row = is_screenshot_like.Row(0);
+  const size_t screenshot_stride = is_screenshot_like.PixelsPerRow();
+  const auto process_row = [&](const uint32_t y, size_t /* thread */) {
+    for (uint64_t x = 0; x < opsin.xsize() / kPatchSide; x++) {
+      bool all_same = true;
+      // Leave both loops at the first pixel differing from the top-left one.
+      for (size_t iy = 0; all_same && iy < static_cast<size_t>(kPatchSide);
+           iy++) {
+        for (size_t ix = 0; all_same && ix < static_cast<size_t>(kPatchSide);
+             ix++) {
+          size_t cx = x * kPatchSide + ix;
+          size_t cy = y * kPatchSide + iy;
+          all_same = is_same({cx, cy}, {x * kPatchSide, y * kPatchSide});
+        }
+      }
+      if (!all_same) continue;
+      size_t num = 0;
+      size_t num_same = 0;
+      for (int64_t iy = -kExtraSide; iy < kExtraSide + kPatchSide; iy++) {
+        for (int64_t ix = -kExtraSide; ix < kExtraSide + kPatchSide; ix++) {
+          int64_t cx = x * kPatchSide + ix;
+          int64_t cy = y * kPatchSide + iy;
+          if (cx < 0 || static_cast<uint64_t>(cx) >= opsin.xsize() ||  //
+              cy < 0 || static_cast<uint64_t>(cy) >= opsin.ysize()) {
+            continue;
+          }
+          num++;
+          if (is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) num_same++;
+        }
+      }
+      // Too few equal pixels nearby (fewer than 7/8 of the in-image window).
+      if (num_same * 8 < num * 7) continue;
+      screenshot_row[y * screenshot_stride + x] = 1;
+      has_screenshot_areas = true;
+    }
+  };
+  JXL_CHECK(RunOnPool(pool, 0, opsin.ysize() / kPatchSide, ThreadPool::NoInit,
+                      process_row, "IsScreenshotLike"));
+
+  // TODO(veluca): also parallelize the rest of this function.
+  if (WantDebugOutput(aux_out)) {
+    aux_out->DumpPlaneNormalized("screenshot_like", is_screenshot_like);
+  }
+
+  constexpr int kSearchRadius = 1;
+
+  if (!ApplyOverride(state->cparams.patches, has_screenshot_areas)) {
+    return {};
+  }
+
+  // Search for "similar enough" pixels near the screenshot-like areas.
+  ImageB is_background(opsin.xsize(), opsin.ysize());
+  ZeroFillImage(&is_background);
+  Image3F background(opsin.xsize(), opsin.ysize());
+  ZeroFillImage(&background);
+  constexpr size_t kDistanceLimit = 50;
+  float* JXL_RESTRICT background_rows[3] = {
+      background.PlaneRow(0, 0),
+      background.PlaneRow(1, 0),
+      background.PlaneRow(2, 0),
+  };
+  const size_t background_stride = background.PixelsPerRow();
+  uint8_t* JXL_RESTRICT is_background_row = is_background.Row(0);
+  const size_t is_background_stride = is_background.PixelsPerRow();
+  std::vector<
+      std::pair<std::pair<uint32_t, uint32_t>, std::pair<uint32_t, uint32_t>>>
+      queue;  // (pixel, seed pixel) entries, consumed from queue_front.
+  size_t queue_front = 0;
+  for (size_t y = 0; y < opsin.ysize(); y++) {
+    for (size_t x = 0; x < opsin.xsize(); x++) {
+      if (!screenshot_row[screenshot_stride * (y / kPatchSide) +
+                          (x / kPatchSide)])
+        continue;
+      queue.push_back({{x, y}, {x, y}});
+    }
+  }
+  while (queue.size() != queue_front) {
+    std::pair<uint32_t, uint32_t> cur = queue[queue_front].first;
+    std::pair<uint32_t, uint32_t> src = queue[queue_front].second;
+    queue_front++;
+    if (is_background_row[cur.second * is_background_stride + cur.first])
+      continue;
+    is_background_row[cur.second * is_background_stride + cur.first] = 1;
+    for (size_t c = 0; c < 3; c++) {
+      background_rows[c][cur.second * background_stride + cur.first] =
+          opsin_rows[c][src.second * opsin_stride + src.first];
+    }
+    for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) {
+      for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) {
+        if (dx == 0 && dy == 0) continue;
+        int next_first = static_cast<int32_t>(cur.first) + dx;
+        int next_second = static_cast<int32_t>(cur.second) + dy;
+        if (next_first < 0 || next_second < 0 ||
+            static_cast<uint32_t>(next_first) >= opsin.xsize() ||
+            static_cast<uint32_t>(next_second) >= opsin.ysize()) {
+          continue;
+        }
+        if (static_cast<uint32_t>(
+                std::abs(next_first - static_cast<int>(src.first)) +
+                std::abs(next_second - static_cast<int>(src.second))) >
+            kDistanceLimit) {
+          continue;  // More than kDistanceLimit (L1 distance) from the seed.
+        }
+        std::pair<uint32_t, uint32_t> next{next_first, next_second};
+        if (is_similar(src, next)) {
+          if (!screenshot_row[next.second / kPatchSide * screenshot_stride +
+                              next.first / kPatchSide] ||
+              is_same(src, next)) {
+            if (!is_background_row[next.second * is_background_stride +
+                                   next.first])
+              queue.emplace_back(next, src);
+          }
+        }
+      }
+    }
+  }
+  queue.clear();
+
+  ImageF ccs;
+  Rng rng(0);
+  bool paint_ccs = false;
+  if (WantDebugOutput(aux_out)) {
+    aux_out->DumpPlaneNormalized("is_background", is_background);
+    if (is_xyb) {
+      aux_out->DumpXybImage("background", background);
+    } else {
+      aux_out->DumpImage("background", background);
+    }
+    ccs = ImageF(opsin.xsize(), opsin.ysize());
+    ZeroFillImage(&ccs);
+    paint_ccs = true;
+  }
+
+  constexpr float kVerySimilarThreshold = 0.03f;
+  constexpr float kHasSimilarThreshold = 0.03f;
+
+  const float* JXL_RESTRICT const_background_rows[3] = {
+      background_rows[0], background_rows[1], background_rows[2]};
+  auto is_similar_b = [&](std::pair<int, int> p1, std::pair<int, int> p2) {
+    return is_similar_impl(p1, p2, const_background_rows, background_stride,
+                           kVerySimilarThreshold);
+  };
+
+  constexpr int kMinPeak = 2;
+  constexpr int kHasSimilarRadius = 2;
+
+  std::vector<PatchInfo> info;
+
+  // Find small CC outside the "similar enough" areas, compute bounding boxes,
+  // and run heuristics to exclude some patches.
+  ImageB visited(opsin.xsize(), opsin.ysize());
+  ZeroFillImage(&visited);
+  uint8_t* JXL_RESTRICT visited_row = visited.Row(0);
+  const size_t visited_stride = visited.PixelsPerRow();
+  std::vector<std::pair<uint32_t, uint32_t>> cc;
+  std::vector<std::pair<uint32_t, uint32_t>> stack;
+  for (size_t y = 0; y < opsin.ysize(); y++) {
+    for (size_t x = 0; x < opsin.xsize(); x++) {
+      if (is_background_row[y * is_background_stride + x]) continue;
+      cc.clear();
+      stack.clear();
+      stack.emplace_back(x, y);
+      size_t min_x = x;
+      size_t max_x = x;
+      size_t min_y = y;
+      size_t max_y = y;
+      std::pair<uint32_t, uint32_t> reference;  // First bordering background px.
+      bool found_border = false;
+      bool all_similar = true;
+      while (!stack.empty()) {
+        std::pair<uint32_t, uint32_t> cur = stack.back();
+        stack.pop_back();
+        if (visited_row[cur.second * visited_stride + cur.first]) continue;
+        visited_row[cur.second * visited_stride + cur.first] = 1;
+        if (cur.first < min_x) min_x = cur.first;
+        if (cur.first > max_x) max_x = cur.first;
+        if (cur.second < min_y) min_y = cur.second;
+        if (cur.second > max_y) max_y = cur.second;
+        if (paint_ccs) {
+          cc.push_back(cur);
+        }
+        for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) {
+          for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) {
+            if (dx == 0 && dy == 0) continue;
+            int next_first = static_cast<int32_t>(cur.first) + dx;
+            int next_second = static_cast<int32_t>(cur.second) + dy;
+            if (next_first < 0 || next_second < 0 ||
+                static_cast<uint32_t>(next_first) >= opsin.xsize() ||
+                static_cast<uint32_t>(next_second) >= opsin.ysize()) {
+              continue;
+            }
+            std::pair<uint32_t, uint32_t> next{next_first, next_second};
+            if (!is_background_row[next.second * is_background_stride +
+                                   next.first]) {
+              stack.push_back(next);
+            } else {
+              if (!found_border) {
+                reference = next;
+                found_border = true;
+              } else {
+                if (!is_similar_b(next, reference)) all_similar = false;
+              }
+            }
+          }
+        }
+      }
+      if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize ||
+          max_y - min_y >= kMaxPatchSize) {
+        continue;
+      }
+      size_t bpos = background_stride * reference.second + reference.first;
+      float ref[3] = {background_rows[0][bpos], background_rows[1][bpos],
+                      background_rows[2][bpos]};
+      bool has_similar = false;
+      for (size_t iy = std::max<int>(
+               static_cast<int32_t>(min_y) - kHasSimilarRadius, 0);
+           iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) {
+        for (size_t ix = std::max<int>(
+                 static_cast<int32_t>(min_x) - kHasSimilarRadius, 0);
+             ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize());
+             ix++) {
+          size_t opos = opsin_stride * iy + ix;
+          float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos],
+                         opsin_rows[2][opos]};
+          if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) {
+            has_similar = true;
+          }
+        }
+      }
+      if (!has_similar) continue;
+      info.emplace_back();
+      info.back().second.emplace_back(min_x, min_y);
+      QuantizedPatch& patch = info.back().first;
+      patch.xsize = max_x - min_x + 1;
+      patch.ysize = max_y - min_y + 1;
+      int max_value = 0;  // Largest quantized magnitude over all channels.
+      for (size_t c : {1, 0, 2}) {
+        for (size_t iy = min_y; iy <= max_y; iy++) {
+          for (size_t ix = min_x; ix <= max_x; ix++) {
+            size_t offset = (iy - min_y) * patch.xsize + ix - min_x;
+            patch.fpixels[c][offset] =
+                opsin_rows[c][iy * opsin_stride + ix] - ref[c];
+            int val = pci.Quantize(patch.fpixels[c][offset], c);
+            patch.pixels[c][offset] = val;
+            if (std::abs(val) > max_value) max_value = std::abs(val);
+          }
+        }
+      }
+      if (max_value < kMinPeak) {
+        info.pop_back();
+        continue;
+      }
+      if (paint_ccs) {
+        float cc_color = rng.UniformF(0.5, 1.0);
+        for (std::pair<uint32_t, uint32_t> p : cc) {
+          ccs.Row(p.second)[p.first] = cc_color;
+        }
+      }
+    }
+  }
+
+  if (paint_ccs) {
+    JXL_ASSERT(WantDebugOutput(aux_out));
+    aux_out->DumpPlaneNormalized("ccs", ccs);
+  }
+  if (info.empty()) {
+    return {};
+  }
+
+  // Remove duplicates.
+  constexpr size_t kMinPatchOccurrences = 2;
+  std::sort(info.begin(), info.end());
+  size_t unique = 0;  // Slot of the patch currently collecting positions.
+  for (size_t i = 1; i < info.size(); i++) {
+    if (info[i].first == info[unique].first) {
+      info[unique].second.insert(info[unique].second.end(),
+                                 info[i].second.begin(), info[i].second.end());
+    } else {
+      if (info[unique].second.size() >= kMinPatchOccurrences) {
+        unique++;
+      }
+      info[unique] = info[i];
+    }
+  }
+  if (info[unique].second.size() >= kMinPatchOccurrences) {
+    unique++;
+  }
+  info.resize(unique);
+
+  size_t max_patch_size = 0;
+
+  for (size_t i = 0; i < info.size(); i++) {
+    size_t pixels = info[i].first.xsize * info[i].first.ysize;
+    if (pixels > max_patch_size) max_patch_size = pixels;
+  }
+
+  // don't use patches if all patches are smaller than this
+  constexpr size_t kMinMaxPatchSize = 20;
+  if (max_patch_size < kMinMaxPatchSize) return {};
+
+  return info;
+}
+
+}  // namespace
+
+void FindBestPatchDictionary(const Image3F& opsin,
+                             PassesEncoderState* JXL_RESTRICT state,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             AuxOut* aux_out, bool is_xyb) {
+  std::vector<PatchInfo> info =
+      FindTextLikePatches(opsin, state, pool, aux_out, is_xyb);
+
+  // TODO(veluca): this doesn't work if both dots and patches are enabled.
+  // For now, since dots and patches are not likely to occur in the same kind of
+  // images, disable dots if some patches were found.
+  if (info.empty() &&
+      ApplyOverride(
+          state->cparams.dots,
+          state->cparams.speed_tier <= SpeedTier::kSquirrel &&
+              state->cparams.butteraugli_distance >= kMinButteraugliForDots)) {
+    info = FindDotDictionary(state->cparams, opsin, state->shared.cmap, pool);
+  }
+
+  if (info.empty()) return;  // No candidates: the dictionary is not touched.
+
+  std::sort(
+      info.begin(), info.end(), [&](const PatchInfo& a, const PatchInfo& b) {
+        return a.first.xsize * a.first.ysize > b.first.xsize * b.first.ysize;
+      });  // Largest-area patches first.
+
+  size_t max_x_size = 0;
+  size_t max_y_size = 0;
+  size_t total_pixels = 0;  // Sum of the areas of all patches.
+
+  for (size_t i = 0; i < info.size(); i++) {
+    size_t pixels = info[i].first.xsize * info[i].first.ysize;
+    if (max_x_size < info[i].first.xsize) max_x_size = info[i].first.xsize;
+    if (max_y_size < info[i].first.ysize) max_y_size = info[i].first.ysize;
+    total_pixels += pixels;
+  }
+
+  // Greedy first-fit packing of all patches into a single reference frame.
+  constexpr float kBinPackingSlackness = 1.05f;
+  size_t ref_xsize = std::max<float>(max_x_size, std::sqrt(total_pixels));
+  size_t ref_ysize = std::max<float>(max_y_size, std::sqrt(total_pixels));
+  std::vector<std::pair<size_t, size_t>> ref_positions(info.size());
+  // TODO(veluca): allow partial overlaps of patches that have the same pixels.
+  size_t max_y = 0;
+  do {
+    max_y = 0;
+    // Grow the candidate frame by ~5% (+1) in each dimension per attempt.
+    ref_xsize = ref_xsize * kBinPackingSlackness + 1;
+    ref_ysize = ref_ysize * kBinPackingSlackness + 1;
+
+    ImageB occupied(ref_xsize, ref_ysize);
+    ZeroFillImage(&occupied);
+    uint8_t* JXL_RESTRICT occupied_rows = occupied.Row(0);
+    size_t occupied_stride = occupied.PixelsPerRow();
+
+    bool success = true;
+    // For every patch...
+    for (size_t patch = 0; patch < info.size(); patch++) {
+      size_t x0 = 0;
+      size_t y0 = 0;
+      size_t xsize = info[patch].first.xsize;
+      size_t ysize = info[patch].first.ysize;
+      bool found = false;
+      // For every possible start position ...
+      for (; y0 + ysize <= ref_ysize; y0++) {
+        x0 = 0;
+        for (; x0 + xsize <= ref_xsize; x0++) {
+          bool has_occupied_pixel = false;
+          size_t x = x0;
+          // Check whether the patch fits here; stop at the first occupied
+          // pixel so that `x` is the column of that blocking pixel.
+          for (size_t y = y0; !has_occupied_pixel && y < y0 + ysize; y++) {
+            x = x0;
+            for (; x < x0 + xsize; x++) {
+              if (occupied_rows[y * occupied_stride + x]) {
+                has_occupied_pixel = true;
+                break;
+              }
+            }
+          }  // end of positioning check
+          if (!has_occupied_pixel) {
+            found = true;
+            break;
+          }
+          x0 = x;  // Skip past the blocker; every x0 up to it overlaps it.
+        }
+        if (found) break;
+      }  // end of start position checking
+
+      // We didn't find a possible position: repeat from the beginning with a
+      // larger reference frame size.
+      if (!found) {
+        success = false;
+        break;
+      }
+
+      // We found a position: mark the corresponding positions in the reference
+      // image as used.
+      ref_positions[patch] = {x0, y0};
+      for (size_t y = y0; y < y0 + ysize; y++) {
+        for (size_t x = x0; x < x0 + xsize; x++) {
+          occupied_rows[y * occupied_stride + x] = true;
+        }
+      }
+      max_y = std::max(max_y, y0 + ysize);
+    }
+
+    if (success) break;
+  } while (true);
+
+  JXL_ASSERT(ref_ysize >= max_y);
+
+  ref_ysize = max_y;  // Crop the rows below the lowest placed patch.
+
+  Image3F reference_frame(ref_xsize, ref_ysize);
+  // TODO(veluca): figure out a better way to fill the image.
+  ZeroFillImage(&reference_frame);
+  std::vector<PatchPosition> positions;
+  std::vector<PatchReferencePosition> pref_positions;
+  std::vector<PatchBlending> blendings;
+  float* JXL_RESTRICT ref_rows[3] = {
+      reference_frame.PlaneRow(0, 0),
+      reference_frame.PlaneRow(1, 0),
+      reference_frame.PlaneRow(2, 0),
+  };
+  size_t ref_stride = reference_frame.PixelsPerRow();
+  size_t num_ec = state->shared.metadata->m.num_extra_channels;
+
+  for (size_t i = 0; i < info.size(); i++) {
+    PatchReferencePosition ref_pos;
+    ref_pos.xsize = info[i].first.xsize;
+    ref_pos.ysize = info[i].first.ysize;
+    ref_pos.x0 = ref_positions[i].first;
+    ref_pos.y0 = ref_positions[i].second;
+    ref_pos.ref = kPatchFrameReferenceId;
+    for (size_t y = 0; y < ref_pos.ysize; y++) {  // Copy the float pixels.
+      for (size_t x = 0; x < ref_pos.xsize; x++) {
+        for (size_t c = 0; c < 3; c++) {
+          ref_rows[c][(y + ref_pos.y0) * ref_stride + x + ref_pos.x0] =
+              info[i].first.fpixels[c][y * ref_pos.xsize + x];
+        }
+      }
+    }
+    for (const auto& pos : info[i].second) {  // One entry per occurrence.
+      positions.emplace_back(
+          PatchPosition{pos.first, pos.second, pref_positions.size()});
+      // Add blending for color channels, ignore other channels.
+      blendings.push_back({PatchBlendMode::kAdd, 0, false});
+      for (size_t j = 0; j < num_ec; ++j) {
+        blendings.push_back({PatchBlendMode::kNone, 0, false});
+      }
+    }
+    pref_positions.emplace_back(std::move(ref_pos));
+  }
+
+  CompressParams cparams = state->cparams;
+  // Recursive application of patches could create very weird issues.
+  cparams.patches = Override::kOff;
+
+  RoundtripPatchFrame(&reference_frame, state, kPatchFrameReferenceId, cparams,
+                      cms, pool, aux_out, /*subtract=*/true);
+
+  // TODO(veluca): this assumes that applying patches is commutative, which is
+  // not true for all blending modes. This code only produces kAdd patches, so
+  // this works out.
+  PatchDictionaryEncoder::SetPositions(
+      &state->shared.image_features.patches, std::move(positions),
+      std::move(pref_positions), std::move(blendings));
+}
+
+void RoundtripPatchFrame(Image3F* reference_frame,
+                         PassesEncoderState* JXL_RESTRICT state, int idx,
+                         CompressParams& cparams, const JxlCmsInterface& cms,
+                         ThreadPool* pool, AuxOut* aux_out, bool subtract) {
+  FrameInfo patch_frame_info;
+  cparams.resampling = 1;  // NB: these writes modify the caller's cparams.
+  cparams.ec_resampling = 1;
+  cparams.dots = Override::kOff;
+  cparams.noise = Override::kOff;
+  cparams.modular_mode = true;
+  cparams.responsive = 0;
+  cparams.progressive_dc = 0;
+  cparams.progressive_mode = false;
+  cparams.qprogressive_mode = false;
+  // Use gradient predictor and not Predictor::Best.
+  cparams.options.predictor = Predictor::Gradient;
+  patch_frame_info.save_as_reference = idx;  // always saved.
+  patch_frame_info.frame_type = FrameType::kReferenceOnly;
+  patch_frame_info.save_before_color_transform = true;
+  ImageBundle ib(&state->shared.metadata->m);
+  // TODO(veluca): metadata.color_encoding is a lie: ib is in XYB, but there is
+  // no simple way to express that yet.
+  patch_frame_info.ib_needs_color_transform = false;
+  ib.SetFromImage(std::move(*reference_frame),
+                  state->shared.metadata->m.color_encoding);
+  if (!ib.metadata()->extra_channel_info.empty()) {
+    // Add dummy extra channels to the patch image: patch encoding does not yet
+    // support extra channels, but the codec expects that the amount of extra
+    // channels in frames matches that in the metadata of the codestream.
+    std::vector<ImageF> extra_channels;
+    extra_channels.reserve(ib.metadata()->extra_channel_info.size());
+    for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) {
+      extra_channels.emplace_back(ib.xsize(), ib.ysize());
+      // Must initialize the image with data to not affect blending with
+      // uninitialized memory.
+      // TODO(lode): patches must copy and use the real extra channels instead.
+      ZeroFillImage(&extra_channels.back());
+    }
+    ib.SetExtraChannels(std::move(extra_channels));
+  }
+  PassesEncoderState roundtrip_state;  // Local state used for this frame only.
+  auto special_frame = std::unique_ptr<BitWriter>(new BitWriter());
+  AuxOut patch_aux_out;
+  JXL_CHECK(EncodeFrame(cparams, patch_frame_info, state->shared.metadata, ib,
+                        &roundtrip_state, cms, pool, special_frame.get(),
+                        aux_out ? &patch_aux_out : nullptr));
+  if (aux_out) {  // Fold all layers of this frame into kLayerDictionary.
+    for (const auto& l : patch_aux_out.layers) {
+      aux_out->layers[kLayerDictionary].Assimilate(l);
+    }
+  }
+  const Span<const uint8_t> encoded = special_frame->GetSpan();
+  state->special_frames.emplace_back(std::move(special_frame));
+  if (subtract) {  // Store the frame as the decoder reconstructs it.
+    ImageBundle decoded(&state->shared.metadata->m);
+    PassesDecoderState dec_state;
+    JXL_CHECK(dec_state.output_encoding_info.SetFromMetadata(
+        *state->shared.metadata));
+    const uint8_t* frame_start = encoded.data();
+    size_t encoded_size = encoded.size();
+    JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size, &decoded,
+                          *state->shared.metadata));
+    frame_start += decoded.decoded_bytes();
+    encoded_size -= decoded.decoded_bytes();
+    size_t ref_xsize =
+        dec_state.shared_storage.reference_frames[idx].frame.color()->xsize();
+    // if the frame itself uses patches, we need to decode another frame
+    if (!ref_xsize) {
+      JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size,
+                            &decoded, *state->shared.metadata));
+    }
+    JXL_CHECK(encoded_size == 0);  // NOTE(review): stale after 2nd decode?
+    state->shared.reference_frames[idx] =
+        std::move(dec_state.shared_storage.reference_frames[idx]);
+  } else {  // Store the input image itself as the reference.
+    state->shared.reference_frames[idx].frame = std::move(ib);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h
new file mode 100644
index 0000000000..f30881b232
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h
@@ -0,0 +1,109 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_PATCH_DICTIONARY_H_
+#define LIB_JXL_ENC_PATCH_DICTIONARY_H_
+
+// Chooses reference patches, and avoids encoding them once per occurrence.
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <tuple>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+constexpr size_t kMaxPatchSize = 32;
+
+struct QuantizedPatch {
+  size_t xsize = 0;  // Zero until set, so comparisons never read garbage.
+  size_t ysize = 0;
+  QuantizedPatch() {  // Sizes every channel for kMaxPatchSize^2 samples.
+    for (size_t i = 0; i < 3; i++) {
+      pixels[i].resize(kMaxPatchSize * kMaxPatchSize);
+      fpixels[i].resize(kMaxPatchSize * kMaxPatchSize);
+    }
+  }
+  std::vector<int8_t> pixels[3] = {};  // Quantized samples; the compared key.
+  // Not compared. Used only to retrieve original pixels to construct the
+  // reference image.
+  std::vector<float> fpixels[3] = {};
+  bool operator==(const QuantizedPatch& other) const {
+    if (xsize != other.xsize) return false;
+    if (ysize != other.ysize) return false;
+    for (size_t c = 0; c < 3; c++) {  // Only the first xsize * ysize samples.
+      if (memcmp(pixels[c].data(), other.pixels[c].data(),
+                 sizeof(int8_t) * xsize * ysize) != 0)
+        return false;
+    }
+    return true;
+  }
+
+  bool operator<(const QuantizedPatch& other) const {  // xsize, ysize, bytes.
+    if (xsize != other.xsize) return xsize < other.xsize;
+    if (ysize != other.ysize) return ysize < other.ysize;
+    for (size_t c = 0; c < 3; c++) {
+      int cmp = memcmp(pixels[c].data(), other.pixels[c].data(),
+                       sizeof(int8_t) * xsize * ysize);
+      if (cmp > 0) return false;
+      if (cmp < 0) return true;
+    }
+    return false;
+  }
+};
+
+// Pair (patch, vector of occurrences).
+using PatchInfo =
+    std::pair<QuantizedPatch, std::vector<std::pair<uint32_t, uint32_t>>>;
+
+// Encoder-side helpers; friend of PatchDictionary (writes its private fields).
+class PatchDictionaryEncoder {
+ public:
+  // Only call if HasAny().
+  static void Encode(const PatchDictionary& pdic, BitWriter* writer,
+                     size_t layer, AuxOut* aux_out);
+
+  static void SetPositions(PatchDictionary* pdic,
+                           std::vector<PatchPosition> positions,
+                           std::vector<PatchReferencePosition> ref_positions,
+                           std::vector<PatchBlending> blendings) {
+    pdic->positions_ = std::move(positions);  // Replaces all three lists.
+    pdic->ref_positions_ = std::move(ref_positions);
+    pdic->blendings_ = std::move(blendings);
+    pdic->ComputePatchTree();  // presumably rebuilds derived lookup data
+  }
+
+  static void SubtractFrom(const PatchDictionary& pdic, Image3F* opsin);
+};
+
+void FindBestPatchDictionary(const Image3F& opsin,
+                             PassesEncoderState* JXL_RESTRICT state,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             AuxOut* aux_out, bool is_xyb = true);
+
+void RoundtripPatchFrame(Image3F* reference_frame,
+                         PassesEncoderState* JXL_RESTRICT state, int idx,
+                         CompressParams& cparams, const JxlCmsInterface& cms,
+                         ThreadPool* pool, AuxOut* aux_out, bool subtract);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_PATCH_DICTIONARY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc
new file mode 100644
index 0000000000..3786ef5cf5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc
@@ -0,0 +1,89 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_photon_noise.h"
+
+namespace jxl {
+
+namespace {
+
+// Assumes a daylight-like spectrum.
+// https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s
+constexpr float kPhotonsPerLxSPerUm2 = 11260;
+
+// Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into
+// account.
+constexpr float kEffectiveQuantumEfficiency = 0.20;
+
+// TODO(sboukortt): reevaluate whether these are good defaults, notably whether
+// it would be worth making read noise higher at lower ISO settings.
+constexpr float kPhotoResponseNonUniformity = 0.005;  // Fraction of signal.
+constexpr float kInputReferredReadNoise = 3;  // Electrons rms.
+
+// Area of a 36x24mm ("35mm format") sensor, in square micrometres.
+constexpr float kSensorAreaUm2 = 36000.f * 24000;
+
+template <typename T>
+inline constexpr T Square(const T x) {
+  return x * x;
+}
+template <typename T>
+inline constexpr T Cube(const T x) {
+  return x * x * x;
+}
+
+}  // namespace
+
+NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize,
+                                const float iso) {
+  const float kOpsinAbsorbanceBiasCbrt = std::cbrt(kOpsinAbsorbanceBias[1]);
+
+  // Focal plane exposure for 18% of kDefaultIntensityTarget, in lx·s.
+  // (ISO = 10 lx·s ÷ H)
+  const float h_18 = 10 / iso;
+
+  const float pixel_area_um2 = kSensorAreaUm2 / (xsize * ysize);
+
+  const float electrons_per_pixel_18 = kEffectiveQuantumEfficiency *
+                                       kPhotonsPerLxSPerUm2 * h_18 *
+                                       pixel_area_um2;
+
+  NoiseParams params;  // lut[i] is filled per noise point below.
+
+  for (size_t i = 0; i < NoiseParams::kNumNoisePoints; ++i) {
+    const float scaled_index = i / (NoiseParams::kNumNoisePoints - 2.f);
+    // scaled_index is used for XYB = (0, 2·scaled_index, 2·scaled_index)
+    const float y = 2 * scaled_index;
+    // Linear intensity, clamped at 0; 1 = default intensity target.
+    const float linear = std::max(
+        0.f, Cube(y - kOpsinAbsorbanceBiasCbrt) + kOpsinAbsorbanceBias[1]);
+    const float electrons_per_pixel = electrons_per_pixel_18 * (linear / 0.18f);
+    // Quadrature sum of read noise, photon shot noise (sqrt(S) so simply not
+    // squared here) and photo response non-uniformity.
+    // https://doi.org/10.1117/3.725073
+    // Units are electrons rms.
+    const float noise =
+        std::sqrt(Square(kInputReferredReadNoise) + electrons_per_pixel +
+                  Square(kPhotoResponseNonUniformity * electrons_per_pixel));
+    const float linear_noise = noise * (0.18f / electrons_per_pixel_18);
+    const float opsin_derivative =
+        (1.f / 3) / Square(std::cbrt(linear - kOpsinAbsorbanceBias[1]));
+    const float opsin_noise = linear_noise * opsin_derivative;
+
+    // TODO(sboukortt): verify more thoroughly whether the denominator is
+    // correct.
+    params.lut[i] =
+        Clamp1(opsin_noise /
+                   (0.22f             // norm_const
+                    * std::sqrt(2.f)  // red_noise + green_noise
+                    * 1.13f  // standard deviation of a plane of generated noise
+                    ),
+               0.f, 1.f);
+  }
+
+  return params;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h
new file mode 100644
index 0000000000..f43e14d560
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_PHOTON_NOISE_H_
+#define LIB_JXL_ENC_PHOTON_NOISE_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+// Constructs a NoiseParams representing the noise that would be seen at the
+// selected nominal exposure on a last-decade (as of 2021) color camera with a
+// 36×24mm sensor (“35mm format”).
+NoiseParams SimulatePhotonNoise(size_t xsize, size_t ysize, float iso);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_PHOTON_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc b/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc
new file mode 100644
index 0000000000..be11b465ad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc
@@ -0,0 +1,51 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_photon_noise.h"
+
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::FloatNear;
+using ::testing::Pointwise;
+
+MATCHER(AreApproximatelyEqual, "") {
+  constexpr float kTolerance = 1e-6;  // Maximum absolute difference.
+  const float actual = std::get<0>(arg);
+  const float expected = std::get<1>(arg);
+  return testing::ExplainMatchResult(FloatNear(expected, kTolerance), actual,
+                                     result_listener);
+}
+
+TEST(EncPhotonNoiseTest, LUTs) {
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/100).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.00259652, 0.0139648, 0.00681551, 0.00632582, 0.00694917,
+                 0.00803922, 0.00934574, 0.0107607}));
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/800).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.02077220, 0.0420923, 0.01820690, 0.01439020, 0.01293670,
+                 0.01254030, 0.01277390, 0.0134161}));
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/6400).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.1661770, 0.1691120, 0.05309080, 0.03963960, 0.03357410,
+                 0.03001650, 0.02776740, 0.0263478}));
+
+  // Fewer (hence larger) pixels on the same sensor: lower per-pixel noise.
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/4000, /*ysize=*/3000, /*iso=*/6400).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.0830886, 0.1008720, 0.0367748, 0.0280305, 0.0240236,
+                 0.0218040, 0.0205771, 0.0200058}));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc
new file mode 100644
index 0000000000..b65319f3fd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc
@@ -0,0 +1,82 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_progressive_split.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+void ProgressiveSplitter::SplitACCoefficients(
+    const T* JXL_RESTRICT block, const AcStrategy& acs, size_t bx, size_t by,
+    T* JXL_RESTRICT output[kMaxNumPasses]) {
+  size_t size = acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
+  auto shift_right_round0 = [&](T v, int shift) {  // v >> shift, toward zero.
+    T one_if_negative = static_cast<uint32_t>(v) >> 31;
+    T add = (one_if_negative << shift) - one_if_negative;
+    return (v + add) >> shift;
+  };
+  // Single pass: the output is a verbatim copy of the input block.
+  if (mode_.num_passes == 1) {
+    memcpy(output[0], block, sizeof(T) * size);
+    return;
+  }
+  size_t ncoeffs_all_done_from_earlier_passes = 1;
+
+  int previous_pass_shift = 0;
+  for (size_t num_pass = 0; num_pass < mode_.num_passes; num_pass++) {  // pass
+    // Zero out output block.
+    memset(output[num_pass], 0, size * sizeof(T));
+    const int pass_shift = mode_.passes[num_pass].shift;
+    size_t frame_ncoeffs = mode_.passes[num_pass].num_coefficients;
+    size_t xsize = acs.covered_blocks_x();
+    size_t ysize = acs.covered_blocks_y();
+    CoefficientLayout(&ysize, &xsize);  // NB: (ysize, xsize) argument order.
+    for (size_t y = 0; y < ysize * frame_ncoeffs; y++) {    // superblk-y
+      for (size_t x = 0; x < xsize * frame_ncoeffs; x++) {  // superblk-x
+        size_t pos = y * xsize * kBlockDim + x;
+        if (x < xsize * ncoeffs_all_done_from_earlier_passes &&
+            y < ysize * ncoeffs_all_done_from_earlier_passes) {
+          // This coefficient was already included in an earlier pass,
+          // which included a genuinely smaller set of coefficients.
+          continue;
+        }
+        T v = block[pos];
+        // Previous pass discarded some bits: do not encode them again.
+        if (previous_pass_shift != 0) {
+          T previous_v = shift_right_round0(v, previous_pass_shift) *
+                         (1 << previous_pass_shift);
+          v -= previous_v;
+        }
+        output[num_pass][pos] = shift_right_round0(v, pass_shift);
+      }  // superblk-x
+    }    // superblk-y
+    // We just finished a pass.
+    // Hence, we are now guaranteed to have included all coeffs up to
+    // frame_ncoeffs in every block, unless the current pass is shifted.
+    if (mode_.passes[num_pass].shift == 0) {
+      ncoeffs_all_done_from_earlier_passes = frame_ncoeffs;
+    }
+    previous_pass_shift = mode_.passes[num_pass].shift;
+  }  // num_pass
+}
+
+template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
+    const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int32_t* JXL_RESTRICT[kMaxNumPasses]);
+
+template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
+    const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int16_t* JXL_RESTRICT[kMaxNumPasses]);
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h
new file mode 100644
index 0000000000..ef25944bb7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h
@@ -0,0 +1,131 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PROGRESSIVE_SPLIT_H_
+#define LIB_JXL_PROGRESSIVE_SPLIT_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/splines.h"
+
+// Functions to split DCT coefficients in multiple passes. All the passes of a
+// single frame are added together.
+
+namespace jxl {
+
+constexpr size_t kNoDownsamplingFactor = std::numeric_limits<size_t>::max();
+
+struct PassDefinition {
+  // Side of the square of the coefficients that should be kept in each 8x8
+  // block. Must be greater than 1, and at most 8. Should be in non-decreasing
+  // order.
+  size_t num_coefficients;
+
+  // Right shift applied to the encoded values, rounding toward zero.
+  size_t shift;
+
+  // If specified, this indicates that if the requested downsampling factor is
+  // sufficiently high, then it is fine to stop decoding after this pass.
+  // By default, passes are not marked as being suitable for any downsampling.
+  size_t suitable_for_downsampling_of_at_least;
+};
+
+struct ProgressiveMode {
+  size_t num_passes = 1;  // Number of valid entries in `passes`.
+  PassDefinition passes[kMaxNumPasses] = {
+      PassDefinition{/*num_coefficients=*/8, /*shift=*/0,
+                     /*suitable_for_downsampling_of_at_least=*/1}};
+
+  ProgressiveMode() = default;  // One pass: all 8x8 coefficients, no shift.
+
+  template <size_t nump>
+  explicit ProgressiveMode(const PassDefinition (&p)[nump]) {
+    static_assert(nump <= kMaxNumPasses, "too many passes");
+    num_passes = nump;
+    PassDefinition previous_pass{
+        /*num_coefficients=*/1, /*shift=*/0,
+        /*suitable_for_downsampling_of_at_least=*/kNoDownsamplingFactor};
+    size_t last_downsampling_factor = kNoDownsamplingFactor;
+    for (size_t i = 0; i < nump; i++) {  // Each pass must refine the last.
+      JXL_ASSERT(p[i].num_coefficients > previous_pass.num_coefficients ||
+                 (p[i].num_coefficients == previous_pass.num_coefficients &&
+                  p[i].shift < previous_pass.shift));
+      JXL_ASSERT(p[i].suitable_for_downsampling_of_at_least ==
+                     kNoDownsamplingFactor ||
+                 p[i].suitable_for_downsampling_of_at_least <=
+                     last_downsampling_factor);
+      // Only used inside assert.
+      (void)last_downsampling_factor;
+      if (p[i].suitable_for_downsampling_of_at_least != kNoDownsamplingFactor) {
+        last_downsampling_factor = p[i].suitable_for_downsampling_of_at_least;
+      }
+      previous_pass = passes[i] = p[i];
+    }
+  }
+};
+
+class ProgressiveSplitter {
+ public:
+  void SetProgressiveMode(ProgressiveMode mode) { mode_ = mode; }
+
+  size_t GetNumPasses() const { return mode_.num_passes; }
+
+  void InitPasses(Passes* JXL_RESTRICT passes) const {
+    passes->num_passes = static_cast<uint32_t>(GetNumPasses());
+    passes->num_downsample = 0;
+    JXL_ASSERT(passes->num_passes != 0);
+    passes->shift[passes->num_passes - 1] = 0;  // Last pass gets shift 0.
+    if (passes->num_passes == 1) return;  // Done, arrays are empty
+
+    for (uint32_t i = 0; i < mode_.num_passes - 1; ++i) {  // All but the last.
+      const size_t min_downsampling_factor =
+          mode_.passes[i].suitable_for_downsampling_of_at_least;
+      passes->shift[i] = mode_.passes[i].shift;
+      if (1 < min_downsampling_factor &&
+          min_downsampling_factor != kNoDownsamplingFactor) {
+        passes->downsample[passes->num_downsample] = min_downsampling_factor;
+        passes->last_pass[passes->num_downsample] = i;
+        if (mode_.passes[i + 1].suitable_for_downsampling_of_at_least <
+            min_downsampling_factor) {
+          passes->num_downsample += 1;  // Slot final: next pass has a lower factor.
+        }
+      }
+    }
+  }
+
+  template <typename T>
+  void SplitACCoefficients(const T* JXL_RESTRICT block, const AcStrategy& acs,
+                           size_t bx, size_t by,
+                           T* JXL_RESTRICT output[kMaxNumPasses]);
+
+ private:
+  ProgressiveMode mode_;  // Pass layout used by the methods above.
+};
+
+extern template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
+    const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int32_t* JXL_RESTRICT[kMaxNumPasses]);
+
+extern template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
+    const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int16_t* JXL_RESTRICT[kMaxNumPasses]);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PROGRESSIVE_SPLIT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc
new file mode 100644
index 0000000000..848310e75d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc
@@ -0,0 +1,214 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_quant_weights.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+// Writes the band count (minus one), then every band weight via F16Coder.
+Status EncodeDctParams(const DctQuantWeightParams& params, BitWriter* writer) {
+  JXL_ASSERT(params.num_distance_bands >= 1);
+  writer->Write(DctQuantWeightParams::kLog2MaxDistanceBands,
+                params.num_distance_bands - 1);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t i = 0; i < params.num_distance_bands; i++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Write(  // Band 0 is multiplied by 1/64.
+          params.distance_bands[c][i] * (i == 0 ? (1 / 64.0f) : 1.0f), writer));
+    }
+  }
+  return true;
+}
+// Writes encoding.mode, then the parameters that belong to that mode.
+Status EncodeQuant(const QuantEncoding& encoding, size_t idx, size_t size_x,
+                   size_t size_y, BitWriter* writer,
+                   ModularFrameEncoder* modular_frame_encoder) {
+  writer->Write(kLog2NumQuantModes, encoding.mode);
+  size_x *= kBlockDim;  // Scaled sizes are only read by the RAW case below.
+  size_y *= kBlockDim;
+  switch (encoding.mode) {
+    case QuantEncoding::kQuantModeLibrary: {
+      writer->Write(kCeilLog2NumPredefinedTables, encoding.predefined);
+      break;
+    }
+    case QuantEncoding::kQuantModeID: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 3; i++) {
+          JXL_RETURN_IF_ERROR(
+              F16Coder::Write(encoding.idweights[c][i] * (1.0f / 64), writer));
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT2: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 6; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Write(
+              encoding.dct2weights[c][i] * (1.0f / 64), writer));
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4X8: {
+      for (size_t c = 0; c < 3; c++) {
+        JXL_RETURN_IF_ERROR(
+            F16Coder::Write(encoding.dct4x8multipliers[c], writer));
+      }
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 2; i++) {
+          JXL_RETURN_IF_ERROR(
+              F16Coder::Write(encoding.dct4multipliers[c][i], writer));
+        }
+      }
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT: {
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      break;
+    }
+    case QuantEncoding::kQuantModeRAW: {
+      ModularFrameEncoder::EncodeQuantTable(size_x, size_y, writer, encoding,
+                                            idx, modular_frame_encoder);
+      break;
+    }
+    case QuantEncoding::kQuantModeAFV: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 9; i++) {  // Only the first 6 get the 1/64 factor.
+          JXL_RETURN_IF_ERROR(F16Coder::Write(
+              encoding.afv_weights[c][i] * (i < 6 ? 1.0f / 64 : 1.0f), writer));
+        }
+      }
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params_afv_4x4, writer));
+      break;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+// Writes a one-bit "all default" flag and, when it is 0, every encoding.
+Status DequantMatricesEncode(const DequantMatrices* matrices, BitWriter* writer,
+                             size_t layer, AuxOut* aux_out,
+                             ModularFrameEncoder* modular_frame_encoder) {
+  bool all_default = true;
+  const std::vector<QuantEncoding>& encodings = matrices->encodings();
+
+  for (size_t i = 0; i < encodings.size(); i++) {
+    if (encodings[i].mode != QuantEncoding::kQuantModeLibrary ||
+        encodings[i].predefined != 0) {
+      all_default = false;  // Default means library mode with predefined == 0.
+    }
+  }
+  // TODO(janwas): better bound
+  BitWriter::Allotment allotment(writer, 512 * 1024);
+  writer->Write(1, all_default);
+  if (!all_default) {
+    for (size_t i = 0; i < encodings.size(); i++) {
+      JXL_RETURN_IF_ERROR(EncodeQuant(  // NOTE(review): skips ReclaimAndCharge?
+          encodings[i], i, DequantMatrices::required_size_x[i],
+          DequantMatrices::required_size_y[i], writer, modular_frame_encoder));
+    }
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+// Writes an all-default bit; when it is 0, also the 3 DC values times 128.
+Status DequantMatricesEncodeDC(const DequantMatrices* matrices,
+                               BitWriter* writer, size_t layer,
+                               AuxOut* aux_out) {
+  bool all_default = true;
+  const float* dc_quant = matrices->DCQuants();
+  for (size_t c = 0; c < 3; c++) {
+    if (dc_quant[c] != kDCQuant[c]) {  // Exact compare against kDCQuant.
+      all_default = false;
+    }
+  }
+  BitWriter::Allotment allotment(writer, 1 + sizeof(float) * kBitsPerByte * 3);
+  writer->Write(1, all_default);
+  if (!all_default) {
+    for (size_t c = 0; c < 3; c++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, writer));
+    }
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+// Sets the DC quants from dc, then reloads them from their encoded form.
+void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc) {
+  matrices->SetDCQuant(dc);
+  // Roundtrip encode/decode DC to ensure same values as decoder.
+  BitWriter writer;
+  JXL_CHECK(DequantMatricesEncodeDC(matrices, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  BitReader br(writer.GetSpan());
+  // Called only in the encoder: should fail only for programmer errors.
+  JXL_CHECK(matrices->DecodeDC(&br));
+  JXL_CHECK(br.Close());
+}
+// Passes InvDCQuant(c) / scale, per c, to DequantMatricesSetCustomDC.
+void DequantMatricesScaleDC(DequantMatrices* matrices, const float scale) {
+  float dc[3];
+  for (size_t c = 0; c < 3; ++c) {
+    dc[c] = matrices->InvDCQuant(c) * (1.0f / scale);
+  }
+  DequantMatricesSetCustomDC(matrices, dc);
+}
+// Encodes the matrices into a scratch writer and decodes them back in place.
+void DequantMatricesRoundtrip(DequantMatrices* matrices) {
+  // Do not pass modular en/decoder, as they only change entropy and not
+  // values.
+  BitWriter writer;
+  JXL_CHECK(DequantMatricesEncode(matrices, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  BitReader br(writer.GetSpan());
+  // Called only in the encoder: should fail only for programmer errors.
+  JXL_CHECK(matrices->Decode(&br));
+  JXL_CHECK(br.Close());
+}
+// Installs encodings, registers RAW tables with encoder, then roundtrips.
+void DequantMatricesSetCustom(DequantMatrices* matrices,
+                              const std::vector<QuantEncoding>& encodings,
+                              ModularFrameEncoder* encoder) {
+  JXL_ASSERT(encodings.size() == DequantMatrices::kNum);
+  matrices->SetEncodings(encodings);
+  for (size_t i = 0; i < encodings.size(); i++) {
+    if (encodings[i].mode == QuantEncodingInternal::kQuantModeRAW) {
+      encoder->AddQuantTable(DequantMatrices::required_size_x[i] * kBlockDim,  // encoder is dereferenced.
+                             DequantMatrices::required_size_y[i] * kBlockDim,
+                             encodings[i], i);
+    }
+  }
+  DequantMatricesRoundtrip(matrices);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h
new file mode 100644
index 0000000000..e0a387fed5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_QUANT_WEIGHTS_H_
+#define LIB_JXL_ENC_QUANT_WEIGHTS_H_
+
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct BitWriter;
+
+Status DequantMatricesEncode(
+    const DequantMatrices* matrices, BitWriter* writer, size_t layer,
+    AuxOut* aux_out, ModularFrameEncoder* modular_frame_encoder = nullptr);
+Status DequantMatricesEncodeDC(const DequantMatrices* matrices,
+                               BitWriter* writer, size_t layer,
+                               AuxOut* aux_out);
+// For consistency with QuantEncoding, higher values correspond to more
+// precision.
+void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc);
+
+void DequantMatricesScaleDC(DequantMatrices* matrices, float scale);
+
+void DequantMatricesSetCustom(DequantMatrices* matrices,
+                              const std::vector<QuantEncoding>& encodings,
+                              ModularFrameEncoder* encoder);
+
+// Roundtrip encode/decode the matrices to ensure same values as decoder.
+void DequantMatricesRoundtrip(DequantMatrices* matrices);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_QUANT_WEIGHTS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_splines.cc b/third_party/jpeg-xl/lib/jxl/enc_splines.cc
new file mode 100644
index 0000000000..ddcd78a748
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_splines.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <algorithm>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+struct AuxOut;
+// Helper that appends the tokens describing one QuantizedSpline.
+class QuantizedSplineEncoder {
+ public:
+  // Only call if HasAny().
+  static void Tokenize(const QuantizedSpline& spline,
+                       std::vector<Token>* const tokens) {
+    tokens->emplace_back(kNumControlPointsContext,
+                         spline.control_points_.size());
+    for (const auto& point : spline.control_points_) {
+      tokens->emplace_back(kControlPointsContext, PackSigned(point.first));
+      tokens->emplace_back(kControlPointsContext, PackSigned(point.second));
+    }
+    const auto encode_dct = [tokens](const int dct[32]) {  // 32 signed values.
+      for (int i = 0; i < 32; ++i) {
+        tokens->emplace_back(kDCTContext, PackSigned(dct[i]));
+      }
+    };
+    for (int c = 0; c < 3; ++c) {  // Three color_dct_ rows, then sigma_dct_.
+      encode_dct(spline.color_dct_[c]);
+    }
+    encode_dct(spline.sigma_dct_);
+  }
+};
+
+namespace {
+// Emits the first point's rounded x, y as-is; later points as signed deltas.
+void EncodeAllStartingPoints(const std::vector<Spline::Point>& points,
+                             std::vector<Token>* tokens) {
+  int64_t last_x = 0;
+  int64_t last_y = 0;
+  for (size_t i = 0; i < points.size(); i++) {
+    const int64_t x = lroundf(points[i].x);
+    const int64_t y = lroundf(points[i].y);
+    if (i == 0) {  // No PackSigned here, unlike the delta branch.
+      tokens->emplace_back(kStartingPositionContext, x);
+      tokens->emplace_back(kStartingPositionContext, y);
+    } else {
+      tokens->emplace_back(kStartingPositionContext, PackSigned(x - last_x));
+      tokens->emplace_back(kStartingPositionContext, PackSigned(y - last_y));
+    }
+    last_x = x;
+    last_y = y;
+  }
+}
+
+}  // namespace
+// Builds one token stream for all splines, then writes histograms and tokens.
+void EncodeSplines(const Splines& splines, BitWriter* writer,
+                   const size_t layer, const HistogramParams& histogram_params,
+                   AuxOut* aux_out) {
+  JXL_ASSERT(splines.HasAny());
+  // Token order: spline count - 1, start points, quant adjustment, splines.
+  const std::vector<QuantizedSpline>& quantized_splines =
+      splines.QuantizedSplines();
+  std::vector<std::vector<Token>> tokens(1);
+  tokens[0].emplace_back(kNumSplinesContext, quantized_splines.size() - 1);
+  EncodeAllStartingPoints(splines.StartingPoints(), &tokens[0]);
+
+  tokens[0].emplace_back(kQuantizationAdjustmentContext,
+                         PackSigned(splines.GetQuantizationAdjustment()));
+
+  for (const QuantizedSpline& spline : quantized_splines) {
+    QuantizedSplineEncoder::Tokenize(spline, &tokens[0]);
+  }
+  // The histogram call receives the tokens and runs before WriteTokens.
+  EntropyEncodingData codes;
+  std::vector<uint8_t> context_map;
+  BuildAndEncodeHistograms(histogram_params, kNumSplineContexts, tokens, &codes,
+                           &context_map, writer, layer, aux_out);
+  WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out);
+}
+// Placeholder: ignores opsin and returns a value-initialized Splines.
+Splines FindSplines(const Image3F& opsin) {
+  // TODO: implement spline detection.
+  return {};
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_splines.h b/third_party/jpeg-xl/lib/jxl/enc_splines.h
new file mode 100644
index 0000000000..be700dba75
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_splines.h
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_SPLINES_H_
+#define LIB_JXL_ENC_SPLINES_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Only call if splines.HasAny().
+void EncodeSplines(const Splines& splines, BitWriter* writer, size_t layer,
+                   const HistogramParams& histogram_params, AuxOut* aux_out);
+
+Splines FindSplines(const Image3F& opsin);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_toc.cc b/third_party/jpeg-xl/lib/jxl/enc_toc.cc
new file mode 100644
index 0000000000..dc75fdd9ba
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_toc.cc
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_toc.h"
+
+#include <stdint.h>
+
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+Status WriteGroupOffsets(const std::vector<BitWriter>& group_codes,  // Writes the TOC.
+                         const std::vector<coeff_order_t>* permutation,
+                         BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) {
+  BitWriter::Allotment allotment(writer, MaxBits(group_codes.size()));
+  if (permutation && !group_codes.empty()) {
+    // Don't write a permutation at all for an empty group_codes.
+    writer->Write(1, 1);  // permutation
+    JXL_DASSERT(permutation->size() == group_codes.size());
+    EncodePermutation(permutation->data(), /*skip=*/0, permutation->size(),
+                      writer, /* layer= */ 0, aux_out);
+
+  } else {
+    writer->Write(1, 0);  // no permutation
+  }
+  writer->ZeroPadToByte();  // before TOC entries
+  // One entry per group: its size in bytes (each group must be byte-aligned).
+  for (size_t i = 0; i < group_codes.size(); i++) {
+    JXL_ASSERT(group_codes[i].BitsWritten() % kBitsPerByte == 0);
+    const size_t group_size = group_codes[i].BitsWritten() / kBitsPerByte;
+    JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer));
+  }
+  writer->ZeroPadToByte();  // before first group
+  allotment.ReclaimAndCharge(writer, kLayerTOC, aux_out);
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_toc.h b/third_party/jpeg-xl/lib/jxl/enc_toc.h
new file mode 100644
index 0000000000..242b3efccb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_toc.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_TOC_H_
+#define LIB_JXL_ENC_TOC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Writes the group offsets. If the permutation vector is nullptr, the identity
+// permutation will be used.
+Status WriteGroupOffsets(const std::vector<BitWriter>& group_codes,
+                         const std::vector<coeff_order_t>* permutation,
+                         BitWriter* JXL_RESTRICT writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_TOC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
new file mode 100644
index 0000000000..ef6dc2bbd7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
@@ -0,0 +1,827 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_ENC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_ENC_TRANSFORMS_INL_H_
+#undef LIB_JXL_ENC_TRANSFORMS_INL_H_
+#else
+#define LIB_JXL_ENC_TRANSFORMS_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/dct_scales.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// Inverse of ReinterpretingDCT: rescales the LF inputs, then runs an IDCT.
+template <size_t DCT_ROWS, size_t DCT_COLS, size_t LF_ROWS, size_t LF_COLS,
+          size_t ROWS, size_t COLS>
+HWY_INLINE void ReinterpretingIDCT(const float* input,
+                                   const size_t input_stride, float* output,
+                                   const size_t output_stride) {
+  HWY_ALIGN float block[ROWS * COLS] = {};  // Entries not set below stay zero.
+  if (ROWS < COLS) {
+    for (size_t y = 0; y < LF_ROWS; y++) {
+      for (size_t x = 0; x < LF_COLS; x++) {
+        block[y * COLS + x] = input[y * input_stride + x] *
+                              DCTTotalResampleScale<DCT_ROWS, ROWS>(y) *
+                              DCTTotalResampleScale<DCT_COLS, COLS>(x);
+      }
+    }
+  } else {  // Same copy with the ROWS/COLS roles swapped in the indexing.
+    for (size_t y = 0; y < LF_COLS; y++) {
+      for (size_t x = 0; x < LF_ROWS; x++) {
+        block[y * ROWS + x] = input[y * input_stride + x] *
+                              DCTTotalResampleScale<DCT_COLS, COLS>(y) *
+                              DCTTotalResampleScale<DCT_ROWS, ROWS>(x);
+      }
+    }
+  }
+
+  // ROWS, COLS <= 8, so we can put scratch space on the stack.
+  HWY_ALIGN float scratch_space[ROWS * COLS];
+  ComputeScaledIDCT<ROWS, COLS>()(block, DCTTo(output, output_stride),
+                                  scratch_space);
+}
+// 2x2 sum/difference transform over the top-left SxS corner; result in out.
+template <size_t S>
+void DCT2TopBlock(const float* block, size_t stride, float* out) {
+  static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim");
+  static_assert(S % 2 == 0, "S should be even");
+  float temp[kDCTBlockSize];  // Staging; presumably lets out alias block.
+  constexpr size_t num_2x2 = S / 2;
+  for (size_t y = 0; y < num_2x2; y++) {
+    for (size_t x = 0; x < num_2x2; x++) {
+      float c00 = block[y * 2 * stride + x * 2];
+      float c01 = block[y * 2 * stride + x * 2 + 1];
+      float c10 = block[(y * 2 + 1) * stride + x * 2];
+      float c11 = block[(y * 2 + 1) * stride + x * 2 + 1];
+      float r00 = c00 + c01 + c10 + c11;
+      float r01 = c00 + c01 - c10 - c11;
+      float r10 = c00 - c01 + c10 - c11;
+      float r11 = c00 - c01 - c10 + c11;
+      r00 *= 0.25f;  // r00 becomes the mean of the four inputs.
+      r01 *= 0.25f;
+      r10 *= 0.25f;
+      r11 *= 0.25f;
+      temp[y * kBlockDim + x] = r00;  // Each r term goes to its own quadrant.
+      temp[y * kBlockDim + num_2x2 + x] = r01;
+      temp[(y + num_2x2) * kBlockDim + x] = r10;
+      temp[(y + num_2x2) * kBlockDim + num_2x2 + x] = r11;
+    }
+  }
+  for (size_t y = 0; y < S; y++) {  // Copy the SxS result, row stride kBlockDim.
+    for (size_t x = 0; x < S; x++) {
+      out[y * kBlockDim + x] = temp[y * kBlockDim + x];
+    }
+  }
+}
+// Sets coeffs[i] = sum_j pixels[j] * k4x4AFVBasisTranspose[j][i] for i < 16.
+void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) {
+  HWY_ALIGN static constexpr float k4x4AFVBasisTranspose[16][16] = {
+      {
+          0.2500000000000000,
+          0.8769029297991420f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.4105377591765233f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+      },
+      {
+          0.2500000000000000,
+          0.2206518106944235f,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.7071067811865474f,
+          0.6235485373547691f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          0.4067007583026075f,
+          -0.2125574805828875f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          -0.4517556589999482f,
+          -0.3046847507248690f,
+          0.3017929516615495f,
+          0.4082482904638627f,
+          0.1747866975480809f,
+          -0.2110560104933578f,
+          -0.1426608480880726f,
+          -0.1381354035075859f,
+          -0.1743760259965107f,
+          0.1135498731499434f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375375f,
+          0.4444481661973445f,
+          0.3085497062849767f,
+          0.0000000000000000f,
+          -0.0643507165794627f,
+          0.1585450355184006f,
+          0.5112616136591823f,
+          0.2579236279634118f,
+          0.0000000000000000,
+          0.0812611176717539f,
+          0.1856718091610980f,
+          -0.3416446842253372f,
+          0.3302282550303788f,
+          0.0702790691196284f,
+          -0.0741750459581035f,
+      },
+      {
+          0.2500000000000000,
+          0.2206518106944236f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.7071067811865476f,
+          0.6235485373547694f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375378f,
+          0.0000000000000000,
+          0.4706702258572536f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          -0.0403851516082220f,
+          0.0000000000000000,
+          0.1627234014286620f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.7367497537172237f,
+          0.0875511500058708f,
+          -0.2921026642334881f,
+          0.1940289303259434f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          0.1957439937204294f,
+          -0.1621205195722993f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.0074182263792424f,
+          -0.2904801297289980f,
+          0.0952002265347504f,
+          0.0000000000000000,
+          -0.3675398009862027f,
+          0.4921585901373873f,
+          0.2462710772207515f,
+          -0.0794670660590957f,
+          0.3623817333531167f,
+          -0.4351904965232280f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          0.2929100136981264f,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.3935103426921017f,
+          -0.0657870154914280f,
+          0.0000000000000000,
+          -0.4082482904638628f,
+          -0.3078822139579090f,
+          -0.3852501370925192f,
+          -0.0857401903551931f,
+          -0.4613374887461511f,
+          0.0000000000000000,
+          0.2191868483885747f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          -0.4067007583026072f,
+          -0.2125574805828705f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          -0.4517556589999464f,
+          0.3046847507248840f,
+          0.3017929516615503f,
+          -0.4082482904638635f,
+          -0.1747866975480813f,
+          0.2110560104933581f,
+          -0.1426608480880734f,
+          -0.1381354035075829f,
+          -0.1743760259965108f,
+          0.1135498731499426f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          -0.1957439937204287f,
+          -0.1621205195722833f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.0074182263792444f,
+          0.2904801297290076f,
+          0.0952002265347505f,
+          0.0000000000000000,
+          0.3675398009862011f,
+          -0.4921585901373891f,
+          0.2462710772207514f,
+          -0.0794670660591026f,
+          0.3623817333531165f,
+          -0.4351904965232251f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375375f,
+          0.0000000000000000,
+          -0.4706702258572528f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.1107416575309343f,
+          0.0000000000000000,
+          -0.1627234014286617f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.1488339922711357f,
+          0.4972464710953509f,
+          0.2921026642334879f,
+          0.5550443808910661f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          0.1137907446044809f,
+          -0.1464291867126764f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.0829816309488205f,
+          -0.2388977352334460f,
+          -0.3531238544981630f,
+          -0.4082482904638630f,
+          0.4826689115059883f,
+          0.1741941265991622f,
+          -0.0476868035022925f,
+          0.1253805944856366f,
+          -0.4326608024727445f,
+          -0.2546827712406646f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          -0.4444481661973438f,
+          0.3085497062849487f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.1585450355183970f,
+          -0.5112616136592012f,
+          0.2579236279634129f,
+          0.0000000000000000,
+          -0.0812611176717504f,
+          -0.1856718091610990f,
+          -0.3416446842253373f,
+          0.3302282550303805f,
+          0.0702790691196282f,
+          -0.0741750459581023f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          -0.2929100136981264f,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.3935103426921022f,
+          0.0657870154914254f,
+          0.0000000000000000,
+          0.4082482904638634f,
+          0.3078822139579031f,
+          0.3852501370925211f,
+          -0.0857401903551927f,
+          -0.4613374887461554f,
+          0.0000000000000000,
+          0.2191868483885728f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          -0.1137907446044814f,
+          -0.1464291867126654f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.0829816309488214f,
+          0.2388977352334547f,
+          -0.3531238544981624f,
+          0.4082482904638630f,
+          -0.4826689115059858f,
+          -0.1741941265991621f,
+          -0.0476868035022928f,
+          0.1253805944856431f,
+          -0.4326608024727457f,
+          -0.2546827712406641f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375374f,
+          0.0000000000000000,
+          0.4251149611657548f,
+          0.0000000000000000,
+          -0.0643507165794626f,
+          -0.4517556589999480f,
+          0.0000000000000000,
+          -0.6035859033230976f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.1426608480880724f,
+          -0.1381354035075845f,
+          0.3487520519930227f,
+          0.1135498731499429f,
+      },
+  };
+  // Vectorized over i: each outer step produces Lanes(d) output coefficients.
+  const HWY_CAPPED(float, 16) d;
+  for (size_t i = 0; i < 16; i += Lanes(d)) {
+    auto scalar = Zero(d);
+    for (size_t j = 0; j < 16; j++) {
+      auto px = Set(d, pixels[j]);
+      auto basis = Load(d, k4x4AFVBasisTranspose[j] + i);
+      scalar = MulAdd(px, basis, scalar);
+    }
+    Store(scalar, d, coeffs + i);
+  }
+}
+
+// Transforms an 8x8 pixel block with AFV. Coefficient layout, as (x, y):
+//  - (even, even) positions hold AFV coefficients
+//  - (odd, even) positions hold DCT4x4 coefficients
+//  - (any, odd) positions hold DCT4x8 coefficients
+template <size_t afv_kind>
+void AFVTransformFromPixels(const float* JXL_RESTRICT pixels,
+                            size_t pixels_stride,
+                            float* JXL_RESTRICT coefficients) {
+  HWY_ALIGN float scratch_space[4 * 8 * 2];
+  size_t afv_x = afv_kind & 1;  // Horizontal half holding the AFV 4x4 corner.
+  size_t afv_y = afv_kind / 2;  // Vertical half holding the AFV 4x4 corner.
+  HWY_ALIGN float block[4 * 8];
+  for (size_t iy = 0; iy < 4; iy++) {  // Copy the corner, mirrored where == 1.
+    for (size_t ix = 0; ix < 4; ix++) {
+      block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)] =
+          pixels[(iy + 4 * afv_y) * pixels_stride + ix + 4 * afv_x];
+    }
+  }
+  // AFV coefficients in (even, even) positions.
+  HWY_ALIGN float coeff[4 * 4];
+  AFVDCT4x4(block, coeff);
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {
+      coefficients[iy * 2 * 8 + ix * 2] = coeff[iy * 4 + ix];
+    }
+  }
+  // 4x4 DCT of the block with same y and different x.
+  ComputeScaledDCT<4, 4>()(
+      DCTFrom(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4),
+              pixels_stride),
+      block, scratch_space);
+  // ... in (odd, even) positions.
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {  // 4 columns; more would hit odd rows.
+      coefficients[iy * 2 * 8 + ix * 2 + 1] = block[iy * 4 + ix];
+    }
+  }
+  // 4x8 DCT of the other half of the block.
+  ComputeScaledDCT<4, 8>()(
+      DCTFrom(pixels + (afv_y == 1 ? 0 : 4) * pixels_stride, pixels_stride),
+      block, scratch_space);
+  for (size_t iy = 0; iy < 4; iy++) {  // Fills every odd row in full.
+    for (size_t ix = 0; ix < 8; ix++) {
+      coefficients[(1 + iy * 2) * 8 + ix] = block[iy * 8 + ix];
+    }
+  }
+  float block00 = coefficients[0] * 0.25f;  // First AFV coefficient, scaled.
+  float block01 = coefficients[1];          // First DCT4x4 coefficient.
+  float block10 = coefficients[8];          // First DCT4x8 coefficient.
+  coefficients[0] = (block00 + block01 + 2 * block10) * 0.25f;
+  coefficients[1] = (block00 - block01) * 0.5f;
+  coefficients[8] = (block00 + block01 - 2 * block10) * 0.25f;
+}
+
+HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
+                                          const float* JXL_RESTRICT pixels,
+                                          size_t pixels_stride,  // floats between the starts of consecutive pixel rows
+                                          float* JXL_RESTRICT coefficients,  // output; the 8x8 special cases index it with a row pitch of 8
+                                          float* JXL_RESTRICT scratch_space) {  // only forwarded to ComputeScaledDCT
+  using Type = AcStrategy::Type;
+  switch (strategy) {  // forward transform of `pixels` into `coefficients`, one case per strategy
+    case Type::IDENTITY: {  // 2x2 grid of 4x4 sub-blocks, handled without a DCT
+      PROFILER_ZONE("DCT Identity");
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          float block_dc = 0;  // becomes the mean of the 16 pixels of this sub-block
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              block_dc += pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix];
+            }
+          }
+          block_dc *= 1.0f / 16;
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 1 && iy == 1) continue;  // pixel (1,1) is the reference; its slot is filled below
+              coefficients[(y + iy * 2) * 8 + x + ix * 2] =  // sub-blocks are interleaved: sub-block (x, y) owns every second row/column
+                  pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] -
+                  pixels[(y * 4 + 1) * pixels_stride + x * 4 + 1];  // difference against the reference pixel
+            }
+          }
+          coefficients[(y + 2) * 8 + x + 2] = coefficients[y * 8 + x];  // move the (0,0) difference into the free (1,1) slot...
+          coefficients[y * 8 + x] = block_dc;  // ...so that the (0,0) slot can hold the sub-block mean
+        }
+      }
+      float block00 = coefficients[0];  // the four sub-block means now sit at indices 0, 1, 8 and 9
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f;  // replace the means by their signed sums, scaled by 1/4
+      coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f;
+      coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f;
+      coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f;
+      break;
+    }
+    case Type::DCT8X4: {  // two halves taken at column offsets 0 and 4
+      PROFILER_ZONE("DCT 8x4");
+      for (size_t x = 0; x < 2; x++) {
+        HWY_ALIGN float block[4 * 8];
+        ComputeScaledDCT<8, 4>()(DCTFrom(pixels + x * 4, pixels_stride), block,
+                                 scratch_space);
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            // Store transposed.
+            coefficients[(x + iy * 2) * 8 + ix] = block[iy * 8 + ix];  // rows of the two halves alternate in the output
+          }
+        }
+      }
+      float block0 = coefficients[0];  // first coefficient of each half
+      float block1 = coefficients[8];
+      coefficients[0] = (block0 + block1) * 0.5f;  // average of the two
+      coefficients[8] = (block0 - block1) * 0.5f;  // half of their difference
+      break;
+    }
+    case Type::DCT4X8: {  // two halves taken at row offsets 0 and 4
+      PROFILER_ZONE("DCT 4x8");
+      for (size_t y = 0; y < 2; y++) {
+        HWY_ALIGN float block[4 * 8];
+        ComputeScaledDCT<4, 8>()(
+            DCTFrom(pixels + y * 4 * pixels_stride, pixels_stride), block,
+            scratch_space);
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            coefficients[(y + iy * 2) * 8 + ix] = block[iy * 8 + ix];  // rows of the two halves alternate in the output
+          }
+        }
+      }
+      float block0 = coefficients[0];  // first coefficient of each half
+      float block1 = coefficients[8];
+      coefficients[0] = (block0 + block1) * 0.5f;  // average of the two
+      coefficients[8] = (block0 - block1) * 0.5f;  // half of their difference
+      break;
+    }
+    case Type::DCT4X4: {  // four 4x4 DCTs, one per quadrant
+      PROFILER_ZONE("DCT 4");
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          HWY_ALIGN float block[4 * 4];
+          ComputeScaledDCT<4, 4>()(
+              DCTFrom(pixels + y * 4 * pixels_stride + x * 4, pixels_stride),
+              block, scratch_space);
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              coefficients[(y + iy * 2) * 8 + x + ix * 2] = block[iy * 4 + ix];  // quadrants are interleaved in both directions
+            }
+          }
+        }
+      }
+      float block00 = coefficients[0];  // first coefficient of each quadrant (indices 0, 1, 8, 9)
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f;  // same signed-sum mixing as in the IDENTITY case
+      coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f;
+      coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f;
+      coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f;
+      break;
+    }
+    case Type::DCT2X2: {
+      PROFILER_ZONE("DCT 2");
+      DCT2TopBlock<8>(pixels, pixels_stride, coefficients);  // first pass reads the pixels
+      DCT2TopBlock<4>(coefficients, kBlockDim, coefficients);  // later passes re-process `coefficients` in place
+      DCT2TopBlock<2>(coefficients, kBlockDim, coefficients);
+      break;
+    }
+    case Type::DCT16X16: {  // this and the other plain DCT cases are a single ComputeScaledDCT call with matching template sizes
+      PROFILER_ZONE("DCT 16");
+      ComputeScaledDCT<16, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT16X8: {
+      PROFILER_ZONE("DCT 16x8");
+      ComputeScaledDCT<16, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT8X16: {
+      PROFILER_ZONE("DCT 8x16");
+      ComputeScaledDCT<8, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT32X8: {
+      PROFILER_ZONE("DCT 32x8");
+      ComputeScaledDCT<32, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT8X32: {
+      PROFILER_ZONE("DCT 8x32");
+      ComputeScaledDCT<8, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT32X16: {
+      PROFILER_ZONE("DCT 32x16");
+      ComputeScaledDCT<32, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT16X32: {
+      PROFILER_ZONE("DCT 16x32");
+      ComputeScaledDCT<16, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X32: {
+      PROFILER_ZONE("DCT 32");
+      ComputeScaledDCT<32, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT: {
+      PROFILER_ZONE("DCT 8");
+      ComputeScaledDCT<8, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
+                               scratch_space);
+      break;
+    }
+    case Type::AFV0: {  // AFV0..AFV3 differ only in the template index handed to the helper
+      PROFILER_ZONE("AFV0");
+      AFVTransformFromPixels<0>(pixels, pixels_stride, coefficients);  // the AFV helper is not given scratch_space
+      break;
+    }
+    case Type::AFV1: {
+      PROFILER_ZONE("AFV1");
+      AFVTransformFromPixels<1>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::AFV2: {
+      PROFILER_ZONE("AFV2");
+      AFVTransformFromPixels<2>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::AFV3: {
+      PROFILER_ZONE("AFV3");
+      AFVTransformFromPixels<3>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::DCT64X64: {
+      PROFILER_ZONE("DCT 64x64");
+      ComputeScaledDCT<64, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT64X32: {
+      PROFILER_ZONE("DCT 64x32");
+      ComputeScaledDCT<64, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X64: {
+      PROFILER_ZONE("DCT 32x64");
+      ComputeScaledDCT<32, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT128X128: {
+      PROFILER_ZONE("DCT 128x128");
+      ComputeScaledDCT<128, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT128X64: {
+      PROFILER_ZONE("DCT 128x64");
+      ComputeScaledDCT<128, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT64X128: {
+      PROFILER_ZONE("DCT 64x128");
+      ComputeScaledDCT<64, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT256X256: {
+      PROFILER_ZONE("DCT 256x256");
+      ComputeScaledDCT<256, 256>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT256X128: {
+      PROFILER_ZONE("DCT 256x128");
+      ComputeScaledDCT<256, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT128X256: {
+      PROFILER_ZONE("DCT 128x256");
+      ComputeScaledDCT<128, 256>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::kNumValidStrategies:  // not a usable strategy value
+      JXL_ABORT("Invalid strategy");
+  }
+}
+
+HWY_MAYBE_UNUSED void DCFromLowestFrequencies(const AcStrategy::Type strategy,
+                                              const float* block, float* dc,  // reads coefficients from `block`, writes DC value(s) to `dc`
+                                              size_t dc_stride) {  // forwarded to ReinterpretingIDCT together with `dc`; presumably its row pitch
+  using Type = AcStrategy::Type;
+  switch (strategy) {  // cases with LF_ROWS * LF_COLS > 1 go through ReinterpretingIDCT; the rest copy block[0]
+    case Type::DCT16X8: {
+      ReinterpretingIDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                         /*LF_ROWS=*/2, /*LF_COLS=*/1, /*ROWS=*/2, /*COLS=*/1>(
+          block, 2 * kBlockDim, dc, dc_stride);  // second argument is the larger DCT dimension for both 16x8 and 8x16
+      break;
+    }
+    case Type::DCT8X16: {
+      ReinterpretingIDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                         /*LF_ROWS=*/1, /*LF_COLS=*/2, /*ROWS=*/1, /*COLS=*/2>(
+          block, 2 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT16X16: {
+      ReinterpretingIDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                         /*LF_ROWS=*/2, /*LF_COLS=*/2, /*ROWS=*/2, /*COLS=*/2>(
+          block, 2 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X8: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/1, /*ROWS=*/4, /*COLS=*/1>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT8X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/1, /*LF_COLS=*/4, /*ROWS=*/1, /*COLS=*/4>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X16: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/2, /*ROWS=*/4, /*COLS=*/2>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT16X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/2, /*LF_COLS=*/4, /*ROWS=*/2, /*COLS=*/4>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/4, /*ROWS=*/4, /*COLS=*/4>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT64X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/8, /*LF_COLS=*/4, /*ROWS=*/8, /*COLS=*/4>(
+          block, 8 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X64: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/8, /*ROWS=*/4, /*COLS=*/8>(
+          block, 8 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT64X64: {
+      ReinterpretingIDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                         /*LF_ROWS=*/8, /*LF_COLS=*/8, /*ROWS=*/8, /*COLS=*/8>(
+          block, 8 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT128X64: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>(
+          block, 16 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT64X128: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>(
+          block, 16 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT128X128: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>(
+          block, 16 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT256X128: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>(
+          block, 32 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT128X256: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>(
+          block, 32 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT256X256: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>(
+          block, 32 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT:  // every strategy in this group produces exactly one DC value
+    case Type::DCT2X2:
+    case Type::DCT4X4:
+    case Type::DCT4X8:
+    case Type::DCT8X4:
+    case Type::AFV0:
+    case Type::AFV1:
+    case Type::AFV2:
+    case Type::AFV3:
+    case Type::IDENTITY:
+      dc[0] = block[0];  // the first coefficient is used as the DC value unchanged; dc_stride is not needed
+      break;
+    case Type::kNumValidStrategies:  // not a usable strategy value
+      JXL_ABORT("Invalid strategy");
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_ENC_TRANSFORMS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms.cc b/third_party/jpeg-xl/lib/jxl/enc_transforms.cc
new file mode 100644
index 0000000000..8978ba1dcb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms.cc
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_transforms.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_transforms.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_transforms-inl.h"
+
+namespace jxl {
+
+#if HWY_ONCE  // the public wrappers below must be compiled only once, not once per SIMD target
+HWY_EXPORT(TransformFromPixels);  // makes the per-target implementations available to HWY_DYNAMIC_DISPATCH
+void TransformFromPixels(const AcStrategy::Type strategy,  // public entry point: forwards all arguments to the dispatched implementation
+                         const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                         float* JXL_RESTRICT coefficients,
+                         float* JXL_RESTRICT scratch_space) {  // JXL_RESTRICT matches the declaration in enc_transforms.h
+  return HWY_DYNAMIC_DISPATCH(TransformFromPixels)(
+      strategy, pixels, pixels_stride, coefficients, scratch_space);
+}
+
+HWY_EXPORT(DCFromLowestFrequencies);
+void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block,  // public entry point: forwards to the dispatched implementation
+                             float* dc, size_t dc_stride) {
+  return HWY_DYNAMIC_DISPATCH(DCFromLowestFrequencies)(strategy, block, dc,
+                                                       dc_stride);
+}
+
+HWY_EXPORT(AFVDCT4x4);
+void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) {  // public entry point: forwards to the dispatched implementation
+  return HWY_DYNAMIC_DISPATCH(AFVDCT4x4)(pixels, coeffs);
+}
+#endif  // HWY_ONCE
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms.h b/third_party/jpeg-xl/lib/jxl/enc_transforms.h
new file mode 100644
index 0000000000..039ccc3893
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_TRANSFORMS_H_
+#define LIB_JXL_ENC_TRANSFORMS_H_
+
+// Facade for (non-inlined) integral transforms.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+void TransformFromPixels(const AcStrategy::Type strategy,  // forward transform of `pixels` into `coefficients` for the given strategy
+                         const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                         float* JXL_RESTRICT coefficients,
+                         float* JXL_RESTRICT scratch_space);
+
+// Equivalent of the above for DC image: derives the DC value(s) for `strategy` from `block` and writes them to `dc`.
+void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block,
+                             float* dc, size_t dc_stride);
+
+void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs);  // runtime-dispatched in enc_transforms.cc
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_TRANSFORMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_xyb.cc b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc
new file mode 100644
index 0000000000..2ee0abf821
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc
@@ -0,0 +1,520 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_xyb.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstdlib>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_image_bundle.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// Affine map on SIMD vectors: mixedK = (row K of the 3x3 matrix in premul_absorb) * (r, g, b) + kOpsinAbsorbanceBias[K].
+template <class V>
+JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b,
+                                const float* JXL_RESTRICT premul_absorb,  // pre-broadcast constants, N floats per entry
+                                V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1,
+                                V* JXL_RESTRICT mixed2) {
+  const float* bias = &kOpsinAbsorbanceBias[0];
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const auto m0 = Load(d, premul_absorb + 0 * N);  // m0..m8: matrix entries, three per output row
+  const auto m1 = Load(d, premul_absorb + 1 * N);
+  const auto m2 = Load(d, premul_absorb + 2 * N);
+  const auto m3 = Load(d, premul_absorb + 3 * N);
+  const auto m4 = Load(d, premul_absorb + 4 * N);
+  const auto m5 = Load(d, premul_absorb + 5 * N);
+  const auto m6 = Load(d, premul_absorb + 6 * N);
+  const auto m7 = Load(d, premul_absorb + 7 * N);
+  const auto m8 = Load(d, premul_absorb + 8 * N);
+  *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0]))));  // m0*r + m1*g + m2*b + bias[0]
+  *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1]))));
+  *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2]))));
+}
+
+template <class V>
+void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx,  // stores one vector each to valx, valy and valz
+              float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) {
+  const HWY_FULL(float) d;
+  const V half = Set(d, 0.5f);
+  Store(Mul(half, Sub(r, g)), d, valx);  // valx = (r - g) / 2
+  Store(Mul(half, Add(r, g)), d, valy);  // valy = (r + g) / 2
+  Store(b, d, valz);  // valz = b, unchanged
+}
+
+// Converts one RGB vector to XYB and stores the result to valx/valy/valz.
+template <class V>
+void LinearRGBToXYB(const V r, const V g, const V b,
+                    const float* JXL_RESTRICT premul_absorb,  // 12 pre-broadcast entries, see ComputePremulAbsorb
+                    float* JXL_RESTRICT valx, float* JXL_RESTRICT valy,
+                    float* JXL_RESTRICT valz) {
+  V mixed0, mixed1, mixed2;
+  OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2);
+
+  // mixed* should be non-negative even for wide-gamut, so clamp to zero.
+  mixed0 = ZeroIfNegative(mixed0);
+  mixed1 = ZeroIfNegative(mixed1);
+  mixed2 = ZeroIfNegative(mixed2);
+
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N));  // entries 9..11 are the addends (filled with -cbrt(bias) by ComputePremulAbsorb)
+  mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N));
+  mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N));
+  StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz);
+
+  // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative.
+}
+
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,  // converts one row in place: the rows are both input and output
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize) {
+  const HWY_FULL(float) d;
+  for (size_t x = 0; x < xsize; x += Lanes(d)) {  // whole vectors only: the last step may touch elements at or beyond xsize
+    const auto r = Load(d, row0 + x);
+    const auto g = Load(d, row1 + x);
+    const auto b = Load(d, row2 + x);
+    LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x);  // results overwrite the values just loaded
+  }
+}
+
+// Input/output uses the codec.h scaling: nominally 0-1 if in-gamut.
+template <class V>
+V LinearFromSRGB(V encoded) {
+  return TF_SRGB().DisplayFromEncoded(encoded);  // thin wrapper around the sRGB transfer-function helper
+}
+
+Status LinearSRGBToXYB(const Image3F& linear,  // converts the three planes of `linear` to XYB, writing into *xyb
+                       const float* JXL_RESTRICT premul_absorb,
+                       ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
+  const size_t xsize = linear.xsize();
+
+  const HWY_FULL(float) d;
+  return RunOnPool(  // one task per image row; the Status of RunOnPool is returned to the caller
+      pool, 0, static_cast<uint32_t>(linear.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y);
+        const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y);
+        const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y);
+        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
+        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
+        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {  // whole vectors only: the last step may touch elements at or beyond xsize
+          const auto in_r = Load(d, row_in0 + x);
+          const auto in_g = Load(d, row_in1 + x);
+          const auto in_b = Load(d, row_in2 + x);
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
+                         row_xyb1 + x, row_xyb2 + x);
+        }
+      },
+      "LinearToXYB");
+}
+
+Status SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb,  // like LinearSRGBToXYB, but applies LinearFromSRGB to each input first
+                 ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
+  const size_t xsize = srgb.xsize();
+
+  const HWY_FULL(float) d;
+  return RunOnPool(  // one task per image row; the Status of RunOnPool is returned to the caller
+      pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
+        const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
+        const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
+        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
+        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
+        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {  // whole vectors only: the last step may touch elements at or beyond xsize
+          const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
+          const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
+          const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
+                         row_xyb1 + x, row_xyb2 + x);
+        }
+      },
+      "SRGBToXYB");
+}
+
+Status SRGBToXYBAndLinear(const Image3F& srgb,  // like SRGBToXYB, but also stores the linearized values into *linear
+                          const float* JXL_RESTRICT premul_absorb,
+                          ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
+                          Image3F* JXL_RESTRICT linear) {
+  const size_t xsize = srgb.xsize();
+
+  const HWY_FULL(float) d;
+  return RunOnPool(  // one task per image row; the Status of RunOnPool is returned to the caller
+      pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
+        const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
+        const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
+
+        float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y);
+        float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y);
+        float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y);
+
+        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
+        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
+        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {  // whole vectors only: the last step may touch elements at or beyond xsize
+          const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
+          const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
+          const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
+
+          Store(in_r, d, row_linear0 + x);  // keep the linear values for the caller...
+          Store(in_g, d, row_linear1 + x);
+          Store(in_b, d, row_linear2 + x);
+
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,  // ...and convert the same vectors to XYB
+                         row_xyb1 + x, row_xyb2 + x);
+        }
+      },
+      "SRGBToXYBAndLinear");
+}
+
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {  // fills 12 * Lanes(d) floats: 12 constants, each broadcast to a full vector
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const float mul = intensity_target / 255.0f;
+  for (size_t i = 0; i < 9; ++i) {  // entries 0..8: the kOpsinAbsorbanceMatrix values scaled by `mul`
+    const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] * mul);
+    Store(absorb, d, premul_absorb + i * N);
+  }
+  for (size_t i = 0; i < 3; ++i) {  // entries 9..11: negated cube roots of the three bias values
+    const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i]));
+    Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N);
+  }
+}
+
+Image3F TransformToLinearRGB(const Image3F& in,  // returns a new image: `in` converted from color_encoding to linear sRGB through `cms`
+                             const ColorEncoding& color_encoding,
+                             float intensity_target, const JxlCmsInterface& cms,
+                             ThreadPool* pool) {
+  ColorSpaceTransform c_transform(cms);
+  bool is_gray = color_encoding.IsGray();
+  const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(is_gray);
+  Image3F out(in.xsize(), in.ysize());
+  std::atomic<bool> ok{true};  // cleared by any row task whose transform fails; checked after the pool run
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<uint32_t>(in.ysize()),  // explicit narrowing, as in the other RunOnPool calls in this file
+      [&](const size_t num_threads) {  // init callback: sets up the transform for num_threads workers
+        return c_transform.Init(color_encoding, c_desired, intensity_target,
+                                in.xsize(), num_threads);
+      },
+      [&](const uint32_t y, const size_t thread) {  // row callback: converts row y using the buffers of `thread`
+        float* mutable_src_buf = c_transform.BufSrc(thread);
+        const float* src_buf = mutable_src_buf;
+        // Interleave input (color only; a gray row is handed to the transform directly from plane 0).
+        if (is_gray) {
+          src_buf = in.ConstPlaneRow(0, y);
+        } else {
+          const float* JXL_RESTRICT row_in0 = in.ConstPlaneRow(0, y);
+          const float* JXL_RESTRICT row_in1 = in.ConstPlaneRow(1, y);
+          const float* JXL_RESTRICT row_in2 = in.ConstPlaneRow(2, y);
+          for (size_t x = 0; x < in.xsize(); x++) {
+            mutable_src_buf[3 * x + 0] = row_in0[x];
+            mutable_src_buf[3 * x + 1] = row_in1[x];
+            mutable_src_buf[3 * x + 2] = row_in2[x];
+          }
+        }
+        float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread);
+        if (!c_transform.Run(thread, src_buf, dst_buf)) {
+          ok.store(false);  // record the failure; this row of `out` is left unwritten
+          return;
+        }
+        float* JXL_RESTRICT row_out0 = out.PlaneRow(0, y);
+        float* JXL_RESTRICT row_out1 = out.PlaneRow(1, y);
+        float* JXL_RESTRICT row_out2 = out.PlaneRow(2, y);
+        // De-interleave output; a gray result is replicated into all three planes.
+        if (is_gray) {
+          for (size_t x = 0; x < in.xsize(); x++) {
+            row_out0[x] = dst_buf[x];
+            row_out1[x] = dst_buf[x];
+            row_out2[x] = dst_buf[x];
+          }
+        } else {
+          for (size_t x = 0; x < in.xsize(); x++) {
+            row_out0[x] = dst_buf[3 * x + 0];
+            row_out1[x] = dst_buf[3 * x + 1];
+            row_out2[x] = dst_buf[3 * x + 2];
+          }
+        }
+      },
+      "Colorspace transform"));
+  JXL_CHECK(ok.load());  // any per-row transform failure is treated as a failed check
+  return out;
+}
+
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,  // writes the XYB version of `in` to *xyb, choosing the cheapest available path
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
+  JXL_ASSERT(SameSize(in, *xyb));
+
+  const HWY_FULL(float) d;
+  // Pre-broadcasted constants
+  HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
+  ComputePremulAbsorb(intensity_target, premul_absorb);
+
+  bool is_gray = color_encoding.IsGray();
+  const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(is_gray);
+  if (c_linear_srgb.SameColorEncoding(color_encoding)) {  // already linear sRGB: convert directly
+    JXL_CHECK(LinearSRGBToXYB(in, premul_absorb, pool, xyb));
+  } else if (color_encoding.IsSRGB()) {  // sRGB: the transfer function is undone inside SRGBToXYB
+    JXL_CHECK(SRGBToXYB(in, premul_absorb, pool, xyb));
+  } else {  // anything else: go through the CMS into a temporary linear image first
+    Image3F linear =
+        TransformToLinearRGB(in, color_encoding, intensity_target, cms, pool);
+    JXL_CHECK(LinearSRGBToXYB(linear, premul_absorb, pool, xyb));
+  }
+}
+
+// Writes the XYB version of `in` to *xyb; returns `linear` (filled with linear sRGB) if it is non-null, else &in. This is
+// different from Butteraugli's OpsinDynamicsImage(): it has no sensitivity multiplier based on the blurred image.
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
+                         Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
+                         ImageBundle* const JXL_RESTRICT linear) {
+  PROFILER_FUNC;
+
+  const size_t xsize = in.xsize();
+  const size_t ysize = in.ysize();
+  JXL_ASSERT(SameSize(in, *xyb));
+
+  const HWY_FULL(float) d;
+  // Pre-broadcasted constants
+  HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
+  ComputePremulAbsorb(in.metadata()->IntensityTarget(), premul_absorb);
+
+  const bool want_linear = linear != nullptr;  // a non-null `linear` means the caller also wants the linear sRGB image
+
+  const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(in.IsGray());
+  // Linear sRGB inputs are rare but can be useful for the fastest encoders, for
+  // which undoing the sRGB transfer function would be a large part of the cost.
+  if (c_linear_srgb.SameColorEncoding(in.c_current())) {
+    JXL_CHECK(LinearSRGBToXYB(in.color(), premul_absorb, pool, xyb));
+    // This only happens if kitten or slower, moving ImageBundle might be
+    // possible but the encoder is much slower than this copy.
+    if (want_linear) {
+      *linear = in.Copy();
+      return linear;
+    }
+    return &in;
+  }
+
+  // Common case: already sRGB, can avoid the color transform
+  if (in.IsSRGB()) {
+    // Common case: can avoid allocating/copying
+    if (!want_linear) {
+      JXL_CHECK(SRGBToXYB(in.color(), premul_absorb, pool, xyb));
+      return &in;
+    }
+
+    // Slow encoder also wants linear sRGB.
+    linear->SetFromImage(Image3F(xsize, ysize), c_linear_srgb);  // allocate the destination before filling it below
+    JXL_CHECK(SRGBToXYBAndLinear(in.color(), premul_absorb, pool, xyb,
+                                 linear->color()));
+    return linear;
+  }
+
+  // General case: not sRGB, need color transform.
+  ImageBundle linear_storage;  // Local storage only used if !want_linear.
+
+  ImageBundle* linear_storage_ptr;
+  if (want_linear) {
+    // Caller asked for linear, use that storage directly.
+    linear_storage_ptr = linear;
+  } else {
+    // Caller didn't ask for linear, create our own local storage
+    // OK to reuse metadata, it will not be changed.
+    linear_storage = ImageBundle(const_cast<ImageMetadata*>(in.metadata()));
+    linear_storage_ptr = &linear_storage;
+  }
+
+  const ImageBundle* ptr;
+  JXL_CHECK(TransformIfNeeded(in, c_linear_srgb, cms, pool, linear_storage_ptr,
+                              &ptr));
+  // If no transform was necessary, should have taken the above codepath.
+  JXL_ASSERT(ptr == linear_storage_ptr);
+
+  JXL_CHECK(
+      LinearSRGBToXYB(*linear_storage_ptr->color(), premul_absorb, pool, xyb));
+  return want_linear ? linear : &in;  // the local linear_storage is never returned
+}
+
+// Transform RGB to YCbCr.
+// Could be performed in-place (i.e. Y, Cb and Cr could alias R, G and B).
+Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
+                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
+                  ImageF* cr_plane, ThreadPool* pool) {
+  const HWY_FULL(float) df;
+  const size_t S = Lanes(df);  // Step.
+
+  const size_t xsize = r_plane.xsize();
+  const size_t ysize = r_plane.ysize();
+  if ((xsize == 0) || (ysize == 0)) return true;
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto k128 = Set(df, 128.0f / 255);
+  const auto kR = Set(df, 0.299f);  // NTSC luma
+  const auto kG = Set(df, 0.587f);
+  const auto kB = Set(df, 0.114f);
+  const auto kAmpR = Set(df, 0.701f);
+  const auto kAmpB = Set(df, 0.886f);
+  const auto kDiffR = Add(kAmpR, kR);
+  const auto kDiffB = Add(kAmpB, kB);
+  const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB))));
+  const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB))));
+
+  constexpr size_t kGroupArea = kGroupDim * kGroupDim;
+  const size_t lines_per_group = DivCeil(kGroupArea, xsize);
+  const size_t num_stripes = DivCeil(ysize, lines_per_group);
+  const auto transform = [&](int idx, int /* thread*/) {
+    const size_t y0 = idx * lines_per_group;
+    const size_t y1 = std::min<size_t>(y0 + lines_per_group, ysize);
+    for (size_t y = y0; y < y1; ++y) {
+      const float* r_row = r_plane.ConstRow(y);
+      const float* g_row = g_plane.ConstRow(y);
+      const float* b_row = b_plane.ConstRow(y);
+      float* y_row = y_plane->Row(y);
+      float* cb_row = cb_plane->Row(y);
+      float* cr_row = cr_plane->Row(y);
+      for (size_t x = 0; x < xsize; x += S) {
+        const auto r = Load(df, r_row + x);
+        const auto g = Load(df, g_row + x);
+        const auto b = Load(df, b_row + x);
+        const auto r_base = Mul(r, kR);
+        const auto r_diff = Mul(r, kDiffR);
+        const auto g_base = Mul(g, kG);
+        const auto b_base = Mul(b, kB);
+        const auto b_diff = Mul(b, kDiffB);
+        const auto y_base = Add(r_base, Add(g_base, b_base));
+        const auto y_vec = Sub(y_base, k128);
+        const auto cb_vec = Mul(Sub(b_diff, y_base), kNormB);
+        const auto cr_vec = Mul(Sub(r_diff, y_base), kNormR);
+        Store(y_vec, df, y_row + x);
+        Store(cb_vec, df, cb_row + x);
+        Store(cr_vec, df, cr_row + x);
+      }
+    }
+  };
+  return RunOnPool(pool, 0, static_cast<int>(num_stripes), ThreadPool::NoInit,
+                   transform, "RgbToYcbCr");
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ToXYB);  // Enables HWY_DYNAMIC_DISPATCH(ToXYB) below.
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
+                         Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
+                         ImageBundle* JXL_RESTRICT linear_storage) {
+  return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, cms, linear_storage);
+}
+
+HWY_EXPORT(LinearRGBRowToXYB);  // Enables the dynamic dispatch below.
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize) {
+  HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB)
+  (row0, row1, row2, premul_absorb, xsize);
+}
+
+HWY_EXPORT(ComputePremulAbsorb);  // Enables the dynamic dispatch below.
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
+  HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb);
+}
+
+void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                 float* JXL_RESTRICT row2, size_t xsize) {
+  for (size_t x = 0; x < xsize; x++) {  // row2 first: needs raw row1.
+    row2[x] = (row2[x] - row1[x] + kScaledXYBOffset[2]) * kScaledXYBScale[2];
+    row0[x] = (row0[x] + kScaledXYBOffset[0]) * kScaledXYBScale[0];
+    row1[x] = (row1[x] + kScaledXYBOffset[1]) * kScaledXYBScale[1];
+  }
+}
+
+void ScaleXYB(Image3F* opsin) {  // Applies ScaleXYBRow to every row.
+  for (size_t y = 0; y < opsin->ysize(); y++) {
+    float* row0 = opsin->PlaneRow(0, y);
+    float* row1 = opsin->PlaneRow(1, y);
+    float* row2 = opsin->PlaneRow(2, y);
+    ScaleXYBRow(row0, row1, row2, opsin->xsize());  // Modifies rows in place.
+  }
+}
+
+HWY_EXPORT(Image3FToXYB);  // Enables the dynamic dispatch below.
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
+  return HWY_DYNAMIC_DISPATCH(Image3FToXYB)(in, color_encoding,
+                                            intensity_target, pool, xyb, cms);
+}
+
+HWY_EXPORT(RgbToYcbcr);  // Enables the dynamic dispatch below.
+Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
+                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
+                  ImageF* cr_plane, ThreadPool* pool) {
+  return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane,
+                                          cb_plane, cr_plane, pool);
+}
+
+// DEPRECATED. Converts an 8-bit sRGB image to XYB via linear sRGB.
+Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms) {
+  ImageMetadata metadata;
+  metadata.SetUintSamples(8);
+  metadata.color_encoding = ColorEncoding::SRGB();
+  ImageBundle ib(&metadata);
+  ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding);
+  JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray()), cms));
+  ThreadPool* null_pool = nullptr;  // No thread pool is passed to ToXYB.
+  Image3F xyb(srgb8.xsize(), srgb8.ysize());
+
+  ImageBundle linear_storage(&metadata);
+  (void)ToXYB(ib, null_pool, &xyb, cms, &linear_storage);  // Result unused.
+  return xyb;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_xyb.h b/third_party/jpeg-xl/lib/jxl/enc_xyb.h
new file mode 100644
index 0000000000..fc902848ee
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.h
@@ -0,0 +1,56 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_XYB_H_
+#define LIB_JXL_ENC_XYB_H_
+
+// Converts to XYB color space.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Converts any color space to XYB. If `linear` is not null, returns `linear`
+// after filling it with a linear sRGB copy of `in`. Otherwise, returns `&in`.
+//
+// NOTE this return value can avoid an extra color conversion if `in` would
+// later be passed to JxlButteraugliComparator.
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
+                         Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
+                         ImageBundle* JXL_RESTRICT linear = nullptr);
+
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms);
+
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize);
+
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb);
+
+// Transforms each color component of the given XYB image into the [0.0, 1.0]
+// interval with an affine transform.
+void ScaleXYB(Image3F* opsin);
+void ScaleXYBRow(float* row0, float* row1, float* row2, size_t xsize);
+
+// Bt.601 to match JPEG/JFIF. Outputs _signed_ YCbCr values suitable for DCT,
+// see F.1.1.3 of T.81 (because our data type is float, there is no need to add
+// a bias to make the values unsigned).
+Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
+                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
+                  ImageF* cr_plane, ThreadPool* pool);
+
+// DEPRECATED, used by opsin_image_wrapper.
+Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/encode.cc b/third_party/jpeg-xl/lib/jxl/encode.cc
new file mode 100644
index 0000000000..fbd5133ae5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/encode.cc
@@ -0,0 +1,2128 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <brotli/encode.h>
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_fast_lossless.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/exif.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/luminance.h"
+#include "lib/jxl/sanitizers.h"
+
+// Debug-printing failure macro similar to JXL_FAILURE, but for the status code
+// JXL_ENC_ERROR
+#ifdef JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(enc, error_code, format, ...)                          \
+  (enc->error = error_code,                                                  \
+   ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_ENC_ERROR)
+#define JXL_API_ERROR_NOSET(format, ...)                                     \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_ENC_ERROR)
+#else  // JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(enc, error_code, format, ...)                            \
+  (enc->error = error_code,                                                    \
+   ((JXL_DEBUG_ON_ERROR) &&                                                    \
+    ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \
+   JXL_ENC_ERROR)
+#define JXL_API_ERROR_NOSET(format, ...)                                     \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   JXL_ENC_ERROR)
+#endif  // JXL_CRASH_ON_ERROR
+
+namespace jxl {}  // namespace jxl
+
+uint32_t JxlEncoderVersion(void) {  // major * 1000000 + minor * 1000 + patch
+  return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 +
+         JPEGXL_PATCH_VERSION;
+}
+
+namespace {
+template <typename T>
+void AppendJxlpBoxCounter(uint32_t counter, bool last, T* output) {
+  if (last) counter |= 0x80000000;  // Top bit flags the last box.
+  for (size_t i = 0; i < 4; i++) {  // Most significant byte first.
+    output->push_back(counter >> (8 * (3 - i)) & 0xff);
+  }
+}
+
+void QueueFrame(
+    const JxlEncoderFrameSettings* frame_settings,
+    jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedFrame>& frame) {
+  if (frame_settings->values.lossless) {  // Applied before queueing.
+    frame->option_values.cparams.SetLossless();
+  }
+
+  jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager);
+  queued_input.frame = std::move(frame);  // `frame` is moved from.
+  frame_settings->enc->input_queue.emplace_back(std::move(queued_input));
+  frame_settings->enc->num_queued_frames++;
+}
+
+void QueueFastLosslessFrame(const JxlEncoderFrameSettings* frame_settings,
+                            JxlFastLosslessFrameState* fast_lossless_frame) {
+  jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager);
+  queued_input.fast_lossless_frame.reset(fast_lossless_frame);  // Adopts ptr.
+  frame_settings->enc->input_queue.emplace_back(std::move(queued_input));
+  frame_settings->enc->num_queued_frames++;  // Counted as a queued frame.
+}
+
+void QueueBox(JxlEncoder* enc,
+              jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedBox>& box) {
+  jxl::JxlEncoderQueuedInput queued_input(enc->memory_manager);
+  queued_input.box = std::move(box);  // `box` is moved from.
+  enc->input_queue.emplace_back(std::move(queued_input));
+  enc->num_queued_boxes++;
+}
+
+// TODO(lode): share this code and the Brotli compression code in enc_jpeg_data
+JxlEncoderStatus BrotliCompress(int quality, const uint8_t* in, size_t in_size,
+                                jxl::PaddedBytes* out) {
+  std::unique_ptr<BrotliEncoderState, decltype(BrotliEncoderDestroyInstance)*>
+      enc(BrotliEncoderCreateInstance(nullptr, nullptr, nullptr),
+          BrotliEncoderDestroyInstance);
+  if (!enc) return JXL_API_ERROR_NOSET("BrotliEncoderCreateInstance failed");
+
+  BrotliEncoderSetParameter(enc.get(), BROTLI_PARAM_QUALITY, quality);
+  BrotliEncoderSetParameter(enc.get(), BROTLI_PARAM_SIZE_HINT, in_size);
+
+  constexpr size_t kBufferSize = 128 * 1024;  // Output chunk per iteration.
+  jxl::PaddedBytes temp_buffer(kBufferSize);
+
+  size_t avail_in = in_size;
+  const uint8_t* next_in = in;
+
+  size_t total_out = 0;
+
+  for (;;) {  // Runs until the encoder reports that it is finished.
+    size_t avail_out = kBufferSize;
+    uint8_t* next_out = temp_buffer.data();
+    jxl::msan::MemoryIsInitialized(next_in, avail_in);
+    if (!BrotliEncoderCompressStream(enc.get(), BROTLI_OPERATION_FINISH,
+                                     &avail_in, &next_in, &avail_out, &next_out,
+                                     &total_out)) {
+      return JXL_API_ERROR_NOSET("Brotli compression failed");
+    }
+    size_t out_size = next_out - temp_buffer.data();  // Bytes produced now.
+    jxl::msan::UnpoisonMemory(next_out - out_size, out_size);
+    out->resize(out->size() + out_size);  // Appends; existing bytes are kept.
+    memcpy(out->data() + out->size() - out_size, temp_buffer.data(), out_size);
+    if (BrotliEncoderIsFinished(enc.get())) break;
+  }
+
+  return JXL_ENC_SUCCESS;
+}
+
+// The JXL codestream can have level 5 or level 10. Levels have certain
+// restrictions such as max allowed image dimensions. Returns the lowest level
+// (5 or 10) that supports the current encoder settings, or -1 if even level 10
+// cannot. The debug_string is intended for developer API error messages (it
+// names the first failed check) and may be set to nullptr.
+int VerifyLevelSettings(const JxlEncoder* enc, std::string* debug_string) {
+  const auto& m = enc->metadata.m;
+
+  uint64_t xsize = enc->metadata.size.xsize();
+  uint64_t ysize = enc->metadata.size.ysize();
+  // The uncompressed ICC size, if it is used.
+  size_t icc_size = 0;
+  if (m.color_encoding.WantICC()) {
+    icc_size = m.color_encoding.ICC().size();
+  }
+
+  // Level 10 checks: a failure here means no level fits, so return -1.
+
+  if (xsize > (1ull << 30ull) || ysize > (1ull << 30ull) ||
+      xsize * ysize > (1ull << 40ull)) {
+    if (debug_string) *debug_string = "Too large image dimensions";
+    return -1;
+  }
+  if (icc_size > (1ull << 28)) {
+    if (debug_string) *debug_string = "Too large ICC profile size";
+    return -1;
+  }
+  if (m.num_extra_channels > 256) {
+    if (debug_string) *debug_string = "Too many extra channels";
+    return -1;
+  }
+
+  // Level 5 checks: a failure here means level 10 is required.
+
+  if (!m.modular_16_bit_buffer_sufficient) {
+    if (debug_string) *debug_string = "Too high modular bit depth";
+    return 10;
+  }
+  if (xsize > (1ull << 18ull) || ysize > (1ull << 18ull) ||
+      xsize * ysize > (1ull << 28ull)) {
+    if (debug_string) *debug_string = "Too large image dimensions";
+    return 10;
+  }
+  if (icc_size > (1ull << 22)) {
+    if (debug_string) *debug_string = "Too large ICC profile";
+    return 10;
+  }
+  if (m.num_extra_channels > 4) {
+    if (debug_string) *debug_string = "Too many extra channels";
+    return 10;
+  }
+  for (size_t i = 0; i < m.extra_channel_info.size(); ++i) {
+    if (m.extra_channel_info[i].type == jxl::ExtraChannel::kBlack) {
+      if (debug_string) *debug_string = "CMYK channel not allowed";
+      return 10;
+    }
+  }
+
+  // TODO(lode): also need to check if consecutive composite-still frames total
+  // pixel amount doesn't exceed 2**28 in the case of level 5. This should be
+  // done when adding a frame, and first requires the ability to add composite
+  // still frames.
+
+  // TODO(lode): also need to check animation duration of a frame. This should
+  // be done when adding frame, but first requires implementing setting the
+  // JxlFrameHeader for a frame.
+
+  // TODO(lode): also need to check properties such as num_splines, num_patches,
+  // modular_16bit_buffers and multiple properties of modular trees. However
+  // these are not user-set properties so cannot be checked here, but decisions
+  // the C++ encoder should be able to make based on the level.
+
+  // All level 5 checks passed, so can return the more compatible level 5
+  return 5;
+}
+
+size_t BitsPerChannel(JxlDataType data_type) {  // Sample width in bits.
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      return 0;  // signals unhandled JxlDataType
+  }
+}
+
+template <typename T>
+uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                     JxlPixelFormat format) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return BitsPerChannel(format.data_type);  // Width of the buffer's type.
+  } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return metadata.bit_depth.bits_per_sample;  // Value from `metadata`.
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    return bit_depth.bits_per_sample;  // Value supplied in `bit_depth`.
+  } else {
+    return 0;  // Unrecognized bit_depth.type.
+  }
+}
+
+JxlEncoderStatus CheckValidBitdepth(uint32_t bits_per_sample,
+                                    uint32_t exponent_bits_per_sample) {
+  if (!exponent_bits_per_sample) {  // No exponent bits: integer samples.
+    // The spec allows up to 31 for bits_per_sample here, but
+    // the code does not (yet) support it.
+    if (!(bits_per_sample > 0 && bits_per_sample <= 24)) {
+      return JXL_API_ERROR_NOSET("Invalid value for bits_per_sample");
+    }
+  } else if ((exponent_bits_per_sample > 8) ||
+             (bits_per_sample > 24 + exponent_bits_per_sample) ||
+             (bits_per_sample < 3 + exponent_bits_per_sample)) {
+    return JXL_API_ERROR_NOSET("Invalid float description");
+  }
+  return JXL_ENC_SUCCESS;  // Both integer and float checks passed.
+}
+
+JxlEncoderStatus VerifyInputBitDepth(JxlBitDepth bit_depth,
+                                     JxlPixelFormat format) {
+  if ((format.data_type == JXL_TYPE_FLOAT ||
+       format.data_type == JXL_TYPE_FLOAT16) &&
+      bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return JXL_API_ERROR_NOSET(
+        "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is "
+        "implemented for float types.");
+  }
+  return JXL_ENC_SUCCESS;  // Non-float formats are not restricted here.
+}
+
+bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box,
+                         jxl::BitWriter& writer) {
+  bool ok = true;
+  int NF = 0;  // Number of index entries: entry 0 plus every indexed frame.
+  for (size_t i = 0; i < frame_index_box.entries.size(); ++i) {
+    if (i == 0 || frame_index_box.entries[i].to_be_indexed) {
+      ++NF;
+    }
+  }
+  // Frame index box contents varint + 8 bytes
+  // continue with NF * 3 * varint
+  // varint max length is 10 for 64 bit numbers, and these numbers
+  // are limited to 63 bits.
+  static const int kVarintMaxLength = 10;
+  static const int kFrameIndexBoxHeaderLength = kVarintMaxLength + 8;
+  static const int kFrameIndexBoxElementLength = 3 * kVarintMaxLength;
+  const int buffer_size =
+      kFrameIndexBoxHeaderLength + NF * kFrameIndexBoxElementLength;
+  std::vector<uint8_t> buffer_vec(buffer_size);
+  uint8_t* buffer = buffer_vec.data();
+  size_t output_pos = 0;
+  ok &= jxl::EncodeVarInt(NF, buffer_vec.size(), &output_pos, buffer);
+  StoreBE32(frame_index_box.TNUM, &buffer[output_pos]);
+  output_pos += 4;
+  StoreBE32(frame_index_box.TDEN, &buffer[output_pos]);
+  output_pos += 4;
+  // When we record a frame in the index, the record needs to know
+  // how many frames until the next indexed frame. That is why
+  // we store the 'prev' record. That 'prev' record needs to store
+  // the offset byte position to previously recorded indexed frame,
+  // that's why we also trace previous to the previous frame.
+  int prev_prev_ix = -1;  // For position offset (OFFi) delta coding.
+  int prev_ix = 0;
+  int T_prev = 0;
+  int T = 0;
+  for (size_t i = 1; i < frame_index_box.entries.size(); ++i) {
+    if (frame_index_box.entries[i].to_be_indexed) {
+      // Now we can record the previous entry, since we need to store
+      // there how many frames until the next one.
+      int64_t OFFi = frame_index_box.entries[prev_ix].OFFi;
+      if (prev_prev_ix != -1) {
+        // OFFi needs to be offset of start byte of this frame compared to start
+        // byte of previous frame from this index in the JPEG XL codestream. For
+        // the first frame, this is the offset from the first byte of the JPEG
+        // XL codestream.
+        OFFi -= frame_index_box.entries[prev_prev_ix].OFFi;
+      }
+      int32_t Ti = T_prev;
+      int32_t Fi = i - prev_ix;
+      ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
+      ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
+      ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
+      prev_prev_ix = prev_ix;
+      prev_ix = i;
+      T_prev = T;
+      T += frame_index_box.entries[i].duration;
+    }
+  }
+  {
+    // Last frame.
+    size_t i = frame_index_box.entries.size();
+    int64_t OFFi = frame_index_box.entries[prev_ix].OFFi;
+    if (prev_prev_ix != -1) {
+      OFFi -= frame_index_box.entries[prev_prev_ix].OFFi;
+    }
+    int32_t Ti = T_prev;
+    int32_t Fi = i - prev_ix;
+    ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
+    ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
+    ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
+  }
+  // The buffer is sized for the worst case, so the writes above cannot fail.
+  // NOTE(review): `buffer` is never copied into `writer` here -- confirm.
+  JXL_ASSERT(ok);
+  return ok;
+}
+
+}  // namespace
+
+JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
+  jxl::PaddedBytes bytes;
+
+  jxl::JxlEncoderQueuedInput& input = input_queue[0];
+
+  // TODO(lode): split this into 3 functions: for adding the signature and other
+  // initial headers (jbrd, ...), one for adding frame, and one for adding user
+  // box.
+
+  if (!wrote_bytes) {
+    // First time encoding any data, verify the level 5 vs level 10 settings
+    std::string level_message;
+    int required_level = VerifyLevelSettings(this, &level_message);
+    // Only level 5 and 10 are defined, and the function can return -1 to
+    // indicate full incompatibility.
+    JXL_ASSERT(required_level == -1 || required_level == 5 ||
+               required_level == 10);
+    // codestream_level == -1 means auto-set to the required level
+    if (codestream_level == -1) codestream_level = required_level;
+    if (codestream_level == 5 && required_level != 5) {
+      // If the required level is 10, return error rather than automatically
+      // setting the level to 10, to avoid inadvertently creating a level 10
+      // JXL file while intending to target a level 5 decoder.
+      return JXL_API_ERROR(
+          this, JXL_ENC_ERR_API_USAGE, "%s",
+          ("Codestream level verification for level 5 failed: " + level_message)
+              .c_str());
+    }
+    if (required_level == -1) {
+      return JXL_API_ERROR(
+          this, JXL_ENC_ERR_API_USAGE, "%s",
+          ("Codestream level verification for level 10 failed: " +
+           level_message)
+              .c_str());
+    }
+
+    jxl::BitWriter writer;
+    if (!WriteCodestreamHeaders(&metadata, &writer, nullptr)) {
+      return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                           "Failed to write codestream header");
+    }
+    // Only send ICC (at least several hundred bytes) if fields aren't enough.
+    if (metadata.m.color_encoding.WantICC()) {
+      if (!jxl::WriteICC(metadata.m.color_encoding.ICC(), &writer,
+                         jxl::kLayerHeader, nullptr)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Failed to write ICC profile");
+      }
+    }
+    // TODO(lode): preview should be added here if a preview image is added
+
+    writer.ZeroPadToByte();
+
+    // Not actually the end of frame, but the end of metadata/ICC, but helps
+    // the next frame to start here for indexing purposes.
+    codestream_bytes_written_end_of_frame +=
+        jxl::DivCeil(writer.BitsWritten(), 8);
+
+    bytes = std::move(writer).TakeBytes();
+
+    if (MustUseContainer()) {
+      // Add "JXL " and ftyp box.
+      output_byte_queue.insert(
+          output_byte_queue.end(), jxl::kContainerHeader,
+          jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+      if (codestream_level != 5) {
+        // Add jxll box directly after the ftyp box to indicate the codestream
+        // level.
+        output_byte_queue.insert(
+            output_byte_queue.end(), jxl::kLevelBoxHeader,
+            jxl::kLevelBoxHeader + sizeof(jxl::kLevelBoxHeader));
+        output_byte_queue.push_back(codestream_level);
+      }
+
+      // Whether to write the basic info and color profile header of the
+      // codestream into an early separate jxlp box, so that it comes before
+      // metadata or jpeg reconstruction boxes. In theory this could simply
+      // always be done, but there's no reason to add an extra box with box
+      // header overhead if the codestream will already come immediately after
+      // the signature and level boxes.
+      bool partial_header =
+          store_jpeg_metadata ||
+          (use_boxes && (!input.frame && !input.fast_lossless_frame));
+
+      if (partial_header) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), bytes.size() + 4,
+                             /*unbounded=*/false, &output_byte_queue);
+        AppendJxlpBoxCounter(jxlp_counter++, /*last=*/false,
+                             &output_byte_queue);
+        output_byte_queue.insert(output_byte_queue.end(), bytes.data(),
+                                 bytes.data() + bytes.size());
+        bytes.clear();
+      }
+
+      if (store_jpeg_metadata && !jpeg_metadata.empty()) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_metadata.size(),
+                             false, &output_byte_queue);
+        output_byte_queue.insert(output_byte_queue.end(), jpeg_metadata.begin(),
+                                 jpeg_metadata.end());
+      }
+    }
+    wrote_bytes = true;
+  }
+
+  // Choose frame or box processing: exactly one of the two unique pointers (box
+  // or frame) in the input queue item is non-null.
+  if (input.frame || input.fast_lossless_frame) {
+    jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedFrame> input_frame =
+        std::move(input.frame);
+    if (input.fast_lossless_frame) {
+      output_fast_frame_queue.push_back(std::move(input.fast_lossless_frame));
+    }
+    input_queue.erase(input_queue.begin());
+    num_queued_frames--;
+    if (input_frame) {
+      for (unsigned idx = 0; idx < input_frame->ec_initialized.size(); idx++) {
+        if (!input_frame->ec_initialized[idx]) {
+          return JXL_API_ERROR(this, JXL_ENC_ERR_API_USAGE,
+                               "Extra channel %u is not initialized", idx);
+        }
+      }
+
+      // TODO(zond): If the input queue is empty and the frames_closed is true,
+      // then mark this frame as the last.
+
+      // TODO(zond): Handle progressive mode like EncodeFile does it.
+      // TODO(zond): Handle animation like EncodeFile does it, by checking if
+      //             JxlEncoderCloseFrames has been called and if the frame
+      //             queue is empty (to see if it's the last animation frame).
+
+      if (metadata.m.xyb_encoded) {
+        input_frame->option_values.cparams.color_transform =
+            jxl::ColorTransform::kXYB;
+      } else {
+        // TODO(zond): Figure out when to use kYCbCr instead.
+        input_frame->option_values.cparams.color_transform =
+            jxl::ColorTransform::kNone;
+      }
+    }
+
+    uint32_t duration;
+    uint32_t timecode;
+    if (input_frame && metadata.m.have_animation) {
+      duration = input_frame->option_values.header.duration;
+      timecode = input_frame->option_values.header.timecode;
+    } else {
+      // If have_animation is false, the encoder should ignore the duration and
+      // timecode values. However, assigning them to ib will cause the encoder
+      // to write an invalid frame header that can't be decoded so ensure
+      // they're the default value of 0 here.
+      duration = 0;
+      timecode = 0;
+    }
+
+    bool last_frame = frames_closed && !num_queued_frames;
+
+    size_t codestream_byte_size = 0;
+
+    jxl::BitWriter writer;
+
+    if (input_frame) {
+      jxl::PassesEncoderState enc_state;
+
+      frame_index_box.AddFrame(codestream_bytes_written_end_of_frame, duration,
+                               input_frame->option_values.frame_index_box);
+
+      // EncodeFrame creates jxl::FrameHeader object internally based on the
+      // FrameInfo, imagebundle, cparams and metadata. Copy the information to
+      // these.
+      jxl::ImageBundle& ib = input_frame->frame;
+      ib.duration = duration;
+      ib.timecode = timecode;
+      ib.name = input_frame->option_values.frame_name;
+      ib.blendmode = static_cast<jxl::BlendMode>(
+          input_frame->option_values.header.layer_info.blend_info.blendmode);
+      ib.blend =
+          input_frame->option_values.header.layer_info.blend_info.blendmode !=
+          JXL_BLEND_REPLACE;
+
+      size_t save_as_reference =
+          input_frame->option_values.header.layer_info.save_as_reference;
+      if (save_as_reference >= 3) {
+        return JXL_API_ERROR(
+            this, JXL_ENC_ERR_API_USAGE,
+            "Cannot use save_as_reference values >=3 (found: %d)",
+            (int)save_as_reference);
+      }
+      ib.use_for_next_frame = !!save_as_reference;
+
+      jxl::FrameInfo frame_info;
+      frame_info.is_last = last_frame;
+      frame_info.save_as_reference = save_as_reference;
+      frame_info.source =
+          input_frame->option_values.header.layer_info.blend_info.source;
+      frame_info.clamp =
+          input_frame->option_values.header.layer_info.blend_info.clamp;
+      frame_info.alpha_channel =
+          input_frame->option_values.header.layer_info.blend_info.alpha;
+      frame_info.extra_channel_blending_info.resize(
+          metadata.m.num_extra_channels);
+      // If extra channel blend info has not been set, use the blend mode from
+      // the layer_info.
+      JxlBlendInfo default_blend_info =
+          input_frame->option_values.header.layer_info.blend_info;
+      for (size_t i = 0; i < metadata.m.num_extra_channels; ++i) {
+        auto& to = frame_info.extra_channel_blending_info[i];
+        const auto& from =
+            i < input_frame->option_values.extra_channel_blend_info.size()
+                ? input_frame->option_values.extra_channel_blend_info[i]
+                : default_blend_info;
+        to.mode = static_cast<jxl::BlendMode>(from.blendmode);
+        to.source = from.source;
+        to.alpha_channel = from.alpha;
+        to.clamp = (from.clamp != 0);
+      }
+
+      if (input_frame->option_values.header.layer_info.have_crop) {
+        ib.origin.x0 = input_frame->option_values.header.layer_info.crop_x0;
+        ib.origin.y0 = input_frame->option_values.header.layer_info.crop_y0;
+      }
+      JXL_ASSERT(writer.BitsWritten() == 0);
+      if (!jxl::EncodeFrame(input_frame->option_values.cparams, frame_info,
+                            &metadata, input_frame->frame, &enc_state, cms,
+                            thread_pool.get(), &writer,
+                            /*aux_out=*/nullptr)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Failed to encode frame");
+      }
+      codestream_bytes_written_beginning_of_frame =
+          codestream_bytes_written_end_of_frame;
+      codestream_bytes_written_end_of_frame +=
+          jxl::DivCeil(writer.BitsWritten(), 8);
+
+      // Possibly bytes already contains the codestream header: in case this is
+      // the first frame, and the codestream header was not encoded as jxlp
+      // above.
+      bytes.append(std::move(writer).TakeBytes());
+      codestream_byte_size = bytes.size();
+    } else {
+      JXL_CHECK(!output_fast_frame_queue.empty());
+      JxlFastLosslessPrepareHeader(output_fast_frame_queue.front().get(),
+                                   /*add_image_header=*/0, last_frame);
+      codestream_byte_size =
+          JxlFastLosslessOutputSize(output_fast_frame_queue.front().get()) +
+          bytes.size();
+    }
+
+    if (MustUseContainer()) {
+      if (last_frame && jxlp_counter == 0) {
+        // If this is the last frame and no jxlp boxes were used yet, it's
+        // slightly more efficient to write a jxlc box since it has 4 bytes
+        // less overhead.
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), codestream_byte_size,
+                             /*unbounded=*/false, &output_byte_queue);
+      } else {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), codestream_byte_size + 4,
+                             /*unbounded=*/false, &output_byte_queue);
+        AppendJxlpBoxCounter(jxlp_counter++, last_frame, &output_byte_queue);
+      }
+    }
+
+    output_byte_queue.insert(output_byte_queue.end(), bytes.data(),
+                             bytes.data() + bytes.size());
+
+    if (input_frame) {
+      last_used_cparams = input_frame->option_values.cparams;
+    }
+    if (last_frame && frame_index_box.StoreFrameIndexBox()) {
+      bytes.clear();
+      EncodeFrameIndexBox(frame_index_box, writer);
+      jxl::AppendBoxHeader(jxl::MakeBoxType("jxli"), bytes.size(),
+                           /*unbounded=*/false, &output_byte_queue);
+    }
+  } else {
+    // Not a frame, so is a box instead
+    jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedBox> box =
+        std::move(input.box);
+    input_queue.erase(input_queue.begin());
+    num_queued_boxes--;
+
+    if (box->compress_box) {
+      jxl::PaddedBytes compressed(4);
+      // Prepend the original box type in the brob box contents
+      for (size_t i = 0; i < 4; i++) {
+        compressed[i] = static_cast<uint8_t>(box->type[i]);
+      }
+      if (JXL_ENC_SUCCESS !=
+          BrotliCompress((brotli_effort >= 0 ? brotli_effort : 4),
+                         box->contents.data(), box->contents.size(),
+                         &compressed)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Brotli compression for brob box failed");
+      }
+      jxl::AppendBoxHeader(jxl::MakeBoxType("brob"), compressed.size(), false,
+                           &output_byte_queue);
+      output_byte_queue.insert(output_byte_queue.end(), compressed.data(),
+                               compressed.data() + compressed.size());
+    } else {
+      jxl::AppendBoxHeader(box->type, box->contents.size(), false,
+                           &output_byte_queue);
+      output_byte_queue.insert(output_byte_queue.end(), box->contents.data(),
+                               box->contents.data() + box->contents.size());
+    }
+  }
+
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc,
+                                            const JxlColorEncoding* color) {
+  // The channel count checked below comes from the basic info.
+  if (!enc->basic_info_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Basic info not yet set");
+  }
+  if (enc->color_encoding_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Color encoding is already set");
+  }
+  auto& internal = enc->metadata.m.color_encoding;
+  if (!jxl::ConvertExternalToInternalColorEncoding(*color, &internal)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC, "Error in color conversion");
+  }
+  const bool is_gray = internal.GetColorSpace() == jxl::ColorSpace::kGray;
+  const auto num_channels = enc->basic_info.num_color_channels;
+  if (is_gray && num_channels != 1) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "Cannot use grayscale color encoding with num_color_channels != 1");
+  }
+  if (!is_gray && num_channels != 3) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "Cannot use RGB color encoding with num_color_channels != 3");
+  }
+  enc->color_encoding_set = true;
+  // Leave an intensity target that has already been set untouched.
+  if (!enc->intensity_target_set) jxl::SetIntensityTarget(&enc->metadata.m);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc,
+                                         const uint8_t* icc_profile,
+                                         size_t size) {
+  if (!enc->basic_info_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Basic info not yet set");
+  }
+  if (enc->color_encoding_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "ICC profile is already set");
+  }
+  auto& color_encoding = enc->metadata.m.color_encoding;
+  // Copy the caller's buffer; the copy is moved into the color encoding.
+  jxl::PaddedBytes profile_copy;
+  profile_copy.assign(icc_profile, icc_profile + size);
+  if (!color_encoding.SetICC(std::move(profile_copy))) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_BAD_INPUT,
+                         "ICC profile could not be set");
+  }
+  const bool is_gray = color_encoding.GetColorSpace() == jxl::ColorSpace::kGray;
+  const auto num_channels = enc->basic_info.num_color_channels;
+  if (is_gray && num_channels != 1) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_BAD_INPUT,
+        "Cannot use grayscale ICC profile with num_color_channels != 1");
+  }
+  // TODO(jon): also check that a kBlack extra channel is provided in the CMYK
+  // case
+  if (!is_gray && num_channels != 3) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_BAD_INPUT,
+        "Cannot use RGB ICC profile with num_color_channels != 3");
+  }
+  enc->color_encoding_set = true;
+  if (!enc->intensity_target_set) jxl::SetIntensityTarget(&enc->metadata.m);
+
+  if (!enc->basic_info.uses_original_profile) {
+    color_encoding.DecideIfWantICC();
+  }
+
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlEncoderInitBasicInfo(JxlBasicInfo* info) {
+  // Container, preview and animation are all disabled by default.
+  info->have_container = JXL_FALSE;
+  info->have_preview = JXL_FALSE;
+  info->have_animation = JXL_FALSE;
+  // Image, preview and intrinsic dimensions start out as zero.
+  info->xsize = info->ysize = 0;
+  info->preview.xsize = info->preview.ysize = 0;
+  info->intrinsic_xsize = info->intrinsic_ysize = 0;
+  info->orientation = JXL_ORIENT_IDENTITY;
+  // Sample format: 8-bit integer samples, three color channels, no extra
+  // channels and no alpha.
+  info->bits_per_sample = 8;
+  info->exponent_bits_per_sample = 0;
+  info->num_color_channels = 3;
+  info->num_extra_channels = 0;
+  info->alpha_bits = info->alpha_exponent_bits = 0;
+  info->alpha_premultiplied = JXL_FALSE;
+  info->uses_original_profile = JXL_FALSE;
+  // Intensity target and tone mapping fields are all zero / false.
+  info->intensity_target = info->min_nits = info->linear_below = 0.f;
+  info->relative_to_max_display = JXL_FALSE;
+  // Animation timing values; have_animation above stays false.
+  info->animation.tps_numerator = 10;
+  info->animation.tps_denominator = 1;
+  info->animation.num_loops = 0;
+  info->animation.have_timecodes = JXL_FALSE;
+}
+
+void JxlEncoderInitFrameHeader(JxlFrameHeader* frame_header) {
+  // Every field is set to the default value from the specification. Depending
+  // on whether the frame is an animation frame or a layer of a composite
+  // still, the user has to set up different fields after initializing the
+  // frame header.
+  frame_header->duration = 0;
+  frame_header->timecode = 0;
+  frame_header->name_length = 0;
+  // In the specification, the default value of is_last is !frame_type, and the
+  // default frame_type is kRegularFrame which has value 0, so is_last is true
+  // by default. However, the encoder does not use this value (the field exists
+  // for the decoder to set), since the last frame is determined by the call to
+  // JxlEncoderCloseFrames instead.
+  frame_header->is_last = JXL_TRUE;
+  frame_header->layer_info.have_crop = JXL_FALSE;
+  frame_header->layer_info.crop_x0 = 0;
+  frame_header->layer_info.crop_y0 = 0;
+  // These must be set if have_crop is enabled, but the default value has
+  // have_crop false, and these dimensions 0. The user must set these to the
+  // desired size after enabling have_crop (which is not yet implemented).
+  frame_header->layer_info.xsize = 0;
+  frame_header->layer_info.ysize = 0;
+  JxlEncoderInitBlendInfo(&frame_header->layer_info.blend_info);
+  frame_header->layer_info.save_as_reference = 0;
+}
+
+void JxlEncoderInitBlendInfo(JxlBlendInfo* blend_info) {
+  // Default blend mode in the specification is 0. Note that combining
+  // blend mode of replace with a duration is not useful, but the user has to
+  // manually set duration in case of animation, or manually change the blend
+  // mode in case of composite stills, so initializing to a combination that is
+  // not useful on its own is not an issue.
+  blend_info->blendmode = JXL_BLEND_REPLACE;
+  blend_info->source = 0;
+  blend_info->alpha = 0;
+  blend_info->clamp = 0;
+}
+
+JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
+                                        const JxlBasicInfo* info) {
+  if (!enc->metadata.size.Set(info->xsize, info->ysize)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid dimensions");
+  }
+  if (JXL_ENC_SUCCESS != CheckValidBitdepth(info->bits_per_sample,
+                                            info->exponent_bits_per_sample)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth");
+  }
+  enc->metadata.m.bit_depth.bits_per_sample = info->bits_per_sample;
+  enc->metadata.m.bit_depth.exponent_bits_per_sample =
+      info->exponent_bits_per_sample;
+  enc->metadata.m.bit_depth.floating_point_sample =
+      (info->exponent_bits_per_sample != 0u);
+  enc->metadata.m.modular_16_bit_buffer_sufficient =
+      (!info->uses_original_profile || info->bits_per_sample <= 12) &&
+      info->alpha_bits <= 12;
+  if ((info->intrinsic_xsize > 0 || info->intrinsic_ysize > 0) &&
+      (info->intrinsic_xsize != info->xsize ||
+       info->intrinsic_ysize != info->ysize)) {
+    if (info->intrinsic_xsize > (1ull << 30ull) ||
+        info->intrinsic_ysize > (1ull << 30ull) ||
+        !enc->metadata.m.intrinsic_size.Set(info->intrinsic_xsize,
+                                            info->intrinsic_ysize)) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "Invalid intrinsic dimensions");
+    }
+    enc->metadata.m.have_intrinsic_size = true;
+  }
+
+  // The number of extra channels includes the alpha channel, so for example an
+  // RGBA with no other extra channels has exactly num_extra_channels == 1.
+  enc->metadata.m.num_extra_channels = info->num_extra_channels;
+  enc->metadata.m.extra_channel_info.resize(enc->metadata.m.num_extra_channels);
+  if (info->num_extra_channels == 0 && info->alpha_bits) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "when alpha_bits is non-zero, the number of channels must be at least "
+        "1");
+  }
+  // If the user provides non-zero alpha_bits, we make the channel info at index
+  // zero the appropriate alpha channel.
+  if (info->alpha_bits) {
+    JxlExtraChannelInfo channel_info;
+    JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &channel_info);
+    channel_info.bits_per_sample = info->alpha_bits;
+    channel_info.exponent_bits_per_sample = info->alpha_exponent_bits;
+    if (JxlEncoderSetExtraChannelInfo(enc, 0, &channel_info)) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "Problem setting extra channel info for alpha");
+    }
+  }
+
+  enc->metadata.m.xyb_encoded = !info->uses_original_profile;
+  if (info->orientation > 0 && info->orientation <= 8) {
+    enc->metadata.m.orientation = info->orientation;
+  } else {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for orientation field");
+  }
+  if (info->num_color_channels != 1 && info->num_color_channels != 3) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid number of color channels");
+  }
+  if (info->intensity_target != 0) {
+    enc->metadata.m.SetIntensityTarget(info->intensity_target);
+    enc->intensity_target_set = true;
+  } else if (enc->color_encoding_set) {
+    // If the color encoding is not set yet, JxlEncoderSetColorEncoding will be
+    // called later and we will get one more chance to call
+    // jxl::SetIntensityTarget, after the color encoding is indeed set.
+    jxl::SetIntensityTarget(&enc->metadata.m);
+    enc->intensity_target_set = true;
+  }
+  enc->metadata.m.tone_mapping.min_nits = info->min_nits;
+  enc->metadata.m.tone_mapping.relative_to_max_display =
+      info->relative_to_max_display;
+  enc->metadata.m.tone_mapping.linear_below = info->linear_below;
+  enc->basic_info = *info;
+  enc->basic_info_set = true;
+
+  enc->metadata.m.have_animation = info->have_animation;
+  if (info->have_animation) {
+    if (info->animation.tps_denominator < 1) {
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_API_USAGE,
+          "If animation is used, tps_denominator must be >= 1");
+    }
+    if (info->animation.tps_numerator < 1) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "If animation is used, tps_numerator must be >= 1");
+    }
+    enc->metadata.m.animation.tps_numerator = info->animation.tps_numerator;
+    enc->metadata.m.animation.tps_denominator = info->animation.tps_denominator;
+    enc->metadata.m.animation.num_loops = info->animation.num_loops;
+    enc->metadata.m.animation.have_timecodes = info->animation.have_timecodes;
+  }
+  std::string level_message;
+  int required_level = VerifyLevelSettings(enc, &level_message);
+  if (required_level == -1 ||
+      (static_cast<int>(enc->codestream_level) < required_level &&
+       enc->codestream_level != -1)) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE, "%s",
+        ("Codestream level verification for level " +
+         std::to_string(enc->codestream_level) + " failed: " + level_message)
+            .c_str());
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlEncoderInitExtraChannelInfo(JxlExtraChannelType type,
+                                    JxlExtraChannelInfo* info) {
+  // Only the channel type comes from the caller; the rest are fixed defaults.
+  info->type = type;
+  // Samples default to 8-bit integers (no exponent bits).
+  info->bits_per_sample = 8;
+  info->exponent_bits_per_sample = 0;
+  info->dim_shift = 0;
+  info->name_length = 0;
+  info->alpha_premultiplied = JXL_FALSE;
+  info->cfa_channel = 0;
+  // Zero the four spot color components.
+  for (size_t c = 0; c < 4; ++c) info->spot_color[c] = 0;
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
+    JxlEncoder* enc, size_t index, const JxlExtraChannelInfo* info) {
+  if (index >= enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  if (JXL_ENC_SUCCESS != CheckValidBitdepth(info->bits_per_sample,
+                                            info->exponent_bits_per_sample)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth");
+  }
+
+  // Mirror the public struct into the internal metadata of channel `index`.
+  jxl::ExtraChannelInfo& channel = enc->metadata.m.extra_channel_info[index];
+  channel.type = static_cast<jxl::ExtraChannel>(info->type);
+  channel.bit_depth.bits_per_sample = info->bits_per_sample;
+  channel.bit_depth.exponent_bits_per_sample = info->exponent_bits_per_sample;
+  channel.bit_depth.floating_point_sample = info->exponent_bits_per_sample != 0;
+  channel.dim_shift = info->dim_shift;
+  // The name is reset here; JxlEncoderSetExtraChannelName assigns it.
+  channel.name = "";
+  channel.alpha_associated = (info->alpha_premultiplied != 0);
+  channel.cfa_channel = info->cfa_channel;
+  for (size_t c = 0; c < 4; ++c) channel.spot_color[c] = info->spot_color[c];
+  // More than 12 bits per sample clears modular_16_bit_buffer_sufficient.
+  enc->metadata.m.modular_16_bit_buffer_sufficient &=
+      info->bits_per_sample <= 12;
+  std::string level_message;
+  const int required_level = VerifyLevelSettings(enc, &level_message);
+  if (required_level == -1 ||
+      (static_cast<int>(enc->codestream_level) < required_level &&
+       enc->codestream_level != -1)) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE, "%s",
+        ("Codestream level verification for level " +
+         std::to_string(enc->codestream_level) + " failed: " + level_message)
+            .c_str());
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
+                                                          size_t index,
+                                                          const char* name,
+                                                          size_t size) {
+  if (index >= enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  // Exactly `size` bytes starting at `name` are stored as the channel name.
+  enc->metadata.m.extra_channel_info[index].name.assign(name, name + size);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate(
+    JxlEncoder* enc, const JxlEncoderFrameSettings* source) {
+  auto settings = jxl::MemoryManagerMakeUnique<JxlEncoderFrameSettings>(
+      &enc->memory_manager);
+  if (!settings) return nullptr;
+  settings->enc = enc;
+  if (source == nullptr) {
+    settings->values.lossless = false;
+  } else {
+    settings->values = source->values;
+  }
+  auto& cparams = settings->values.cparams;
+  cparams.level = enc->codestream_level;
+  cparams.ec_distance.resize(enc->metadata.m.num_extra_channels, -1);
+  // The encoder takes ownership; the caller only receives a raw pointer.
+  JxlEncoderFrameSettings* const raw = settings.get();
+  enc->encoder_options.emplace_back(std::move(settings));
+  return raw;
+}
+
+JxlEncoderFrameSettings* JxlEncoderOptionsCreate(
+    JxlEncoder* enc, const JxlEncoderFrameSettings* source) {
+  // Deprecated function name; forwards to JxlEncoderFrameSettingsCreate.
+  return JxlEncoderFrameSettingsCreate(enc, source);
+}
+
+JxlEncoderStatus JxlEncoderSetFrameLossless(
+    JxlEncoderFrameSettings* frame_settings, const JXL_BOOL lossless) {
+  // Lossless is refused if the basic info already selected XYB encoding.
+  JxlEncoder* const enc = frame_settings->enc;
+  if (!lossless || !enc->basic_info_set || !enc->metadata.m.xyb_encoded) {
+    frame_settings->values.lossless = lossless;
+    return JXL_ENC_SUCCESS;
+  }
+  return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                       "Set uses_original_profile=true for lossless encoding");
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetLossless(
+    JxlEncoderFrameSettings* frame_settings, JXL_BOOL lossless) {
+  // Deprecated function name; forwards to JxlEncoderSetFrameLossless.
+  return JxlEncoderSetFrameLossless(frame_settings, lossless);
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetEffort(
+    JxlEncoderFrameSettings* frame_settings, const int effort) {
+  const JxlEncoderFrameSettingId id = JXL_ENC_FRAME_SETTING_EFFORT;
+  return JxlEncoderFrameSettingsSetOption(frame_settings, id, effort);
+}
+
+JxlEncoderStatus JxlEncoderSetFrameDistance(
+    JxlEncoderFrameSettings* frame_settings, float distance) {
+  // The negated form also rejects NaN, for which every comparison is false.
+  if (!(distance >= 0.f && distance <= 25.f)) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Distance has to be in [0.0..25.0] (corresponding to "
+                         "quality in [0.0..100.0])");
+  }
+  // Positive distances below 0.01 are raised to 0.01; exactly 0 is kept.
+  if (distance > 0.f && distance < 0.01f) distance = 0.01f;
+  frame_settings->values.cparams.butteraugli_distance = distance;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetExtraChannelDistance(
+    JxlEncoderFrameSettings* frame_settings, size_t index, float distance) {
+  if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  // -1 is accepted as a special value next to the regular range. The negated
+  // range test also rejects NaN, for which every comparison is false.
+  if (distance != -1.f && !(distance >= 0.f && distance <= 25.f)) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Distance has to be -1 or in [0.0..25.0] (corresponding to "
+        "quality in [0.0..100.0])");
+  }
+  if (distance > 0.f && distance < 0.01f) distance = 0.01f;
+
+  if (index >= frame_settings->values.cparams.ec_distance.size()) {
+    // This can only happen if JxlEncoderFrameSettingsCreate() was called before
+    // JxlEncoderSetBasicInfo().
+    frame_settings->values.cparams.ec_distance.resize(
+        frame_settings->enc->metadata.m.num_extra_channels, -1);
+  }
+
+  frame_settings->values.cparams.ec_distance[index] = distance;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetDistance(
+    JxlEncoderFrameSettings* frame_settings, float distance) {
+  // Deprecated function name; forwards to JxlEncoderSetFrameDistance.
+  return JxlEncoderSetFrameDistance(frame_settings, distance);
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(
+    JxlEncoderFrameSettings* frame_settings, int tier) {
+  const JxlEncoderFrameSettingId id = JXL_ENC_FRAME_SETTING_DECODING_SPEED;
+  return JxlEncoderFrameSettingsSetOption(frame_settings, id, tier);
+}
+
+JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
+    JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option,
+    int64_t value) {
+  // Check that Override-type options are -1 (default), 0 (off) or 1 (on).
+  switch (option) {
+    case JXL_ENC_FRAME_SETTING_NOISE:
+    case JXL_ENC_FRAME_SETTING_DOTS:
+    case JXL_ENC_FRAME_SETTING_PATCHES:
+    case JXL_ENC_FRAME_SETTING_GABORISH:
+    case JXL_ENC_FRAME_SETTING_MODULAR:
+    case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
+    case JXL_ENC_FRAME_SETTING_RESPONSIVE:
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
+    case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      if (value < -1 || value > 1) {
+        return JXL_API_ERROR(
+            frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+            "Option value has to be -1 (default), 0 (off) or 1 (on)");
+      }
+      break;
+    default:
+      break;
+  }
+
+  switch (option) {
+    case JXL_ENC_FRAME_SETTING_EFFORT:
+      if (frame_settings->enc->allow_expert_options) {
+        if (value < 1 || value > 10) {
+          return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                               "Encode effort has to be in [1..10]");
+        }
+      } else {
+        if (value < 1 || value > 9) {
+          return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                               "Encode effort has to be in [1..9]");
+        }
+      }
+      frame_settings->values.cparams.speed_tier =
+          static_cast<jxl::SpeedTier>(10 - value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT:
+      if (value < -1 || value > 11) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Brotli effort has to be in [-1..11]");
+      }
+      // set cparams for brotli use in JPEG frames
+      frame_settings->values.cparams.brotli_effort = value;
+      // set enc option for brotli use in brob boxes
+      frame_settings->enc->brotli_effort = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_DECODING_SPEED:
+      if (value < 0 || value > 4) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                             "Decoding speed has to be in [0..4]");
+      }
+      frame_settings->values.cparams.decoding_speed_tier = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_RESAMPLING:
+      if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Resampling factor has to be 1, 2, 4 or 8");
+      }
+      frame_settings->values.cparams.resampling = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING:
+      // TODO(lode): the jxl codestream allows choosing a different resampling
+      // factor for each extra channel, independently per frame. Move this
+      // option to a JxlEncoderFrameSettings-option that can be set per extra
+      // channel, so needs its own function rather than
+      // JxlEncoderFrameSettingsSetOption due to the extra channel index
+      // argument required.
+      if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Resampling factor has to be 1, 2, 4 or 8");
+      }
+      frame_settings->values.cparams.ec_resampling = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED:
+      if (value < 0 || value > 1)
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be 0 or 1");
+      frame_settings->values.cparams.already_downsampled = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_NOISE:
+      frame_settings->values.cparams.noise = static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_DOTS:
+      frame_settings->values.cparams.dots = static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PATCHES:
+      frame_settings->values.cparams.patches =
+          static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_EPF:
+      if (value < -1 || value > 3) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "EPF value has to be in [-1..3]");
+      }
+      frame_settings->values.cparams.epf = static_cast<int>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GABORISH:
+      frame_settings->values.cparams.gaborish =
+          static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR:
+      frame_settings->values.cparams.modular_mode = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
+      frame_settings->values.cparams.keep_invisible =
+          static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
+      frame_settings->values.cparams.centerfirst = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X:
+      if (value < -1) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Center x coordinate has to be -1 or positive");
+      }
+      frame_settings->values.cparams.center_x = static_cast<size_t>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y:
+      if (value < -1) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Center y coordinate has to be -1 or positive");
+      }
+      frame_settings->values.cparams.center_y = static_cast<size_t>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_RESPONSIVE:
+      frame_settings->values.cparams.responsive = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
+      frame_settings->values.cparams.progressive_mode = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
+      frame_settings->values.cparams.qprogressive_mode = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC:
+      if (value < -1 || value > 2) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Progressive DC has to be in [-1..2]");
+      }
+      frame_settings->values.cparams.progressive_dc = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PALETTE_COLORS:
+      if (value < -1 || value > 70913) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..70913]");
+      }
+      if (value == -1) {
+        frame_settings->values.cparams.palette_colors = 1 << 10;
+      } else {
+        frame_settings->values.cparams.palette_colors = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
+      // TODO(lode): the defaults of some palette settings depend on others.
+      // See the logic in cjxl. Similar for other settings. This should be
+      // handled in the encoder during JxlEncoderProcessOutput (or,
+      // alternatively, in the cjxl binary like now)
+      frame_settings->values.cparams.lossy_palette = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM:
+      if (value < -1 || value > 2) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..2]");
+      }
+      if (value == -1) {
+        frame_settings->values.cparams.color_transform =
+            jxl::ColorTransform::kXYB;
+      } else {
+        frame_settings->values.cparams.color_transform =
+            static_cast<jxl::ColorTransform>(value);
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE:
+      if (value < -1 || value > 41) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..41]");
+      }
+      frame_settings->values.cparams.colorspace = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE:
+      if (value < -1 || value > 3) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..3]");
+      }
+      // TODO(lode): the default behavior of this parameter for cjxl is
+      // to choose 1 or 2 depending on the situation. This behavior needs to be
+      // implemented either in the C++ library by allowing to set this to -1, or
+      // kept in cjxl and set it to 1 or 2 using this API.
+      if (value == -1) {
+        frame_settings->values.cparams.modular_group_size_shift = 1;
+      } else {
+        frame_settings->values.cparams.modular_group_size_shift = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR:
+      if (value < -1 || value > 15) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..15]");
+      }
+      frame_settings->values.cparams.options.predictor =
+          static_cast<jxl::Predictor>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS:
+      // The max allowed value can in theory be higher. However, it depends on
+      // the effort setting. 11 is the highest safe value that doesn't cause
+      // tree_samples to be >= 64 in the encoder. The specification may allow
+      // more than this. With more fine tuning higher values could be allowed.
+      // For N-channel images, the largest useful value is N-1.
+      if (value < -1 || value > 11) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..11]");
+      }
+      if (value == -1) {
+        frame_settings->values.cparams.options.max_properties = 0;
+      } else {
+        frame_settings->values.cparams.options.max_properties = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+      if (value == -1) {
+        frame_settings->values.cparams.force_cfl_jpeg_recompression = true;
+      } else {
+        frame_settings->values.cparams.force_cfl_jpeg_recompression = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_INDEX_BOX:
+      frame_settings->values.frame_index_box = true;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PHOTON_NOISE:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Float option, try setting it with "
+                           "JxlEncoderFrameSettingsSetFloatOption");
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      frame_settings->values.cparams.jpeg_compress_boxes = value;
+      return JXL_ENC_SUCCESS;
+    default:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Unknown option");
+  }
+}
+
+JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
+    JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option,
+    float value) {  // stores one float-valued frame option after range checks
+  switch (option) {
+    case JXL_ENC_FRAME_SETTING_PHOTON_NOISE:
+      if (!(value >= 0)) return JXL_ENC_ERROR;  // negated form rejects NaN too
+      // TODO(lode): add encoder setting to set the 8 floating point values of
+      // the noise synthesis parameters per frame for more fine grained control.
+      frame_settings->values.cparams.photon_noise_iso = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT:
+      if (!(value >= -1.f && value <= 100.f)) {  // NaN fails this test as well
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..100]");
+      }
+      // This value is called "iterations" or "nb_repeats" in cjxl, but is in
+      // fact a fraction in range 0.0-1.0, with the default value 0.5.
+      // Convert from floating point percentage to floating point fraction here.
+      if (value < -.5f) {  // i.e. -1 was requested: use the default
+        // TODO(lode): for this and many other settings (also in
+        // JxlEncoderFrameSettingsSetOption), avoid duplicating the default
+        // values here and in enc_params.h and options.h, have one location
+        // where the defaults are specified.
+        frame_settings->values.cparams.options.nb_repeats = 0.5f;
+      } else {
+        frame_settings->values.cparams.options.nb_repeats = value * 0.01f;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT:
+      if (!(value >= -1.f && value <= 100.f)) {  // NaN fails this test as well
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..100]");
+      }
+      if (value < -.5f) {  // i.e. -1 was requested: use the default
+        frame_settings->values.cparams.channel_colors_pre_transform_percent =
+            95.0f;
+      } else {
+        frame_settings->values.cparams.channel_colors_pre_transform_percent =
+            value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT:
+      if (!(value >= -1.f && value <= 100.f)) {  // NaN fails this test as well
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..100]");
+      }
+      if (value < -.5f) {  // i.e. -1 was requested: use the default
+        frame_settings->values.cparams.channel_colors_percent = 80.0f;
+      } else {
+        frame_settings->values.cparams.channel_colors_percent = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_EFFORT:  // from here on: integer-only options
+    case JXL_ENC_FRAME_SETTING_DECODING_SPEED:
+    case JXL_ENC_FRAME_SETTING_RESAMPLING:
+    case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING:
+    case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED:
+    case JXL_ENC_FRAME_SETTING_NOISE:
+    case JXL_ENC_FRAME_SETTING_DOTS:
+    case JXL_ENC_FRAME_SETTING_PATCHES:
+    case JXL_ENC_FRAME_SETTING_EPF:
+    case JXL_ENC_FRAME_SETTING_GABORISH:
+    case JXL_ENC_FRAME_SETTING_MODULAR:
+    case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y:
+    case JXL_ENC_FRAME_SETTING_RESPONSIVE:
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC:
+    case JXL_ENC_FRAME_SETTING_PALETTE_COLORS:
+    case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
+    case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM:
+    case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE:
+    case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE:
+    case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR:
+    case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS:
+    case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+    case JXL_ENC_FRAME_INDEX_BOX:
+    case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT:
+    case JXL_ENC_FRAME_SETTING_FILL_ENUM:
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Int option, try setting it with "
+                           "JxlEncoderFrameSettingsSetOption");
+    default:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Unknown option");
+  }
+}
+JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager) {
+  // Build the manager the encoder will keep; give up if it cannot be set up.
+  JxlMemoryManager manager;
+  if (!jxl::MemoryManagerInit(&manager, memory_manager)) return nullptr;
+
+  // Construct the encoder in place inside storage obtained from `manager`.
+  void* storage = jxl::MemoryManagerAlloc(&manager, sizeof(JxlEncoder));
+  if (storage == nullptr) return nullptr;
+  JxlEncoder* encoder = new (storage) JxlEncoder();
+  encoder->memory_manager = manager;
+
+  // TODO(sboukortt): add an API function to set this.
+  encoder->cms = jxl::GetJxlCms();
+
+  // JxlEncoderReset assigns the remaining fields their starting values.
+  JxlEncoderReset(encoder);
+
+  return encoder;
+}
+
+void JxlEncoderReset(JxlEncoder* enc) {  // memory_manager and cms are kept
+  enc->thread_pool.reset();
+  enc->input_queue.clear();
+  enc->num_queued_frames = 0;
+  enc->num_queued_boxes = 0;
+  enc->encoder_options.clear();
+  enc->output_byte_queue.clear();
+  enc->output_fast_frame_queue.clear();
+  enc->codestream_bytes_written_beginning_of_frame = 0;
+  enc->codestream_bytes_written_end_of_frame = 0;
+  enc->wrote_bytes = false;
+  enc->jxlp_counter = 0;
+  enc->metadata = jxl::CodecMetadata();
+  enc->last_used_cparams = jxl::CompressParams();
+  enc->frames_closed = enc->boxes_closed = false;  // both inputs reopen
+  enc->basic_info_set = false;
+  enc->color_encoding_set = false;
+  enc->intensity_target_set = false;
+  enc->use_container = false;
+  enc->use_boxes = false;
+  enc->store_jpeg_metadata = false;  // used to leak into the next image
+  enc->codestream_level = -1;
+  JxlEncoderInitBasicInfo(&enc->basic_info);
+}
+
+void JxlEncoderDestroy(JxlEncoder* enc) {
+  if (enc == nullptr) return;
+  // Copy the manager out first: it is a member of the object being destroyed.
+  JxlMemoryManager manager_copy = enc->memory_manager;
+  // Explicit destructor call; the storage itself is released via the manager.
+  enc->~JxlEncoder();
+  jxl::MemoryManagerFree(&manager_copy, enc);
+}
+
+JxlEncoderError JxlEncoderGetError(JxlEncoder* enc) { return enc->error; }  // returns the stored code; enc is not null-checked
+
+JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc,
+                                        JXL_BOOL use_container) {
+  if (!enc->wrote_bytes) {  // accepted only while wrote_bytes is still clear
+    enc->use_container = static_cast<bool>(use_container);
+    return JXL_ENC_SUCCESS;
+  }
+  return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                       "this setting can only be set at the beginning");
+}
+
+JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc,
+                                             JXL_BOOL store_jpeg_metadata) {
+  if (!enc->wrote_bytes) {  // accepted only while wrote_bytes is still clear
+    enc->store_jpeg_metadata = static_cast<bool>(store_jpeg_metadata);
+    return JXL_ENC_SUCCESS;
+  }
+  return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                       "this setting can only be set at the beginning");
+}
+
+JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc, int level) {
+  if (!(level == -1 || level == 5 || level == 10)) {  // the only valid values
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_NOT_SUPPORTED, "invalid level");
+  }
+  if (!enc->wrote_bytes) {  // accepted only while wrote_bytes is still clear
+    enc->codestream_level = level;
+    return JXL_ENC_SUCCESS;
+  }
+  return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                       "this setting can only be set at the beginning");
+}
+
+int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc) {
+  return VerifyLevelSettings(enc, nullptr);  // forwards with a null 2nd argument
+}
+
+void JxlEncoderSetCms(JxlEncoder* enc, JxlCmsInterface cms) {
+  jxl::msan::MemoryIsInitialized(&cms, sizeof(cms));  // covers the by-value copy
+  enc->cms = cms;  // replaces the interface stored by JxlEncoderCreate
+}
+
+JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc,
+                                             JxlParallelRunner parallel_runner,
+                                             void* parallel_runner_opaque) {
+  if (enc->thread_pool) {  // a pool already exists: refuse to replace it
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "parallel runner already set");
+  }
+  enc->thread_pool = jxl::MemoryManagerMakeUnique<jxl::ThreadPool>(
+      &enc->memory_manager, parallel_runner, parallel_runner_opaque);
+  if (!enc->thread_pool) {  // creation through the memory manager failed
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC,
+                         "error setting parallel runner");
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+namespace {
+// Outputs the expected input frame size: the crop size if one is set, else the
+// image size, ceil-divided by the resampling factor when already downsampled.
+// A zero width or height is reported as an API usage error.
+JxlEncoderStatus GetCurrentDimensions(
+    const JxlEncoderFrameSettings* frame_settings, size_t& xsize,
+    size_t& ysize) {
+  const auto& layer = frame_settings->values.header.layer_info;
+  xsize = layer.have_crop ? layer.xsize : frame_settings->enc->metadata.xsize();
+  ysize = layer.have_crop ? layer.ysize : frame_settings->enc->metadata.ysize();
+  if (frame_settings->values.cparams.already_downsampled) {
+    const size_t factor = frame_settings->values.cparams.resampling;
+    xsize = jxl::DivCeil(xsize, factor);
+    ysize = jxl::DivCeil(ysize, factor);
+  }
+  if (xsize == 0 || ysize == 0) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "zero-sized frame is not allowed");
+  }
+  return JXL_ENC_SUCCESS;
+}
+}  // namespace
+
+JxlEncoderStatus JxlEncoderAddJPEGFrame(  // queue a JPEG file as one frame
+    const JxlEncoderFrameSettings* frame_settings, const uint8_t* buffer,
+    size_t size) {
+  if (frame_settings->enc->frames_closed) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Frame input is already closed");
+  }
+
+  jxl::CodecInOut io;  // receives the decoded JPEG and its metadata blobs
+  if (!jxl::jpeg::DecodeImageJPG(jxl::Span<const uint8_t>(buffer, size), &io)) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT,
+                         "Error during decode of input JPEG");
+  }
+
+  if (!frame_settings->enc->color_encoding_set) {  // derive it from the JPEG
+    if (!SetColorEncodingFromJpegData(
+            *io.Main().jpeg_data,
+            &frame_settings->enc->metadata.m.color_encoding)) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT,
+                           "Error in input JPEG color space");
+    }
+  }
+
+  if (!frame_settings->enc->basic_info_set) {  // default info, JPEG-sized
+    JxlBasicInfo basic_info;
+    JxlEncoderInitBasicInfo(&basic_info);
+    basic_info.xsize = io.Main().jpeg_data->width;
+    basic_info.ysize = io.Main().jpeg_data->height;
+    basic_info.uses_original_profile = true;
+    if (JxlEncoderSetBasicInfo(frame_settings->enc, &basic_info) !=
+        JXL_ENC_SUCCESS) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                           "Error setting basic info");
+    }
+  }
+
+  if (frame_settings->enc->metadata.m.xyb_encoded) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Can't XYB encode a lossless JPEG");
+  }
+  if (!io.blobs.exif.empty()) {
+    JxlOrientation orientation = static_cast<JxlOrientation>(
+        frame_settings->enc->metadata.m.orientation);
+    jxl::InterpretExif(io.blobs.exif, &orientation);
+    frame_settings->enc->metadata.m.orientation = orientation;
+
+    size_t exif_size = io.blobs.exif.size();
+    // Exif data in JPEG is limited to 64k
+    if (exif_size > 0xFFFF) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                           "Exif larger than possible in JPEG?");
+    }
+    exif_size += 4;  // prefix 4 zero bytes for tiff offset
+    std::vector<uint8_t> exif(exif_size);  // value-initialized, so all zero
+    memcpy(exif.data() + 4, io.blobs.exif.data(), io.blobs.exif.size());
+    JxlEncoderUseBoxes(frame_settings->enc);  // NOTE(review): status ignored
+    JxlEncoderAddBox(frame_settings->enc, "Exif", exif.data(), exif_size,
+                     frame_settings->values.cparams.jpeg_compress_boxes);
+  }
+  if (!io.blobs.xmp.empty()) {
+    JxlEncoderUseBoxes(frame_settings->enc);  // NOTE(review): status ignored
+    JxlEncoderAddBox(frame_settings->enc, "xml ", io.blobs.xmp.data(),
+                     io.blobs.xmp.size(),
+                     frame_settings->values.cparams.jpeg_compress_boxes);
+  }
+  if (!io.blobs.jumbf.empty()) {
+    JxlEncoderUseBoxes(frame_settings->enc);  // NOTE(review): status ignored
+    JxlEncoderAddBox(frame_settings->enc, "jumb", io.blobs.jumbf.data(),
+                     io.blobs.jumbf.size(),
+                     frame_settings->values.cparams.jpeg_compress_boxes);
+  }
+  if (frame_settings->enc->store_jpeg_metadata) {
+    jxl::jpeg::JPEGData data_in = *io.Main().jpeg_data;  // encode from a copy
+    jxl::PaddedBytes jpeg_data;
+    if (!jxl::jpeg::EncodeJPEGData(data_in, &jpeg_data,
+                                   frame_settings->values.cparams)) {
+      return JXL_API_ERROR(
+          frame_settings->enc, JXL_ENC_ERR_JBRD,
+          "JPEG bitstream reconstruction data cannot be encoded");
+    }
+    frame_settings->enc->jpeg_metadata = std::vector<uint8_t>(
+        jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+  }
+
+  auto queued_frame = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedFrame>(
+      &frame_settings->enc->memory_manager,
+      // JxlEncoderQueuedFrame is a struct with no constructors, so we use the
+      // default move constructor there.
+      jxl::JxlEncoderQueuedFrame{
+          frame_settings->values,
+          jxl::ImageBundle(&frame_settings->enc->metadata.m),
+          {}});
+  if (!queued_frame) {
+    // TODO(jon): when can this happen? is this an API usage error?
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "No frame queued?");
+  }
+  queued_frame->frame.SetFromImage(std::move(*io.Main().color()),
+                                   io.Main().c_current());
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+  if (xsize != static_cast<size_t>(io.Main().jpeg_data->width) ||
+      ysize != static_cast<size_t>(io.Main().jpeg_data->height)) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "JPEG dimensions don't match frame dimensions");
+  }
+  std::vector<jxl::ImageF> extra_channels(
+      frame_settings->enc->metadata.m.num_extra_channels);
+  for (auto& extra_channel : extra_channels) {
+    extra_channel = jxl::ImageF(xsize, ysize);
+    queued_frame->ec_initialized.push_back(0);  // no channel has data yet
+  }
+  queued_frame->frame.SetExtraChannels(std::move(extra_channels));
+  queued_frame->frame.jpeg_data = std::move(io.Main().jpeg_data);
+  queued_frame->frame.color_transform = io.Main().color_transform;
+  queued_frame->frame.chroma_subsampling = io.Main().chroma_subsampling;
+
+  QueueFrame(frame_settings, queued_frame);
+  return JXL_ENC_SUCCESS;
+}
+
+static bool CanDoFastLossless(const JxlEncoderFrameSettings* frame_settings,
+                              const JxlPixelFormat* pixel_format,
+                              bool has_alpha) {  // true iff every check passes
+  if (!frame_settings->values.lossless) {
+    return false;
+  }
+  // TODO(veluca): many of the following options could be made to work, but are
+  // just not implemented in FJXL's frame header handling yet.
+  if (frame_settings->values.frame_index_box) {
+    return false;
+  }
+  if (frame_settings->values.header.layer_info.have_crop) {
+    return false;
+  }
+  if (frame_settings->enc->metadata.m.have_animation) {
+    return false;
+  }
+  if (frame_settings->values.cparams.speed_tier != jxl::SpeedTier::kLightning) {
+    return false;
+  }
+  if (frame_settings->values.image_bit_depth.type ==
+          JxlBitDepthType::JXL_BIT_DEPTH_CUSTOM &&
+      frame_settings->values.image_bit_depth.bits_per_sample !=
+          frame_settings->enc->metadata.m.bit_depth.bits_per_sample) {
+    return false;  // custom input depth differs from the image's depth
+  }
+  // TODO(veluca): implement support for LSB-padded input in fast_lossless.
+  if (frame_settings->values.image_bit_depth.type ==
+          JxlBitDepthType::JXL_BIT_DEPTH_FROM_PIXEL_FORMAT &&
+      frame_settings->values.image_bit_depth.bits_per_sample % 8 != 0) {
+    return false;
+  }
+  if (!frame_settings->values.frame_name.empty()) {
+    return false;
+  }
+  // No extra channels other than alpha.
+  if (!(has_alpha && frame_settings->enc->metadata.m.num_extra_channels == 1) &&
+      frame_settings->enc->metadata.m.num_extra_channels != 0) {
+    return false;
+  }
+  if (frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 16) {
+    return false;
+  }
+  if (pixel_format->data_type != JxlDataType::JXL_TYPE_FLOAT16 &&
+      pixel_format->data_type != JxlDataType::JXL_TYPE_UINT16 &&
+      pixel_format->data_type != JxlDataType::JXL_TYPE_UINT8) {
+    return false;  // any other sample type is rejected
+  }
+  if ((frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 8) !=
+      (pixel_format->data_type == JxlDataType::JXL_TYPE_UINT16 ||
+       pixel_format->data_type == JxlDataType::JXL_TYPE_FLOAT16)) {
+    return false;  // >8-bit images need a 16-bit sample type, and vice versa
+  }
+  if (!((pixel_format->num_channels == 1 || pixel_format->num_channels == 3) &&
+        !has_alpha) &&
+      !((pixel_format->num_channels == 2 || pixel_format->num_channels == 4) &&
+        has_alpha)) {
+    return false;  // channel count must agree with the presence of alpha
+  }
+
+  return true;
+}
+
+JxlEncoderStatus JxlEncoderAddImageFrame(  // queue one frame of raw pixels
+    const JxlEncoderFrameSettings* frame_settings,
+    const JxlPixelFormat* pixel_format, const void* buffer, size_t size) {
+  if (!frame_settings->enc->basic_info_set ||
+      (!frame_settings->enc->color_encoding_set &&
+       !frame_settings->enc->metadata.m.xyb_encoded)) {
+    // Basic Info must be set, and color encoding must be set directly,
+    // or set to XYB via JxlBasicInfo.uses_original_profile = JXL_FALSE
+    // Otherwise, this is an API misuse.
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Basic info or color encoding not set yet");
+  }
+
+  if (frame_settings->enc->frames_closed) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Frame input already closed");
+  }
+  if (pixel_format->num_channels < 3) {  // 1 or 2 channels: gray (+alpha)
+    if (frame_settings->enc->basic_info.num_color_channels != 1) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "Grayscale pixel format input for an RGB image");
+    }
+  } else {
+    if (frame_settings->enc->basic_info.num_color_channels != 3) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "RGB pixel format input for a grayscale image");
+    }
+  }
+
+  bool has_alpha = frame_settings->enc->metadata.m.HasAlpha();
+
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+
+  // All required conditions to do fast-lossless.
+  if (CanDoFastLossless(frame_settings, pixel_format, has_alpha)) {
+    const size_t bytes_per_pixel =
+        pixel_format->data_type == JxlDataType::JXL_TYPE_UINT8
+            ? pixel_format->num_channels
+            : pixel_format->num_channels * 2;
+    const size_t last_row_size = xsize * bytes_per_pixel;  // no row padding
+    const size_t align = pixel_format->align;
+    const size_t row_size =
+        (align > 1 ? jxl::DivCeil(last_row_size, align) * align
+                   : last_row_size);
+    const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
+    if (bytes_to_read > size) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "provided image buffer too small");
+    }
+    const bool big_endian =
+        pixel_format->endianness == JXL_BIG_ENDIAN ||
+        (pixel_format->endianness == JXL_NATIVE_ENDIAN && !IsLittleEndian());
+
+    auto runner = +[](void* void_pool, void* opaque, void fun(void*, size_t),
+                      size_t count) {
+      auto* pool = reinterpret_cast<jxl::ThreadPool*>(void_pool);
+      JXL_CHECK(jxl::RunOnPool(
+          pool, 0, count, jxl::ThreadPool::NoInit,
+          [&](size_t i, size_t) { fun(opaque, i); }, "Encode fast lossless"));
+    };  // unary + turns the captureless lambda into a function pointer
+    QueueFastLosslessFrame(
+        frame_settings,
+        JxlFastLosslessPrepareFrame(
+            reinterpret_cast<const unsigned char*>(buffer), xsize, row_size,
+            ysize, pixel_format->num_channels,
+            frame_settings->enc->metadata.m.bit_depth.bits_per_sample,
+            big_endian, /*effort=*/2, frame_settings->enc->thread_pool.get(),
+            runner));
+    return JXL_ENC_SUCCESS;  // fast path done; the code below is skipped
+  }
+
+  auto queued_frame = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedFrame>(
+      &frame_settings->enc->memory_manager,
+      // JxlEncoderQueuedFrame is a struct with no constructors, so we use the
+      // default move constructor there.
+      jxl::JxlEncoderQueuedFrame{
+          frame_settings->values,
+          jxl::ImageBundle(&frame_settings->enc->metadata.m),
+          {}});
+
+  if (!queued_frame) {
+    // TODO(jon): when can this happen? is this an API usage error?
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "No frame queued?");
+  }
+
+  jxl::ColorEncoding c_current;  // color encoding assumed for `buffer`
+  if (!frame_settings->enc->color_encoding_set) {
+    if ((pixel_format->data_type == JXL_TYPE_FLOAT) ||
+        (pixel_format->data_type == JXL_TYPE_FLOAT16)) {
+      c_current =
+          jxl::ColorEncoding::LinearSRGB(pixel_format->num_channels < 3);
+    } else {
+      c_current = jxl::ColorEncoding::SRGB(pixel_format->num_channels < 3);
+    }
+  } else {
+    c_current = frame_settings->enc->metadata.m.color_encoding;
+  }
+  uint32_t num_channels = pixel_format->num_channels;
+  size_t has_interleaved_alpha =
+      static_cast<size_t>(num_channels == 2 || num_channels == 4);
+  if (has_interleaved_alpha >  // 2 or 4 channels: alpha is in the buffer
+      frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "number of extra channels mismatch (need 1 extra channel for alpha)");
+  }
+  std::vector<jxl::ImageF> extra_channels(
+      frame_settings->enc->metadata.m.num_extra_channels);
+  for (auto& extra_channel : extra_channels) {
+    extra_channel = jxl::ImageF(xsize, ysize);
+  }
+  queued_frame->frame.SetExtraChannels(std::move(extra_channels));
+  for (auto& ec_info : frame_settings->enc->metadata.m.extra_channel_info) {
+    if (has_interleaved_alpha && ec_info.type == jxl::ExtraChannel::kAlpha) {
+      queued_frame->ec_initialized.push_back(1);
+      has_interleaved_alpha = 0;  // only first Alpha is initialized
+    } else {
+      queued_frame->ec_initialized.push_back(0);
+    }
+  }
+  queued_frame->frame.origin.x0 =
+      frame_settings->values.header.layer_info.crop_x0;
+  queued_frame->frame.origin.y0 =
+      frame_settings->values.header.layer_info.crop_y0;
+  queued_frame->frame.use_for_next_frame =
+      (frame_settings->values.header.layer_info.save_as_reference != 0u);
+  queued_frame->frame.blendmode =  // every non-REPLACE mode becomes kBlend
+      frame_settings->values.header.layer_info.blend_info.blendmode ==
+              JXL_BLEND_REPLACE
+          ? jxl::BlendMode::kReplace
+          : jxl::BlendMode::kBlend;
+  queued_frame->frame.blend =
+      frame_settings->values.header.layer_info.blend_info.source > 0;
+
+  if (JXL_ENC_SUCCESS !=
+      VerifyInputBitDepth(frame_settings->values.image_bit_depth,
+                          *pixel_format)) {
+    return JXL_API_ERROR_NOSET("Invalid input bit depth");  // NOSET variant
+  }
+  size_t bits_per_sample =
+      GetBitDepth(frame_settings->values.image_bit_depth,
+                  frame_settings->enc->metadata.m, *pixel_format);
+  const uint8_t* uint8_buffer = reinterpret_cast<const uint8_t*>(buffer);
+  if (!jxl::ConvertFromExternal(
+          jxl::Span<const uint8_t>(uint8_buffer, size), xsize, ysize, c_current,
+          bits_per_sample, *pixel_format,
+          frame_settings->enc->thread_pool.get(), &(queued_frame->frame))) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid input buffer");
+  }
+  if (frame_settings->values.lossless &&  // checked only after conversion
+      frame_settings->enc->metadata.m.xyb_encoded) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Set uses_original_profile=true for lossless encoding");
+  }
+  queued_frame->option_values.cparams.level =  // copy the encoder-wide level
+      frame_settings->enc->codestream_level;
+
+  QueueFrame(frame_settings, queued_frame);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc) {
+  if (!enc->wrote_bytes) {  // accepted only while wrote_bytes is still clear
+    enc->use_boxes = true;
+    return JXL_ENC_SUCCESS;
+  }
+  return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                       "this setting can only be set at the beginning");
+}
+
+JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, const JxlBoxType type,
+                                  const uint8_t* contents, size_t size,
+                                  JXL_BOOL compress_box) {  // queue one box
+  if (!enc->use_boxes) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "must set JxlEncoderUseBoxes at the beginning to add boxes");
+  }
+  if (compress_box) {  // these box types are refused when compression is on
+    if (memcmp("jxl", type, 3) == 0) {
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_API_USAGE,
+          "brob box may not contain a type starting with \"jxl\"");
+    }
+    if (memcmp("jbrd", type, 4) == 0) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "jbrd box may not be brob compressed");
+    }
+    if (memcmp("brob", type, 4) == 0) {
+      // The compress_box will compress an existing non-brob box into a brob
+      // box. If already giving a valid brotli-compressed brob box, set
+      // compress_box to false since it is already compressed.
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "a brob box cannot contain another brob box");
+    }
+  }
+
+  auto box = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedBox>(
+      &enc->memory_manager);
+  if (!box) return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC, "box alloc failed");
+  box->type = jxl::MakeBoxType(type);
+  box->contents.assign(contents, contents + size);  // the box owns a copy
+  box->compress_box = !!compress_box;
+  QueueBox(enc, box);
+  return JXL_ENC_SUCCESS;
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
+    const JxlEncoderOptions* frame_settings, const JxlPixelFormat* pixel_format,
+    const void* buffer, size_t size, uint32_t index) {
+  if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  if (!frame_settings->enc->basic_info_set ||
+      !frame_settings->enc->color_encoding_set) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Basic info has to be set first");
+  }
+  if (frame_settings->enc->input_queue.empty() ||  // newest entry needs a frame
+      frame_settings->enc->input_queue.back().frame.get() == nullptr) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "First add image frame, then extra channels");
+  }
+  if (frame_settings->enc->frames_closed) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Frame input already closed");
+  }
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+  JxlPixelFormat ec_format = *pixel_format;
+  ec_format.num_channels = 1;  // the channel count is forced to one here
+  if (JXL_ENC_SUCCESS !=
+      VerifyInputBitDepth(frame_settings->values.image_bit_depth, ec_format)) {
+    return JXL_API_ERROR_NOSET("Invalid input bit depth");
+  }
+  size_t bits_per_sample = GetBitDepth(
+      frame_settings->values.image_bit_depth,
+      frame_settings->enc->metadata.m.extra_channel_info[index], ec_format);
+  const uint8_t* uint8_buffer = reinterpret_cast<const uint8_t*>(buffer);
+  auto queued_frame = frame_settings->enc->input_queue.back().frame.get();
+  if (!jxl::ConvertFromExternal(jxl::Span<const uint8_t>(uint8_buffer, size),
+                                xsize, ysize, bits_per_sample, ec_format, 0,
+                                frame_settings->enc->thread_pool.get(),
+                                &queued_frame->frame.extra_channels()[index])) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Failed to set buffer for extra channel");
+  }
+  queued_frame->ec_initialized[index] = 1;  // this channel now has pixel data
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlEncoderCloseFrames(JxlEncoder* enc) { enc->frames_closed = true; }  // only sets the flag checked by the frame-adding calls
+
+void JxlEncoderCloseBoxes(JxlEncoder* enc) { enc->boxes_closed = true; }  // only sets the flag; nothing else happens here
+
+void JxlEncoderCloseInput(JxlEncoder* enc) {  // closes frames and boxes alike
+  JxlEncoderCloseFrames(enc);
+  JxlEncoderCloseBoxes(enc);
+}
+JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, uint8_t** next_out,
+                                         size_t* avail_out) {
+  while (*avail_out >= 32 &&  // stop once fewer than 32 bytes of room remain
+         (!enc->output_byte_queue.empty() ||
+          !enc->output_fast_frame_queue.empty() || !enc->input_queue.empty())) {
+    if (!enc->output_byte_queue.empty()) {  // 1) flush bytes already produced
+      size_t to_copy = std::min(*avail_out, enc->output_byte_queue.size());
+      std::copy_n(enc->output_byte_queue.begin(), to_copy, *next_out);
+      *next_out += to_copy;
+      *avail_out -= to_copy;
+      enc->output_byte_queue.erase(enc->output_byte_queue.begin(),
+                                   enc->output_byte_queue.begin() + to_copy);
+    } else if (!enc->output_fast_frame_queue.empty()) {  // 2) fast frames
+      size_t count = JxlFastLosslessWriteOutput(
+          enc->output_fast_frame_queue.front().get(), *next_out, *avail_out);
+      *next_out += count;
+      *avail_out -= count;
+      if (count == 0) {  // a zero-byte write pops the front frame
+        enc->output_fast_frame_queue.pop_front();
+      }
+
+    } else if (!enc->input_queue.empty()) {  // 3) process more queued input
+      if (enc->RefillOutputByteQueue() != JXL_ENC_SUCCESS) {
+        return JXL_ENC_ERROR;
+      }
+    }
+  }
+
+  if (!enc->output_byte_queue.empty() ||
+      !enc->output_fast_frame_queue.empty() || !enc->input_queue.empty()) {
+    return JXL_ENC_NEED_MORE_OUTPUT;  // some queue is still non-empty
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetFrameHeader(JxlEncoderOptions* frame_settings,
+                                          const JxlFrameHeader* frame_header) {
+  const auto& blend = frame_header->layer_info.blend_info;
+  if (blend.source > 3) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "invalid blending source index");
+  }
+  // Index 0 is accepted even with no extra channels; others must exist.
+  const size_t ec_count =
+      frame_settings->enc->metadata.m.extra_channel_info.size();
+  if (blend.alpha != 0 && blend.alpha >= ec_count) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "alpha blend channel index out of bounds");
+  }
+
+  // Storing a header also clears the frame name; call JxlEncoderSetFrameName
+  // again afterwards if a name is wanted.
+  frame_settings->values.header = *frame_header;
+  frame_settings->values.frame_name = "";
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
+    JxlEncoderOptions* frame_settings, size_t index,
+    const JxlBlendInfo* blend_info) {
+  const size_t ec_count = frame_settings->enc->metadata.m.num_extra_channels;
+  if (index >= ec_count) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  // Resize the per-channel list on demand; new slots get default blending.
+  auto& per_channel = frame_settings->values.extra_channel_blend_info;
+  if (per_channel.size() != ec_count) {
+    JxlBlendInfo default_blend_info;
+    JxlEncoderInitBlendInfo(&default_blend_info);
+    per_channel.resize(ec_count, default_blend_info);
+  }
+  per_channel[index] = *blend_info;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetFrameName(JxlEncoderFrameSettings* frame_settings,
+                                        const char* frame_name) {
+  const std::string name(frame_name != nullptr ? frame_name : "");  // null: ""
+  if (name.size() > 1071) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "frame name can be max 1071 bytes long");
+  }
+  frame_settings->values.header.name_length = name.size();
+  frame_settings->values.frame_name = name;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetFrameBitDepth(
+    JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth) {
+  if (bit_depth->type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT ||
+      bit_depth->type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    frame_settings->values.image_bit_depth = *bit_depth;  // the accepted types
+    return JXL_ENC_SUCCESS;
+  }
+  return JXL_API_ERROR_NOSET(
+      "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT and "
+      "JXL_BIT_DEPTH_FROM_CODESTREAM is implemented "
+      "for input buffers.");
+}
+
+void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding,
+                               JXL_BOOL is_gray) {
+  ConvertInternalToExternalColorEncoding(jxl::ColorEncoding::SRGB(is_gray),
+                                         color_encoding);  // SRGB -> caller
+}
+
+void JxlColorEncodingSetToLinearSRGB(JxlColorEncoding* color_encoding,
+                                     JXL_BOOL is_gray) {
+  ConvertInternalToExternalColorEncoding(  // LinearSRGB -> caller's struct
+      jxl::ColorEncoding::LinearSRGB(is_gray), color_encoding);
+}
+
+void JxlEncoderAllowExpertOptions(JxlEncoder* enc) {
+  enc->allow_expert_options = true;  // one-way switch: nothing here clears it
+}
diff --git a/third_party/jpeg-xl/lib/jxl/encode_internal.h b/third_party/jpeg-xl/lib/jxl/encode_internal.h
new file mode 100644
index 0000000000..7713c5cab6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/encode_internal.h
@@ -0,0 +1,275 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+#ifndef LIB_JXL_ENCODE_INTERNAL_H_
+#define LIB_JXL_ENCODE_INTERNAL_H_
+
+#include <jxl/encode.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+
+#include <deque>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/enc_fast_lossless.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/memory_manager_internal.h"
+
+namespace jxl {
+
+/* The frame index box 'jxli' starts with the following fields:
+NF: has type Varint(): number of frames listed in the index.
+TNUM: has type u32: numerator of tick unit.
+TDEN: has type u32: denominator of tick unit. Value 0 means the file is
+ill-formed. Then, per frame i listed:
+OFFi: has type Varint(): offset of start byte of this frame compared to start
+byte of previous frame from this index in the JPEG XL codestream. For the first
+frame, this is the offset from the first byte of the JPEG XL codestream.
+Ti: has type Varint(): duration in ticks between the start of this frame and
+the start of the next frame in the index. If this is the last frame in the
+index, this is the duration in ticks between the start of this frame and the
+end of the stream. A tick lasts TNUM / TDEN seconds.
+Fi: has type Varint(): amount of frames the next frame in the index occurs
+after this frame. If this is the last frame in the index, this is the amount of
+frames after this frame in the remainder of the stream. Only frames that are
+presented by the decoder are counted for this purpose, this excludes frames
+that are not intended for display but for compositing with other frames, such
+as frames that aren't the last frame with a duration of 0 ticks.
+
+All the frames listed in jxli are keyframes and the first frame is present in
+the list. There shall be either zero or one Frame Index boxes in a JPEG XL
+file. The offsets OFFi per frame are given as bytes in the codestream, not as
+bytes in the file format using the box structure. This means if JPEG XL Partial
+Codestream boxes are used, the offset is counted within the concatenated
+codestream, bytes from box headers or non-codestream boxes are not counted.
+*/
+
+typedef struct JxlEncoderFrameIndexBoxEntryStruct {
+  bool to_be_indexed;  // consulted by StoreFrameIndexBox()
+  uint32_t duration;   // copied from AddFrame()'s duration argument
+  uint64_t OFFi;       // copied from AddFrame()'s OFFi argument
+} JxlEncoderFrameIndexBoxEntry;
+
+typedef struct JxlEncoderFrameIndexBoxStruct {
+  // We always need to record the first frame entry, so presence of the
+  // first entry alone is not an indication if it was requested to be
+  // stored.
+  bool index_box_requested_through_api = false;
+
+  // Number of entries currently recorded through AddFrame().
+  int64_t NF() const { return static_cast<int64_t>(entries.size()); }
+  // True if at least one recorded entry is flagged to_be_indexed.
+  bool StoreFrameIndexBox() const {
+    for (const auto& entry : entries) {
+      if (entry.to_be_indexed) return true;
+    }
+    return false;
+  }
+  int32_t TNUM = 1;
+  int32_t TDEN = 1000;
+
+  std::vector<JxlEncoderFrameIndexBoxEntry> entries;
+
+  // That way we can ensure that every index box will have the first frame.
+  // If the API user decides to mark it as an indexed frame, we call
+  // the AddFrame again, this time with requested.
+  void AddFrame(uint64_t OFFi, uint32_t duration, bool to_be_indexed) {
+    // We call AddFrame to every frame.
+    // Recording the first frame is required by the standard.
+    // Knowing the last frame is required, since the last indexed frame
+    // needs to know how many frames until the end.
+    // To be able to tell how many frames there are between each index
+    // entry we just record every frame here.
+    const bool repeats_first_frame =
+        entries.size() == 1 && OFFi == entries[0].OFFi;
+    if (repeats_first_frame) {
+      // API use for the first frame, let's clear the already recorded first
+      // frame.
+      entries.clear();
+    }
+    JxlEncoderFrameIndexBoxEntry entry;
+    entry.to_be_indexed = to_be_indexed;
+    entry.OFFi = OFFi;
+    entry.duration = duration;
+    entries.push_back(entry);
+  }
+} JxlEncoderFrameIndexBox;
+
+// The encoder options (such as quality, compression speed, ...) for a single
+// frame, but not encoder-wide options such as box-related options.
+typedef struct JxlEncoderFrameSettingsValuesStruct {
+  // lossless is a separate setting from cparams because it is a combination
+  // setting that overrides multiple settings inside of cparams.
+  bool lossless;
+  CompressParams cparams;
+  JxlFrameHeader header;  // name_length follows JxlEncoderSetFrameName()
+  std::vector<JxlBlendInfo> extra_channel_blend_info;  // indexed by channel
+  std::string frame_name;  // written by JxlEncoderSetFrameName()
+  JxlBitDepth image_bit_depth;  // written by JxlEncoderSetFrameBitDepth()
+  bool frame_index_box = false;
+} JxlEncoderFrameSettingsValues;
+
+typedef std::array<uint8_t, 4> BoxType;
+
+// Utility function that makes a BoxType from a string literal. Exactly type[0]
+// through type[3] are read, so a 5th null termination character is optional.
+constexpr BoxType MakeBoxType(const char* type) {
+  return BoxType(
+      {{static_cast<uint8_t>(type[0]), static_cast<uint8_t>(type[1]),
+        static_cast<uint8_t>(type[2]), static_cast<uint8_t>(type[3])}});
+}
+
+constexpr unsigned char kContainerHeader[] = {
+    0,   0,   0, 0xc, 'J',  'X', 'L', ' ', 0xd, 0xa, 0x87,
+    0xa, 0,   0, 0,   0x14, 'f', 't', 'y', 'p', 'j', 'x',
+    'l', ' ', 0, 0,   0,    0,   'j', 'x', 'l', ' '};
+
+constexpr unsigned char kLevelBoxHeader[] = {0, 0, 0, 0x9, 'j', 'x', 'l', 'l'};
+
+struct JxlEncoderQueuedFrame {
+  JxlEncoderFrameSettingsValues option_values;  // stored by value
+  ImageBundle frame;
+  std::vector<uint8_t> ec_initialized;  // NOTE(review): likely per-EC flags
+};
+
+struct JxlEncoderQueuedBox {
+  BoxType type;  // four bytes, see the BoxType typedef
+  std::vector<uint8_t> contents;
+  bool compress_box;  // no default initializer here
+};
+
+using FJXLFrameUniquePtr =
+    std::unique_ptr<JxlFastLosslessFrameState,
+                    decltype(&JxlFastLosslessFreeFrameState)>;
+
+// Either a frame, or a box, not both.
+// Can also be a FJXL frame. All three pointers are constructed as null.
+struct JxlEncoderQueuedInput {
+  explicit JxlEncoderQueuedInput(const JxlMemoryManager& memory_manager)
+      : frame(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)),
+        box(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)) {}
+  MemoryManagerUniquePtr<JxlEncoderQueuedFrame> frame;
+  MemoryManagerUniquePtr<JxlEncoderQueuedBox> box;
+  FJXLFrameUniquePtr fast_lossless_frame = {nullptr,
+                                            JxlFastLosslessFreeFrameState};
+};
+
+// Appends a JXL container box header with given type, size, and unbounded
+// properties to output. `size` is the payload size; the stored box size also
+// counts the header: 8 bytes, or 16 when the 64-bit extended size is needed.
+template <typename T>
+void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded,
+                     T* output) {
+  // An unbounded box stores size 0 in its header.
+  uint64_t box_size = 0;
+  bool large_size = false;
+  if (!unbounded) {
+    box_size = static_cast<uint64_t>(size) + 8;
+    if (box_size >= 0x100000000ull) {
+      large_size = true;
+      box_size += 8;  // the extended size field is part of the box as well
+    }
+  }
+  // 32-bit size field; 1 means a 64-bit size follows the type.
+  const uint64_t store = large_size ? 1 : box_size;
+  for (size_t i = 0; i < 4; i++) {
+    output->push_back(store >> (8 * (3 - i)) & 0xff);
+  }
+  for (size_t i = 0; i < 4; i++) {
+    output->push_back(type[i]);
+  }
+  if (large_size) {
+    for (size_t i = 0; i < 8; i++) {
+      output->push_back(box_size >> (8 * (7 - i)) & 0xff);
+    }
+  }
+}
+
+}  // namespace jxl
+
+// Internal use only struct, can only be initialized correctly by
+// JxlEncoderCreate: most members below have no default member initializer.
+struct JxlEncoderStruct {
+  JxlEncoderError error = JxlEncoderError::JXL_ENC_ERR_OK;
+  JxlMemoryManager memory_manager;
+  jxl::MemoryManagerUniquePtr<jxl::ThreadPool> thread_pool{
+      nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)};
+  JxlCmsInterface cms;
+  std::vector<jxl::MemoryManagerUniquePtr<JxlEncoderFrameSettings>>
+      encoder_options;
+
+  size_t num_queued_frames;
+  size_t num_queued_boxes;
+  std::vector<jxl::JxlEncoderQueuedInput> input_queue;
+  std::deque<uint8_t> output_byte_queue;
+  std::deque<jxl::FJXLFrameUniquePtr> output_fast_frame_queue;
+
+  // How many codestream bytes have been written, i.e.,
+  // content of jxlc and jxlp boxes. Frame index box jxli
+  // requires position indices to point to codestream bytes,
+  // so we need to keep track of the total of flushed or queue
+  // codestream bytes. These bytes may be in a single jxlc box
+  // or across multiple jxlp boxes.
+  size_t codestream_bytes_written_beginning_of_frame;
+  size_t codestream_bytes_written_end_of_frame;
+  jxl::JxlEncoderFrameIndexBox frame_index_box;
+
+  // Force using the container even if not needed
+  bool use_container;
+  // User declared they will add metadata boxes
+  bool use_boxes;
+
+  // TODO(lode): move level into jxl::CompressParams since some C++
+  // implementation decisions should be based on it: level 10 allows more
+  // features to be used.
+  int32_t codestream_level;
+  bool store_jpeg_metadata;
+  jxl::CodecMetadata metadata;
+  std::vector<uint8_t> jpeg_metadata;
+
+  // Wrote any output at all, so wrote the data before the first user added
+  // frame or box, such as signature, basic info, ICC profile or jpeg
+  // reconstruction box.
+  bool wrote_bytes;
+  jxl::CompressParams last_used_cparams;
+  JxlBasicInfo basic_info;
+
+  // Encoder wrote a jxlp (partial codestream) box, so any next codestream
+  // parts must also be written in jxlp boxes, a single jxlc box cannot be
+  // used. The counter is used for the 4-byte jxlp box index header.
+  size_t jxlp_counter;
+
+  bool frames_closed;
+  bool boxes_closed;
+  bool basic_info_set;
+  bool color_encoding_set;
+  bool intensity_target_set;
+  bool allow_expert_options = false;
+  int brotli_effort = -1;
+
+  // Takes the first frame in the input_queue, encodes it, and appends
+  // the bytes to the output_byte_queue.
+  JxlEncoderStatus RefillOutputByteQueue();
+
+  bool MustUseContainer() const {  // levels 5 and -1 alone do not force it
+    return use_container || (codestream_level != 5 && codestream_level != -1) ||
+           store_jpeg_metadata || use_boxes;
+  }
+
+  // Appends the bytes of a JXL box header with the provided type and size to
+  // the end of the output_byte_queue. If unbounded is true, the size won't be
+  // added to the header and the box will be assumed to continue until EOF.
+  void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded);
+};
+
+struct JxlEncoderFrameSettingsStruct {
+  JxlEncoder* enc;  // encoder handed to JXL_API_ERROR by the setters
+  jxl::JxlEncoderFrameSettingsValues values;
+};
+
+#endif  // LIB_JXL_ENCODE_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/encode_test.cc b/third_party/jpeg-xl/lib/jxl/encode_test.cc
new file mode 100644
index 0000000000..3f1d77fd62
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/encode_test.cc
@@ -0,0 +1,1405 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/jxl.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+TEST(EncodeTest, AddFrameAfterCloseInputTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  // Close the input first; the frame added at the end must then be refused.
+  JxlEncoderCloseInput(enc.get());
+
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  // Basic info and color encoding were both accepted above; the frame itself
+  // is the call that is expected to return an error.
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+}
+
+TEST(EncodeTest, AddJPEGAfterCloseTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  JxlEncoderCloseInput(enc.get());
+
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  // The input was closed above, so adding the JPEG frame must be refused.
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+}
+
+TEST(EncodeTest, AddFrameBeforeColorEncodingTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = true;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  // Only the basic info is provided here; JxlEncoderSetColorEncoding is
+  // deliberately never called, so adding the frame below is expected to
+  // return an error.
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+}
+
+TEST(EncodeTest, AddFrameBeforeBasicInfoTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  // JxlEncoderSetBasicInfo is deliberately never called in this test: both
+  // setting the color encoding and adding a frame are expected to return an
+  // error.
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+}
+
+TEST(EncodeTest, DefaultAllocTest) {
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);  // nullptr: no custom manager
+  EXPECT_NE(nullptr, enc);
+  JxlEncoderDestroy(enc);  // raw create/destroy pair, no JxlEncoderPtr here
+}
+
+TEST(EncodeTest, CustomAllocTest) {
+  // Tallies the calls the encoder makes into the custom memory manager.
+  struct CalledCounters {
+    int allocs = 0;
+    int frees = 0;
+  } calls;
+  JxlMemoryManager manager;
+  manager.opaque = &calls;
+  manager.alloc = [](void* opaque, size_t size) {
+    static_cast<CalledCounters*>(opaque)->allocs++;
+    return malloc(size);
+  };
+  manager.free = [](void* opaque, void* address) {
+    static_cast<CalledCounters*>(opaque)->frees++;
+    free(address);
+  };
+
+  {  // scope bounds the encoder's lifetime
+    JxlEncoderPtr enc = JxlEncoderMake(&manager);
+    EXPECT_NE(nullptr, enc.get());
+    EXPECT_LE(1, calls.allocs);
+    EXPECT_EQ(0, calls.frees);
+  }
+  EXPECT_LE(1, calls.frees);  // leaving the scope must have freed something
+}
+
+TEST(EncodeTest, DefaultParallelRunnerTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  EXPECT_EQ(JXL_ENC_SUCCESS,  // null runner and null opaque must be accepted
+            JxlEncoderSetParallelRunner(enc.get(), nullptr, nullptr));
+}
+
+void VerifyFrameEncoding(size_t xsize, size_t ysize, JxlEncoder* enc,
+                         const JxlEncoderFrameSettings* frame_settings,
+                         size_t max_compressed_size,
+                         bool lossy_use_original_profile) {
+  // Encodes a 4-channel 16-bit test image; checks output size and distance.
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile =
+      frame_settings->values.lossless || lossy_use_original_profile;
+  // 16-bit alpha means this requires level 10
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding, true);
+  EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlColorEncodingSetToSRGB(&color_encoding, false);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  pixel_format.num_channels = 1;
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  pixel_format.num_channels = 4;
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_LE(compressed.size(), max_compressed_size);
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+  jxl::CodecInOut decoded_io;
+  EXPECT_TRUE(jxl::test::DecodeFile(
+      {}, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+      &decoded_io));
+
+  // Pick the limit before EXPECT_LE: #if inside macro arguments is undefined.
+#if JXL_HIGH_PRECISION
+  const double max_distance = 1.84;
+#else
+  const double max_distance = 8.7;
+#endif
+  EXPECT_LE(
+      ComputeDistance2(input_io.Main(), decoded_io.Main(), jxl::GetJxlCms()),
+      max_distance);
+}
+
+void VerifyFrameEncoding(JxlEncoder* enc,
+                         const JxlEncoderFrameSettings* frame_settings) {
+  VerifyFrameEncoding(63, 129, enc, frame_settings, 2700,  // 63x129, <= 2700 B
+                      /*lossy_use_original_profile=*/false);
+}
+
+TEST(EncodeTest, FrameEncodingTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  VerifyFrameEncoding(enc.get(),  // frame settings left untouched after create
+                      JxlEncoderFrameSettingsCreate(enc.get(), nullptr));
+}
+
+TEST(EncodeTest, EncoderResetTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  // First image: 50x200, compressed size limit 4300 bytes.
+  auto* first_settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  VerifyFrameEncoding(50, 200, enc.get(), first_settings, 4300, false);
+  // Encoder should become reusable for a new image from scratch after using
+  // reset.
+  JxlEncoderReset(enc.get());
+  // Second image: 157x77, limit 2300 bytes, with freshly created settings.
+  auto* second_settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  VerifyFrameEncoding(157, 77, enc.get(), second_settings, 2300, false);
+}
+
+TEST(EncodeTest, CmsTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  bool cms_called = false;  // flipped by the wrapping init callback below
+  JxlCmsInterface cms = jxl::GetJxlCms();
+  struct InitData {
+    void* original_init_data;
+    jpegxl_cms_init_func original_init;
+    bool* cms_called;
+  };
+  InitData init_data = {/*original_init_data=*/cms.init_data,
+                        /*original_init=*/cms.init,
+                        /*cms_called=*/&cms_called};
+  cms.init_data = &init_data;  // read back in the wrapper via raw_init_data
+  cms.init = +[](void* raw_init_data, size_t num_threads,
+                 size_t pixels_per_thread, const JxlColorProfile* input_profile,
+                 const JxlColorProfile* output_profile,
+                 float intensity_target) {
+    const InitData* init_data = static_cast<const InitData*>(raw_init_data);
+    *init_data->cms_called = true;  // record the call, then delegate
+    return init_data->original_init(init_data->original_init_data, num_threads,
+                                    pixels_per_thread, input_profile,
+                                    output_profile, intensity_target);
+  };
+  JxlEncoderSetCms(enc.get(), cms);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  JxlEncoderSetFrameLossless(frame_settings, false);
+  ASSERT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderFrameSettingsSetOption(frame_settings,
+                                             JXL_ENC_FRAME_SETTING_EFFORT, 8));
+  VerifyFrameEncoding(enc.get(), frame_settings);
+  EXPECT_TRUE(cms_called);  // the encode above must have gone through cms.init
+}
+
+TEST(EncodeTest, frame_settingsTest) {
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 5));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(jxl::SpeedTier::kHare, enc->last_used_cparams.speed_tier);
+  }
+  // Effort values outside the accepted range are rejected.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    // Lower than currently supported values
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 0));
+    // Higher than currently supported values
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 11));
+  }
+  // Lossless mode must show up in the cparams used for the frame.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3600, false);
+    EXPECT_EQ(true, enc->last_used_cparams.IsLossless());
+  }
+  // A custom distance is carried over to the used cparams.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetFrameDistance(frame_settings, 0.5));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3030, false);
+    EXPECT_EQ(0.5, enc->last_used_cparams.butteraugli_distance);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    // Disallowed negative distance
+    EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetFrameDistance(frame_settings, -1));
+  }
+  // Decoding speed option ends up in decoding_speed_tier.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, 2));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(2u, enc->last_used_cparams.decoding_speed_tier);
+  }
+  // Group order: value 100 is rejected, value 1 selects center-first order.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER, 100));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER, 1));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X, 5));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(true, enc->last_used_cparams.centerfirst);
+    EXPECT_EQ(5, enc->last_used_cparams.center_x);
+  }
+  // Responsive and progressive options.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_RESPONSIVE, 0));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC, -1));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 2830,
+                        /*lossy_use_original_profile=*/false);
+    EXPECT_EQ(false, enc->last_used_cparams.responsive);
+    EXPECT_EQ(true, enc->last_used_cparams.progressive_mode);
+    EXPECT_EQ(2, enc->last_used_cparams.progressive_dc);
+  }
+  // Photon noise is a float option.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, 1777.777));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_NEAR(1777.777f, enc->last_used_cparams.photon_noise_iso, 1E-4);
+  }
+  // Channel color percentages and palette options.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, 55.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, 25.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PALETTE_COLORS, 70000));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_LOSSY_PALETTE, 1));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_NEAR(55.0f,
+                enc->last_used_cparams.channel_colors_pre_transform_percent,
+                1E-6);
+    EXPECT_NEAR(25.0f, enc->last_used_cparams.channel_colors_percent, 1E-6);
+    EXPECT_EQ(70000, enc->last_used_cparams.palette_colors);
+    EXPECT_EQ(true, enc->last_used_cparams.lossy_palette);
+  }
+  // Modular options; the 77 percent setting is expected back as 0.77.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, 30));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE, 2));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 14));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings,
+            JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, 77.0f));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS, 7));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(30, enc->last_used_cparams.colorspace);
+    EXPECT_EQ(2, enc->last_used_cparams.modular_group_size_shift);
+    EXPECT_EQ(jxl::Predictor::Best, enc->last_used_cparams.options.predictor);
+    EXPECT_NEAR(0.77f, enc->last_used_cparams.options.nb_repeats, 1E-6);
+    EXPECT_EQ(7, enc->last_used_cparams.options.max_properties);
+  }
+  // JPEG reconstruction CFL switched off ...
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, 0));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(false, enc->last_used_cparams.force_cfl_jpeg_recompression);
+  }
+  // ... and switched on.
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, 1));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(true, enc->last_used_cparams.force_cfl_jpeg_recompression);
+  }
+}
+
+TEST(EncodeTest, LossyEncoderUseOriginalProfileTest) {
+  {  // settings as created, original profile requested
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7897, true);
+  }
+  {  // same, with progressive DC set to 2
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 8310, true);
+  }
+  {  // same, with effort set to 8
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+    ASSERT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 8));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7173, true);
+  }
+}
+
+namespace {
+// Returns a copy of buf[offset, offset + size), or a zeroed vector of `size`
+// bytes if that range is not fully inside buf (including the case where
+// offset + size would overflow size_t).
+std::vector<uint8_t> SliceSpan(const jxl::Span<const uint8_t>& buf,
+                               size_t offset, size_t size) {
+  // Overflow-safe form of `offset + size > buf.size()`. A range that ends
+  // exactly at the end of buf is valid, so the comparison must not be `>=`.
+  const bool in_bounds = offset <= buf.size() && size <= buf.size() - offset;
+  if (!in_bounds) {
+    return std::vector<uint8_t>(size, 0);
+  }
+  return std::vector<uint8_t>(buf.data() + offset, buf.data() + offset + size);
+}
+
+struct Box {
+  // The type of the box. If "uuid", use extended_type instead.
+  char type[4] = {0, 0, 0, 0};
+
+  // Only used when type == "uuid". Extended types are not used in JXL, but the
+  // box format itself supports them so they are handled correctly.
+  char extended_type[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Box data.
+  jxl::Span<const uint8_t> data = jxl::Span<const uint8_t>(nullptr, 0);
+
+  // If the size is not given, the datasize extends to the end of the file.
+  // If this field is false, the size field is not encoded when the box is
+  // serialized.
+  bool data_size_given = true;
+
+  // If successful, returns true and sets `in` to be the rest data (if any).
+  // If `in` is too short to hold the whole box (its header or the size it
+  // declares), will not modify `in`, and will return true but the data
+  // `Span<uint8_t>` will remain set to nullptr.
+  // If unsuccessful, returns error and doesn't modify `in`.
+  jxl::Status Decode(jxl::Span<const uint8_t>* in) {
+    // Total box_size including this header itself.
+    uint64_t box_size = LoadBE32(SliceSpan(*in, 0, 4).data());
+    size_t pos = 4;
+
+    memcpy(type, SliceSpan(*in, pos, 4).data(), 4);
+    pos += 4;
+
+    if (box_size == 1) {
+      // If the size is 1, it indicates extended size read from 64-bit integer.
+      box_size = LoadBE64(SliceSpan(*in, pos, 8).data());
+      pos += 8;
+    }
+
+    if (!memcmp("uuid", type, 4)) {
+      memcpy(extended_type, SliceSpan(*in, pos, 16).data(), 16);
+      pos += 16;
+    }
+
+    // End of the box header, the box data begins here. Handle the data size.
+    const size_t header_size = pos;
+
+    if (box_size != 0) {
+      if (box_size < header_size) {
+        return JXL_FAILURE("Invalid box size");
+      }
+      if (box_size > in->size()) {
+        // The box is fine, but the input is too short.
+        return true;
+      }
+      data_size_given = true;
+      data = jxl::Span<const uint8_t>(in->data() + header_size,
+                                      box_size - header_size);
+    } else {
+      // SliceSpan zero-fills reads past the end, so a truncated header can
+      // land here; bail out before in->size() - header_size underflows.
+      if (in->size() < header_size) return true;
+      data_size_given = false;
+      data = jxl::Span<const uint8_t>(in->data() + header_size,
+                                      in->size() - header_size);
+    }
+
+    *in = jxl::Span<const uint8_t>(in->data() + header_size + data.size(),
+                                   in->size() - header_size - data.size());
+    return true;
+  }
+};
+
+struct Container {
+  // Boxes found after the signature and ftyp boxes, in the order decoded.
+  std::vector<Box> boxes;
+
+  // Parses the "JXL " signature box, the "ftyp" box and then every following
+  // box, appending the latter to `boxes` and advancing `in` past each decoded
+  // box. Returns true on success, including when the input stops short of a
+  // complete box. Returns an error if the signature or ftyp box is not the
+  // expected one or if a box fails to decode; `in` may already have advanced.
+  jxl::Status Decode(jxl::Span<const uint8_t>* in) {
+    boxes.clear();
+
+    static const uint8_t kSignature[4] = {0xd, 0xa, 0x87, 0xa};
+    Box magic;
+    JXL_RETURN_IF_ERROR(magic.Decode(in));
+    const bool magic_ok = memcmp("JXL ", magic.type, 4) == 0 &&
+                          magic.data.size() == 4 &&
+                          memcmp(kSignature, magic.data.data(), 4) == 0;
+    if (!magic_ok) {
+      return JXL_FAILURE("Invalid magic signature");
+    }
+
+    // Payload of the ftyp box; the embedded zero bytes are significant.
+    static const char kFtyp[] = "jxl \0\0\0\0jxl ";
+    Box brand;
+    JXL_RETURN_IF_ERROR(brand.Decode(in));
+    const bool brand_ok = memcmp("ftyp", brand.type, 4) == 0 &&
+                          brand.data.size() == 12 &&
+                          memcmp(kFtyp, brand.data.data(), 12) == 0;
+    if (!brand_ok) {
+      return JXL_FAILURE("Invalid ftyp");
+    }
+
+    while (!in->empty()) {
+      Box next = {};
+      JXL_RETURN_IF_ERROR(next.Decode(in));
+      // A null payload means the box was announced but is not complete yet.
+      if (next.data.data() == nullptr) break;
+      boxes.push_back(next);
+    }
+    return true;
+  }
+};
+
+}  // namespace
+
+TEST(EncodeTest, SingleFrameBoundedJXLCTest) {
+  // Encodes one 4-channel frame into a container and checks that every box
+  // parsed from the output carries an explicit size.
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), true));
+  auto* settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  const size_t width = 71;
+  const size_t height = 23;
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> rgba = jxl::test::GetSomeTestImage(width, height, 4, 0);
+
+  JxlBasicInfo info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&info, &format);
+  info.xsize = width;
+  info.ysize = height;
+  info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &info));
+  JxlColorEncoding srgb;
+  JxlColorEncodingSetToSRGB(&srgb, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc.get(), &srgb));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(settings, &format, rgba.data(),
+                                    rgba.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  // Pull the encoder output, doubling the buffer until everything fits.
+  std::vector<uint8_t> compressed(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size();
+  JxlEncoderStatus status;
+  do {
+    status = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (status == JXL_ENC_NEED_MORE_OUTPUT) {
+      const size_t used = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + used;
+      avail_out = compressed.size() - used;
+    }
+  } while (status == JXL_ENC_NEED_MORE_OUTPUT);
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, status);
+
+  Container container = {};
+  jxl::Span<const uint8_t> remaining(compressed.data(), compressed.size());
+  EXPECT_TRUE(container.Decode(&remaining));
+  EXPECT_EQ(0u, remaining.size());
+  bool found_jxlc = false;
+  bool found_jxlp = false;
+  // The encoder is allowed to either emit a jxlc or one or more jxlp.
+  for (const Box& box : container.boxes) {
+    const bool is_jxlc = memcmp("jxlc", box.type, 4) == 0;
+    const bool is_jxlp = memcmp("jxlp", box.type, 4) == 0;
+    if (is_jxlc) {
+      EXPECT_EQ(false, found_jxlc);  // Max 1 jxlc
+      EXPECT_EQ(false, found_jxlp);  // Can't mix jxlc and jxlp
+      found_jxlc = true;
+    } else if (is_jxlp) {
+      EXPECT_EQ(false, found_jxlc);  // Can't mix jxlc and jxlp
+      found_jxlp = true;
+    }
+    // With a single frame the encoder knows the full size in time, so no
+    // box, including the final one, should be left unbounded; giving the
+    // size helps make decoding more efficient.
+    EXPECT_EQ(true, box.data_size_given);
+  }
+  EXPECT_EQ(true, found_jxlc || found_jxlp);
+}
+
+TEST(EncodeTest, CodestreamLevelTest) {
+  // Checks that the first box after the signature and ftyp boxes is "jxll".
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  auto* frame_settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  Container container = {};
+  jxl::Span<const uint8_t> encoded_span(compressed.data(), compressed.size());
+  EXPECT_TRUE(container.Decode(&encoded_span));
+  EXPECT_EQ(0u, encoded_span.size());
+  ASSERT_FALSE(container.boxes.empty());  // boxes[0] below needs one box
+  EXPECT_EQ(0, memcmp("jxll", container.boxes[0].type, 4));
+}
+
+TEST(EncodeTest, CodestreamLevelVerificationTest) {
+  // Checks the required level and the per-level image size limits.
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
+
+  JxlBasicInfo info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&info, &format);
+  info.xsize = 64;
+  info.ysize = 64;
+  info.uses_original_profile = false;
+
+  JxlEncoderPtr owner = JxlEncoderMake(nullptr);
+  JxlEncoder* const encoder = owner.get();
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(encoder, &info));
+  EXPECT_EQ(5, JxlEncoderGetRequiredCodestreamLevel(encoder));
+
+  // A width of 2^30 is too large for level 5 but fits in level 10: it is
+  // rejected while level 5 is selected and accepted once level 10 is.
+  info.xsize = 1ull << 30ull;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(encoder, 5));
+  EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetBasicInfo(encoder, &info));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(encoder, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(encoder, &info));
+  EXPECT_EQ(10, JxlEncoderGetRequiredCodestreamLevel(encoder));
+
+  // A width of 2^31 is too large even for level 10.
+  info.xsize = 1ull << 31ull;
+  EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetBasicInfo(encoder, &info));
+}
+
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
+  // Transcodes a JPEG file and checks the decoder returns identical bytes.
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  auto* frame_settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  jxl::extras::JXLDecompressParams dparams;
+  std::vector<uint8_t> decoded_jpeg_bytes;
+  jxl::extras::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &decoded_jpeg_bytes));
+  // Fatal on a size mismatch: memcmp below reads orig.size() bytes of both.
+  ASSERT_EQ(decoded_jpeg_bytes.size(), orig.size());
+  EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size()));
+}
+
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(ProgressiveJPEGReconstructionTest)) {
+  // Same check as the non-progressive test, with progressive_mode enabled.
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  auto* frame_settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  frame_settings->values.cparams.progressive_mode = true;
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  jxl::extras::JXLDecompressParams dparams;
+  std::vector<uint8_t> decoded_jpeg_bytes;
+  jxl::extras::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &decoded_jpeg_bytes));
+  // Fatal on a size mismatch: memcmp below reads orig.size() bytes of both.
+  ASSERT_EQ(decoded_jpeg_bytes.size(), orig.size());
+  EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size()));
+}
+
+// Drains pending encoder output into `compressed`, growing it as needed (an
+// empty buffer restarts at 64 bytes), then trims it to the bytes written.
+static void ProcessEncoder(JxlEncoder* enc, std::vector<uint8_t>& compressed,
+                           uint8_t*& next_out, size_t& avail_out) {
+  JxlEncoderStatus process_result;
+  while ((process_result = JxlEncoderProcessOutput(
+              enc, &next_out, &avail_out)) == JXL_ENC_NEED_MORE_OUTPUT) {
+    const size_t used = next_out - compressed.data();
+    compressed.resize(compressed.empty() ? 64 : compressed.size() * 2);
+    next_out = compressed.data() + used;
+    avail_out = compressed.size() - used;
+  }
+  const size_t written = next_out - compressed.data();
+  compressed.resize(written);
+  next_out = compressed.data() + written;
+  avail_out = compressed.size() - written;
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+}
+
+TEST(EncodeTest, BasicInfoTest) {
+  // Round-trips a JxlBasicInfo through the encoder and decoder and compares
+  // the fields reported by the decoder with the ones given to the encoder.
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  auto* settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  const size_t width = 1;
+  const size_t height = 1;
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> rgba = jxl::test::GetSomeTestImage(width, height, 4, 0);
+
+  // Values that the decoder is expected to report back.
+  JxlBasicInfo info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&info, &format);
+  info.xsize = width;
+  info.ysize = height;
+  info.uses_original_profile = false;
+  info.have_animation = true;
+  info.intensity_target = 123.4;
+  info.min_nits = 5.0;
+  info.linear_below = 12.7;
+  info.orientation = JXL_ORIENT_ROTATE_90_CW;
+  info.intrinsic_xsize = 88;
+  info.intrinsic_ysize = 99;
+  info.animation.tps_numerator = 55;
+  info.animation.tps_denominator = 77;
+  info.animation.num_loops = 10;
+  info.animation.have_timecodes = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &info));
+  JxlColorEncoding srgb;
+  JxlColorEncodingSetToSRGB(&srgb, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc.get(), &srgb));
+
+  // Queue the frame, close the frame list and collect the encoder output.
+  std::vector<uint8_t> compressed(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size();
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(settings, &format, rgba.data(),
+                                    rgba.size()));
+  JxlEncoderCloseFrames(enc.get());
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+  // Subscribe only to the basic info event; no pixels are decoded.
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BASIC_INFO));
+  // Allow testing the orientation field, without this setting it will be
+  // overridden to identity.
+  JxlDecoderSetKeepOrientation(dec.get(), JXL_TRUE);
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  for (;;) {
+    const JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_SUCCESS) break;
+    // An error, or any event other than the subscribed one, fails the test.
+    if (status != JXL_DEC_BASIC_INFO) FAIL();
+
+    JxlBasicInfo decoded;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec.get(), &decoded));
+
+    // Image dimensions and sample layout.
+    EXPECT_EQ(info.xsize, decoded.xsize);
+    EXPECT_EQ(info.ysize, decoded.ysize);
+    EXPECT_EQ(info.bits_per_sample, decoded.bits_per_sample);
+    EXPECT_EQ(info.exponent_bits_per_sample, decoded.exponent_bits_per_sample);
+
+    // Floating point fields are compared with a tolerance of 0.5.
+    EXPECT_NEAR(info.intensity_target, decoded.intensity_target, 0.5);
+    EXPECT_NEAR(info.min_nits, decoded.min_nits, 0.5);
+    EXPECT_NEAR(info.linear_below, decoded.linear_below, 0.5);
+
+    // Flags, orientation and intrinsic size.
+    EXPECT_EQ(info.relative_to_max_display, decoded.relative_to_max_display);
+    EXPECT_EQ(info.uses_original_profile, decoded.uses_original_profile);
+    EXPECT_EQ(info.orientation, decoded.orientation);
+    EXPECT_EQ(info.intrinsic_xsize, decoded.intrinsic_xsize);
+    EXPECT_EQ(info.intrinsic_ysize, decoded.intrinsic_ysize);
+
+    // Channels.
+    EXPECT_EQ(info.num_color_channels, decoded.num_color_channels);
+    // TODO(lode): also test num_extra_channels, but currently there may be a
+    // mismatch between 0 and 1 if there is alpha, until encoder support for
+    // extra channels is fully implemented.
+    EXPECT_EQ(info.alpha_bits, decoded.alpha_bits);
+    EXPECT_EQ(info.alpha_exponent_bits, decoded.alpha_exponent_bits);
+    EXPECT_EQ(info.alpha_premultiplied, decoded.alpha_premultiplied);
+
+    // Preview size is only compared when a preview was requested.
+    EXPECT_EQ(info.have_preview, decoded.have_preview);
+    if (info.have_preview) {
+      EXPECT_EQ(info.preview.xsize, decoded.preview.xsize);
+      EXPECT_EQ(info.preview.ysize, decoded.preview.ysize);
+    }
+
+    // Animation header is only compared when animation was requested.
+    EXPECT_EQ(info.have_animation, decoded.have_animation);
+    if (info.have_animation) {
+      EXPECT_EQ(info.animation.tps_numerator, decoded.animation.tps_numerator);
+      EXPECT_EQ(info.animation.tps_denominator,
+                decoded.animation.tps_denominator);
+      EXPECT_EQ(info.animation.num_loops, decoded.animation.num_loops);
+      EXPECT_EQ(info.animation.have_timecodes,
+                decoded.animation.have_timecodes);
+    }
+  }
+}
+
+TEST(EncodeTest, AnimationHeaderTest) {
+  // Sets frame header fields, a frame name and an extra channel blend mode on
+  // the encoder and checks that the decoder reports the same values.
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  auto* settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  const size_t width = 1;
+  const size_t height = 1;
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> rgba = jxl::test::GetSomeTestImage(width, height, 4, 0);
+
+  // 1x1 animated image with tps numerator 1000 and denominator 1.
+  JxlBasicInfo info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&info, &format);
+  info.xsize = width;
+  info.ysize = height;
+  info.have_animation = true;
+  info.animation.tps_numerator = 1000;
+  info.animation.tps_denominator = 1;
+  info.animation.have_timecodes = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &info));
+  JxlColorEncoding srgb;
+  JxlColorEncodingSetToSRGB(&srgb, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc.get(), &srgb));
+
+  // Frame header, frame name and blend info for extra channel 0.
+  const std::string name = "test frame";
+  JxlFrameHeader sent;
+  JxlEncoderInitFrameHeader(&sent);
+  sent.duration = 50;
+  sent.timecode = 800;
+  sent.layer_info.blend_info.blendmode = JXL_BLEND_BLEND;
+  sent.layer_info.blend_info.source = 2;
+  sent.layer_info.blend_info.clamp = 1;
+  JxlBlendInfo sent_ec_blend;
+  JxlEncoderInitBlendInfo(&sent_ec_blend);
+  sent_ec_blend.blendmode = JXL_BLEND_MULADD;
+  JxlEncoderSetFrameHeader(settings, &sent);
+  JxlEncoderSetExtraChannelBlendInfo(settings, 0, &sent_ec_blend);
+  JxlEncoderSetFrameName(settings, name.c_str());
+
+  // Queue the frame, close the frame list and collect the encoder output.
+  std::vector<uint8_t> compressed(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size();
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(settings, &format, rgba.data(),
+                                    rgba.size()));
+  JxlEncoderCloseFrames(enc.get());
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+  // Subscribe only to the frame event; no pixel output is requested.
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+
+  // To test the blend_info fields, coalescing must be set to false in the
+  // decoder.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec.get(), JXL_FALSE));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  bool seen_frame = false;
+
+  for (;;) {
+    const JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_SUCCESS) break;
+    // An error, or any event other than the subscribed one, fails the test.
+    if (status != JXL_DEC_FRAME) FAIL();
+    seen_frame = true;
+
+    JxlFrameHeader got;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec.get(), &got));
+    EXPECT_EQ(sent.duration, got.duration);
+    EXPECT_EQ(sent.timecode, got.timecode);
+    const auto& sent_blend = sent.layer_info.blend_info;
+    const auto& got_blend = got.layer_info.blend_info;
+    EXPECT_EQ(sent_blend.blendmode, got_blend.blendmode);
+    EXPECT_EQ(sent_blend.clamp, got_blend.clamp);
+    EXPECT_EQ(sent_blend.source, got_blend.source);
+    EXPECT_EQ(name.size(), got.name_length);
+
+    JxlBlendInfo got_ec_blend;
+    JxlDecoderGetExtraChannelBlendInfo(dec.get(), 0, &got_ec_blend);
+    EXPECT_EQ(sent_ec_blend.blendmode, got_ec_blend.blendmode);
+
+    if (got.name_length > 0) {
+      // The buffer is one byte longer than the name and is trimmed back to
+      // name_length once the decoder has filled it in.
+      std::string got_name(got.name_length + 1, '\0');
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderGetFrameName(dec.get(), &got_name.front(),
+                                       got_name.size()));
+      got_name.resize(got.name_length);
+      EXPECT_EQ(name, got_name);
+    }
+  }
+
+  EXPECT_EQ(true, seen_frame);
+}
+TEST(EncodeTest, CroppedFrameTest) {
+  // Losslessly encodes a frame that is larger than the image and positioned
+  // partly outside it, then decodes without coalescing and checks that the
+  // whole frame comes back unchanged.
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  auto* settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  const size_t frame_width = 300;
+  const size_t frame_height = 300;
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> source =
+      jxl::test::GetSomeTestImage(frame_width, frame_height, 4, 0);
+  std::vector<uint8_t> decoded(source.size());
+
+  JxlBasicInfo info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&info, &format);
+  // Encoding a 300x300 frame in an image that is only 100x100
+  info.xsize = 100;
+  info.ysize = 100;
+  info.uses_original_profile = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &info));
+  JxlColorEncoding srgb;
+  JxlColorEncodingSetToSRGB(&srgb, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc.get(), &srgb));
+
+  JxlFrameHeader sent;
+  JxlEncoderInitFrameHeader(&sent);
+  sent.layer_info.have_crop = JXL_TRUE;
+  sent.layer_info.xsize = frame_width;
+  sent.layer_info.ysize = frame_height;
+  sent.layer_info.crop_x0 = -50;
+  sent.layer_info.crop_y0 = -250;
+  JxlEncoderSetFrameLossless(settings, JXL_TRUE);
+  JxlEncoderSetFrameHeader(settings, &sent);
+  JxlEncoderFrameSettingsSetOption(settings, JXL_ENC_FRAME_SETTING_EFFORT, 1);
+
+  // Queue the frame, close the frame list and collect the encoder output.
+  std::vector<uint8_t> compressed(100);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size();
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(settings, &format, source.data(),
+                                    source.size()));
+  JxlEncoderCloseFrames(enc.get());
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+  // Non-coalesced decoding so we can get the full uncropped frame
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec.get(), JXL_FALSE));
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  bool seen_frame = false;
+  bool checked_frame = false;
+  for (;;) {
+    const JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_SUCCESS) break;
+    if (status == JXL_DEC_FRAME) {
+      seen_frame = true;
+      JxlFrameHeader got;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec.get(), &got));
+      EXPECT_EQ(sent.layer_info.xsize, got.layer_info.xsize);
+      EXPECT_EQ(sent.layer_info.ysize, got.layer_info.ysize);
+      EXPECT_EQ(sent.layer_info.crop_x0, got.layer_info.crop_x0);
+      EXPECT_EQ(sent.layer_info.crop_y0, got.layer_info.crop_y0);
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec.get(), &format,
+                                            decoded.data(), decoded.size()));
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      EXPECT_EQ(0, memcmp(source.data(), decoded.data(), source.size()));
+      checked_frame = true;
+    } else {
+      FAIL();  // JXL_DEC_ERROR or an unexpected status
+    }
+  }
+  EXPECT_EQ(true, checked_frame);
+  EXPECT_EQ(true, seen_frame);
+}
+
+TEST(EncodeTest, JXL_BOXES_TEST(BoxTest)) {
+  // Test with uncompressed boxes and with brob boxes
+  for (int compress_box = 0; compress_box <= 1; ++compress_box) {
+    // Tests adding two metadata boxes with the encoder: an exif box before the
+    // image frame, and an xml box after the image frame. Then verifies the
+    // decoder can decode them, they are in the expected place, and have the
+    // correct content after decoding.
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseBoxes(enc.get()));
+
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    size_t xsize = 50;
+    size_t ysize = 17;
+    JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+    JxlBasicInfo basic_info;
+    jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+    basic_info.xsize = xsize;
+    basic_info.ysize = ysize;
+    basic_info.uses_original_profile = false;
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+    JxlColorEncoding color_encoding;
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/false);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+
+    std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+    uint8_t* next_out = compressed.data();
+    size_t avail_out = compressed.size() - (next_out - compressed.data());
+
+    // Add an early metadata box. Also add a valid 4-byte TIFF offset header
+    // before the fake exif data of these box contents.
+    constexpr const char* exif_test_string = "\0\0\0\0exif test data";
+    const uint8_t* exif_data =
+        reinterpret_cast<const uint8_t*>(exif_test_string);
+    // Skip the 4 zeroes for strlen
+    const size_t exif_size = 4 + strlen(exif_test_string + 4);
+    JxlEncoderAddBox(enc.get(), "Exif", exif_data, exif_size, compress_box);
+
+    // Write to output
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+    // Add image frame
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                      pixels.data(), pixels.size()));
+    // Indicate this is the last frame
+    JxlEncoderCloseFrames(enc.get());
+
+    // Write to output
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+    // Add a late metadata box
+    constexpr const char* xml_test_string = "<some random xml data>";
+    const uint8_t* xml_data = reinterpret_cast<const uint8_t*>(xml_test_string);
+    size_t xml_size = strlen(xml_test_string);
+    JxlEncoderAddBox(enc.get(), "XML ", xml_data, xml_size, compress_box);
+
+    // Indicate this is the last box
+    JxlEncoderCloseBoxes(enc.get());
+
+    // Write to output
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+    // Decode to verify the boxes, we don't decode to pixels, only the boxes.
+    JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+    EXPECT_NE(nullptr, dec.get());
+
+    if (compress_box) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetDecompressBoxes(dec.get(), JXL_TRUE));
+    }
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec.get(), JXL_DEC_FRAME | JXL_DEC_BOX));
+
+    JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+    JxlDecoderCloseInput(dec.get());
+
+    std::vector<uint8_t> dec_exif_box(exif_size);
+    std::vector<uint8_t> dec_xml_box(xml_size);
+
+    for (bool post_frame = false;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+      if (status == JXL_DEC_ERROR) {
+        FAIL();
+      } else if (status == JXL_DEC_SUCCESS) {
+        EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
+        break;
+      } else if (status == JXL_DEC_FRAME) {
+        post_frame = true;
+      } else if (status == JXL_DEC_BOX) {
+        // Since we gave the exif/xml box output buffer of the exact known
+        // correct size, 0 bytes should be released. Same when no buffer was
+        // set.
+        EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec.get(), type, true));
+        if (!memcmp(type, "Exif", 4)) {
+          // This box should have been encoded before the image frame
+          EXPECT_EQ(false, post_frame);
+          JxlDecoderSetBoxBuffer(dec.get(), dec_exif_box.data(),
+                                 dec_exif_box.size());
+        } else if (!memcmp(type, "XML ", 4)) {
+          // This box should have been encoded after the image frame
+          EXPECT_EQ(true, post_frame);
+          JxlDecoderSetBoxBuffer(dec.get(), dec_xml_box.data(),
+                                 dec_xml_box.size());
+        }
+      } else {
+        FAIL();  // unexpected status
+      }
+    }
+
+    EXPECT_EQ(0, memcmp(exif_data, dec_exif_box.data(), exif_size));
+    EXPECT_EQ(0, memcmp(xml_data, dec_xml_box.data(), xml_size));
+  }
+}
+
+#if JPEGXL_ENABLE_JPEG  // Loading .jpg files requires libjpeg support.
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGFrameTest)) {
+  // Transcodes a JPEG with every permitted mix of explicit/omitted metadata.
+  for (int omit_info = 0; omit_info < 2; omit_info++) {
+    for (int omit_color = 0; omit_color < 2; omit_color++) {
+      // A color encoding can only be set once basic info has been set.
+      if (omit_info && !omit_color) continue;
+      const std::string jpeg_path = "jxl/flower/flower_cropped.jpg";
+      const jxl::PaddedBytes jpeg_data = jxl::test::ReadTestData(jpeg_path);
+      jxl::CodecInOut source_io;
+      ASSERT_TRUE(SetFromBytes(jxl::Span<const uint8_t>(jpeg_data), &source_io,
+                               /*pool=*/nullptr));
+
+      JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+      JxlEncoderFrameSettings* frame_settings =
+          JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+      JxlEncoderFrameSettingsSetOption(frame_settings,
+                                       JXL_ENC_FRAME_SETTING_EFFORT, 1);
+      if (!omit_info) {
+        JxlBasicInfo basic_info;
+        JxlEncoderInitBasicInfo(&basic_info);
+        basic_info.xsize = source_io.xsize();
+        basic_info.ysize = source_io.ysize();
+        basic_info.uses_original_profile = true;
+        EXPECT_EQ(JXL_ENC_SUCCESS,
+                  JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+      }
+      if (!omit_color) {
+        JxlColorEncoding color_encoding;
+        JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
+        EXPECT_EQ(JXL_ENC_SUCCESS,
+                  JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+      }
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderAddJPEGFrame(frame_settings, jpeg_data.data(),
+                                       jpeg_data.size()));
+      JxlEncoderCloseInput(enc.get());
+
+      std::vector<uint8_t> compressed(64);
+      uint8_t* next_out = compressed.data();
+      size_t avail_out = compressed.size();
+      JxlEncoderStatus status;
+      // Drain the encoder; on overflow double the buffer and rebase the cursor.
+      for (;;) {
+        status = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+        if (status != JXL_ENC_NEED_MORE_OUTPUT) break;
+        const size_t written = next_out - compressed.data();
+        compressed.resize(compressed.size() * 2);
+        next_out = compressed.data() + written;
+        avail_out = compressed.size() - written;
+      }
+      compressed.resize(next_out - compressed.data());
+      EXPECT_EQ(JXL_ENC_SUCCESS, status);
+
+      jxl::CodecInOut decoded_io;
+      EXPECT_TRUE(jxl::test::DecodeFile(
+          {}, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          &decoded_io));
+
+      EXPECT_LE(ComputeDistance2(source_io.Main(), decoded_io.Main(),
+                                 jxl::GetJxlCms()),
+                3.5);
+    }
+  }
+}
+#endif  // JPEGXL_ENABLE_JPEG
diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder.cc b/third_party/jpeg-xl/lib/jxl/entropy_coder.cc
new file mode 100644
index 0000000000..0043c2d31e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/entropy_coder.cc
@@ -0,0 +1,70 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/entropy_coder.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map) {
+  // One flag bit selects the default map; nothing else is read in that case.
+  if (br->ReadFixedBits<1>()) {
+    *block_ctx_map = BlockCtxMap();
+    return true;
+  }
+  auto& dc_thresholds = block_ctx_map->dc_thresholds;
+  auto& qf_thresholds = block_ctx_map->qf_thresholds;
+  // Each of the three lists with k thresholds multiplies num_dc_ctxs by k + 1.
+  block_ctx_map->num_dc_ctxs = 1;
+  for (size_t c = 0; c < 3; c++) {
+    dc_thresholds[c].resize(br->ReadFixedBits<4>());
+    block_ctx_map->num_dc_ctxs *= dc_thresholds[c].size() + 1;
+    for (int& threshold : dc_thresholds[c]) {
+      threshold = UnpackSigned(U32Coder::Read(kDCThresholdDist, br));
+    }
+  }
+  qf_thresholds.resize(br->ReadFixedBits<4>());
+  for (uint32_t& threshold : qf_thresholds) {
+    threshold = U32Coder::Read(kQFThresholdDist, br) + 1;
+  }
+  const size_t num_block_ctxs =
+      block_ctx_map->num_dc_ctxs * (qf_thresholds.size() + 1);
+  if (num_block_ctxs > 64) {
+    return JXL_FAILURE("Invalid block context map: too big");
+  }
+  block_ctx_map->ctx_map.resize(3 * kNumOrders * num_block_ctxs);
+  JXL_RETURN_IF_ERROR(DecodeContextMap(&block_ctx_map->ctx_map,
+                                       &block_ctx_map->num_ctxs, br));
+  if (block_ctx_map->num_ctxs > 16) {
+    return JXL_FAILURE("Invalid block context map: too many distinct contexts");
+  }
+  return true;
+}
+
+constexpr uint8_t BlockCtxMap::kDefaultCtxMap[];  // out-of-class definition; declared in ac_context.h
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder.h b/third_party/jpeg-xl/lib/jxl/entropy_coder.h
new file mode 100644
index 0000000000..e4afa7a631
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/entropy_coder.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENTROPY_CODER_H_
+#define LIB_JXL_ENTROPY_CODER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+// Entropy coding and context modeling of DC and AC coefficients, as well as AC
+// strategy and quantization field.
+
+namespace jxl {
+
+static JXL_INLINE int32_t PredictFromTopAndLeft(
+    const int32_t* const JXL_RESTRICT row_top,
+    const int32_t* const JXL_RESTRICT row, size_t x, int32_t default_val) {
+  // Use whichever of the top / left neighbors exist; average when both do.
+  const bool has_top = row_top != nullptr;
+  const bool has_left = x != 0;
+  if (has_top && has_left) return (row_top[x] + row[x - 1] + 1) / 2;
+  if (has_top) return row_top[x];
+  if (has_left) return row[x - 1];
+  return default_val;
+}
+
+static constexpr U32Enc kDCThresholdDist(Bits(4), BitsOffset(8, 16),
+                                         BitsOffset(16, 272),
+                                         BitsOffset(32, 65808));  // U32 distribution for DC thresholds
+// U32 distribution for QF thresholds (DecodeBlockCtxMap stores the value + 1).
+static constexpr U32Enc kQFThresholdDist(Bits(2), BitsOffset(3, 4),
+                                         BitsOffset(5, 12), BitsOffset(8, 44));
+
+Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENTROPY_CODER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc b/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc
new file mode 100644
index 0000000000..9dbeb137af
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc
@@ -0,0 +1,68 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(deymo): Move these tests to dec_ans.h and common.h
+
+#include <stdint.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(EntropyCoderTest, PackUnpack) {
+  for (int32_t value = -31; value <= 31; ++value) {
+    const uint32_t packed = PackSigned(value);
+    EXPECT_LT(packed, 63u);  // Codes for [-31, 31] must stay below 63.
+    const int32_t restored = UnpackSigned(packed);
+    EXPECT_EQ(value, restored);  // The mapping must round-trip exactly.
+  }
+}
+
+struct DummyBitReader {
+  uint32_t nbits, bits;  // Expected request width and the value to serve.
+  void Consume(uint32_t /*count*/) {}  // No state to advance.
+  uint32_t PeekBits(uint32_t n) {
+    EXPECT_EQ(n, nbits);
+    return bits;
+  }
+};
+
+void HybridUintRoundtrip(HybridUintConfig config, size_t limit = 1 << 24) {
+  // Encode seeded pseudo-random integers, then decode and compare each one.
+  Rng rng(0);
+  constexpr size_t kCount = 1 << 20;
+  std::vector<uint32_t> values(kCount), tokens(kCount);
+  std::vector<uint32_t> widths(kCount), payloads(kCount);
+  for (size_t i = 0; i < kCount; ++i) {
+    values[i] = rng.UniformU(0, limit + 1);
+    config.Encode(values[i], &tokens[i], &widths[i], &payloads[i]);
+  }
+  for (size_t i = 0; i < kCount; ++i) {
+    DummyBitReader reader{widths[i], payloads[i]};
+    const auto decoded =
+        ANSSymbolReader::ReadHybridUintConfig(config, tokens[i], &reader);
+    EXPECT_EQ(values[i], decoded);
+  }
+}
+
+TEST(HybridUintTest, Test000) {  // default limit (1 << 24)
+  HybridUintRoundtrip(HybridUintConfig{0, 0, 0});
+}
+TEST(HybridUintTest, Test411) {  // default limit (1 << 24)
+  HybridUintRoundtrip(HybridUintConfig{4, 1, 1});
+}
+TEST(HybridUintTest, Test420) {  // default limit (1 << 24)
+  HybridUintRoundtrip(HybridUintConfig{4, 2, 0});
+}
+TEST(HybridUintTest, Test421) {  // the only case run with an explicit limit
+  HybridUintRoundtrip(HybridUintConfig{4, 2, 1}, 256);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/epf.cc b/third_party/jpeg-xl/lib/jxl/epf.cc
new file mode 100644
index 0000000000..7288ed9ca6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/epf.cc
@@ -0,0 +1,146 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Edge-preserving smoothing: weighted average based on L1 patch similarity.
+
+#include "lib/jxl/epf.h"
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <atomic>
+#include <numeric>  // std::accumulate
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+// Reflects the n floats p[0..n) into p[-1], p[-2], ..., p[-n].
+JXL_INLINE void LeftMirror(float* p, size_t n) {
+  for (size_t k = 1; k <= n; k++) {
+    *(p - k) = p[k - 1];
+  }
+}
+
+// Reflects the n floats p[-1], p[-2], ..., p[-n] into p[0..n).
+JXL_INLINE void RightMirror(float* p, size_t n) {
+  for (size_t k = 1; k <= n; k++) {
+    p[k - 1] = *(p - k);
+  }
+}
+
+void ComputeSigma(const Rect& block_rect, PassesDecoderState* state) {
+  const LoopFilter& lf = state->shared->frame_header.loop_filter;
+  JXL_CHECK(lf.epf_iters > 0);
+  const AcStrategyImage& ac_strategy = state->shared->ac_strategy;
+  const float quant_scale = state->shared->quantizer.Scale();
+  // Row strides, needed to address rows other than the current one.
+  const size_t sigma_stride = state->sigma.PixelsPerRow();
+  const size_t sharpness_stride = state->shared->epf_sharpness.PixelsPerRow();
+
+  for (size_t by = 0; by < block_rect.ysize(); ++by) {
+    float* JXL_RESTRICT sigma_row = block_rect.Row(&state->sigma, by);
+    const uint8_t* JXL_RESTRICT sharpness_row =
+        block_rect.ConstRow(state->shared->epf_sharpness, by);
+    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
+    const int32_t* const JXL_RESTRICT row_quant =
+        block_rect.ConstRow(state->shared->raw_quant_field, by);
+
+    for (size_t bx = 0; bx < block_rect.xsize(); bx++) {
+      AcStrategy acs = acs_row[bx];
+      size_t llf_x = acs.covered_blocks_x();
+      if (!acs.IsFirstBlock()) continue;  // all covered blocks are filled below
+      // quant_scale is smaller for low quality; it is roughly
+      // 0.08 / butteraugli score.
+      //
+      // row_quant is smaller for low quality; the quantization multiplier
+      // has the form 1.0 / row_quant[bx].
+      //
+      // lf.epf_quant_mul is a parameter read from the frame header's loop
+      // filter settings. kInvSigmaNum is a negative constant (see epf.h), so
+      // sigma_quant and sigma are expected to be negative here.
+      float sigma_quant =
+          lf.epf_quant_mul / (quant_scale * row_quant[bx] * kInvSigmaNum);
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          float sigma =
+              sigma_quant *
+              lf.epf_sharp_lut[sharpness_row[bx + ix + iy * sharpness_stride]];
+          // Avoid infinities: keep sigma <= -1e-4 before storing 1 / sigma.
+          sigma = std::min(-1e-4f, sigma);  // TODO(veluca): remove this.
+          sigma_row[bx + ix + kSigmaPadding +
+                    (iy + kSigmaPadding) * sigma_stride] = 1.0f / sigma;
+        }
+      }
+      // TODO(veluca): remove this padding.
+      // Left padding with mirroring (only when at absolute block column 0).
+      if (bx + block_rect.x0() == 0) {
+        for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+          LeftMirror(
+              sigma_row + kSigmaPadding + (iy + kSigmaPadding) * sigma_stride,
+              kSigmaBorder);
+        }
+      }
+      // Right padding with mirroring (only when ending at the last column).
+      if (bx + block_rect.x0() + llf_x ==
+          state->shared->frame_dim.xsize_blocks) {
+        for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+          RightMirror(sigma_row + kSigmaPadding + bx + llf_x +
+                          (iy + kSigmaPadding) * sigma_stride,
+                      kSigmaBorder);
+        }
+      }
+      // Offsets for row copying, in blocks.
+      size_t offset_before = bx + block_rect.x0() == 0 ? 1 : bx + kSigmaPadding;
+      size_t offset_after =
+          bx + block_rect.x0() + llf_x == state->shared->frame_dim.xsize_blocks
+              ? kSigmaPadding + llf_x + bx + kSigmaBorder
+              : kSigmaPadding + llf_x + bx;
+      size_t num = offset_after - offset_before;
+      // Above: mirror rows upwards when at absolute block row 0.
+      if (by + block_rect.y0() == 0) {
+        for (size_t iy = 0; iy < kSigmaBorder; iy++) {
+          memcpy(
+              sigma_row + offset_before +
+                  (kSigmaPadding - 1 - iy) * sigma_stride,
+              sigma_row + offset_before + (kSigmaPadding + iy) * sigma_stride,
+              num * sizeof(*sigma_row));
+        }
+      }
+      // Below: mirror rows downwards when the transform ends at the last row.
+      if (by + block_rect.y0() + acs.covered_blocks_y() ==
+          state->shared->frame_dim.ysize_blocks) {
+        for (size_t iy = 0; iy < kSigmaBorder; iy++) {
+          memcpy(
+              sigma_row + offset_before +
+                  sigma_stride * (acs.covered_blocks_y() + kSigmaPadding + iy),
+              sigma_row + offset_before +
+                  sigma_stride *
+                      (acs.covered_blocks_y() + kSigmaPadding - 1 - iy),
+              num * sizeof(*sigma_row));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/epf.h b/third_party/jpeg-xl/lib/jxl/epf.h
new file mode 100644
index 0000000000..7a0834ed97
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/epf.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_EPF_H_
+#define LIB_JXL_EPF_H_
+
+// Fast SIMD "in-loop" edge preserving filter (adaptive, nonlinear).
+
+#include <stddef.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/passes_state.h"
+
+namespace jxl {
+
+// 4 * (sqrt(0.5)-1), so that Weight(sigma) = 0.5.
+static constexpr float kInvSigmaNum = -1.1715728752538099024f;
+
+// kInvSigmaNum / 0.3
+constexpr float kMinSigma = -3.90524291751269967465540850526868f;
+
+// Fills the `state->sigma` image with the precomputed inverse sigma
+// values in the area inside `block_rect`. Accesses the AC strategy, quant field
+// and epf_sharpness fields in the corresponding positions.
+void ComputeSigma(const Rect& block_rect, PassesDecoderState* state);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_EPF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/exif.h b/third_party/jpeg-xl/lib/jxl/exif.h
new file mode 100644
index 0000000000..0cf493fc71
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/exif.h
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_EXIF_H_
+#define LIB_JXL_EXIF_H_
+
+// Basic parsing of Exif (just enough for the render-impacting things
+// like orientation)
+
+#include <jxl/codestream_header.h>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/image_metadata.h"
+
+namespace jxl {
+
+constexpr uint16_t kExifOrientationTag = 274;
+
+// Checks whether a blob starts with a TIFF header as used by Exif and, if so,
+// sets *bigendian according to the tiff endianness.
+inline bool IsExif(const std::vector<uint8_t>& exif, bool* bigendian) {
+  if (exif.size() < 12) return false;  // not enough bytes for a valid exif blob
+  // Classify the 4-byte header: byte-order mark followed by the magic 42.
+  const auto header = LoadLE32(exif.data());
+  const bool is_big = header == 0x2A004D4D;     // bytes 'M' 'M' 0x00 0x2A
+  const bool is_little = header == 0x002A4949;  // bytes 'I' 'I' 0x2A 0x00
+  if (!is_big && !is_little) {
+    return false;  // not a valid tiff header
+  }
+  *bigendian = is_big;
+  return true;
+}
+
+// Finds the position of an Exif tag, or 0 if it is not found
+inline size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
+                                  uint16_t tagname) {
+  bool bigendian;
+  if (!IsExif(exif, &bigendian)) return 0;
+  const uint8_t* data = exif.data();
+  // Bytes 4..7 hold the offset at which the tag count and entries start.
+  const uint64_t offset = (bigendian ? LoadBE32(data + 4) : LoadLE32(data + 4));
+  // This size check also guarantees that the 2-byte tag count is readable.
+  if (exif.size() < 12 + offset + 2 || offset < 8) return 0;
+  size_t pos = static_cast<size_t>(offset);
+  uint16_t nb_tags = (bigendian ? LoadBE16(data + pos) : LoadLE16(data + pos));
+  pos += 2;
+  // Walk the 12-byte entries by index so no out-of-range pointer is formed.
+  for (; nb_tags > 0; nb_tags--, pos += 12) {
+    if (pos + 12 >= exif.size()) return 0;
+    const uint16_t tag =
+        (bigendian ? LoadBE16(data + pos) : LoadLE16(data + pos));
+    if (tag == tagname) return pos + 2;  // position right after the tag id
+  }
+  return 0;
+}
+
+// TODO (jon): tag 1 can be used to represent Adobe RGB 1998 if it has value
+// "R03"
+// TODO (jon): set intrinsic dimensions according to
+// https://discourse.wicg.io/t/proposal-exif-image-resolution-auto-and-from-image/4326/24
+// Parses the Exif data just enough to extract any render-impacting info.
+// If the Exif data is invalid or could not be parsed, then it is treated
+// as a no-op.
+inline void InterpretExif(const std::vector<uint8_t>& exif,
+                          JxlOrientation* orientation) {
+  bool bigendian;
+  if (!IsExif(exif, &bigendian)) return;
+  const size_t o_pos = FindExifTagPosition(exif, kExifOrientationTag);
+  if (o_pos == 0) return;  // no orientation tag: leave *orientation untouched
+  // Entry layout after the tag id: type (2 bytes), count (4), value (from 6).
+  const uint8_t* entry = exif.data() + o_pos;
+  const uint16_t type = bigendian ? LoadBE16(entry) : LoadLE16(entry);
+  const uint32_t count = bigendian ? LoadBE32(entry + 2) : LoadLE32(entry + 2);
+  const uint16_t value = bigendian ? LoadBE16(entry + 6) : LoadLE16(entry + 6);
+  // Only a single value of type 3 in [1, 8] is accepted as an orientation.
+  const bool is_orientation =
+      type == 3 && count == 1 && value >= 1 && value <= 8;
+  if (is_orientation) {
+    *orientation = static_cast<JxlOrientation>(value);
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_EXIF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h b/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h
new file mode 100644
index 0000000000..508d808cc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h
@@ -0,0 +1,79 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
+#define LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
+
+#include <jxl/parallel_runner.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/random.h"
+
+namespace jxl {
+
+// A parallel runner implementation that runs all the jobs in a single thread
+// (the caller thread) but runs them pretending to use multiple threads and
+// potentially out of order. This is useful for testing conditions that only
+// occur under heavy load where the order of operations is different.
+class FakeParallelRunner {
+ public:
+  FakeParallelRunner(uint32_t order_seed, uint32_t num_threads)
+      : order_seed_(order_seed),
+        rng_(order_seed),
+        num_threads_(num_threads < 1 ? 1 : num_threads) {}
+
+  // Calls init once, then func for each index in [start, end) on this thread.
+  JxlParallelRetCode Run(void* jxl_opaque, JxlParallelRunInit init,
+                         JxlParallelRunFunction func, uint32_t start,
+                         uint32_t end) {
+    const JxlParallelRetCode init_result = init(jxl_opaque, num_threads_);
+    if (init_result != 0) return init_result;
+
+    if (order_seed_ == 0) {
+      for (uint32_t task = start; task < end; task++) {
+        func(jxl_opaque, task, task % num_threads_);
+      }
+      return init_result;
+    }
+    // Non-zero seed: enumerate the tasks, shuffle, then run in that order.
+    std::vector<uint32_t> order(end - start);
+    for (uint32_t task = start; task < end; task++) {
+      order[task - start] = task;
+    }
+    rng_.Shuffle(order.data(), order.size());
+    for (uint32_t slot = start; slot < end; slot++) {
+      func(jxl_opaque, order[slot - start], slot % num_threads_);
+    }
+    return init_result;
+  }
+
+ private:
+  // Seed for the execution order; 0 means sequential order from start to end.
+  uint32_t order_seed_;
+
+  // PRNG seeded with order_seed_; only consulted when the seed is not 0.
+  Rng rng_;
+
+  // Number of fake threads. Every task runs on the caller's thread but is
+  // handed a thread id in [0, num_threads_).
+  uint32_t num_threads_;
+};
+
+}  // namespace jxl
+
+extern "C" {
+// Runner callback: runner_opaque must point to a jxl::FakeParallelRunner.
+JXL_INLINE JxlParallelRetCode JxlFakeParallelRunner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  auto* runner = static_cast<jxl::FakeParallelRunner*>(runner_opaque);
+  return runner->Run(jpegxl_opaque, init, func, start_range, end_range);
+}
+}
+
+#endif  // LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h
new file mode 100644
index 0000000000..d2453b0e10
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h
@@ -0,0 +1,238 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_FAST_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_FAST_DCT_INL_H_
+#undef LIB_JXL_FAST_DCT_INL_H_
+#else
+#define LIB_JXL_FAST_DCT_INL_H_
+#endif
+
+#include <cmath>
+
+#include <hwy/aligned_allocator.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/status.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#if HWY_TARGET == HWY_NEON
+HWY_NOINLINE void FastTransposeBlock(const int16_t* JXL_RESTRICT data_in,
+                                     size_t stride_in, size_t N, size_t M,
+                                     int16_t* JXL_RESTRICT data_out,
+                                     size_t stride_out) {
+  JXL_DASSERT(N % 8 == 0);  // Rows are consumed in groups of 8 (index i).
+  JXL_DASSERT(M % 8 == 0);  // Columns are consumed in groups of 8 (index j).
+  for (size_t i = 0; i < N; i += 8) {
+    for (size_t j = 0; j < M; j += 8) {
+      // TODO(veluca): one could optimize the M==8, stride_in==8 case further
+      // with vld4.
+      // NOTE(review): "this code" presumably refers to the commented-out
+      // variant below, which is about 40% faster for N == M == stride_in ==
+      // stride_out == 8. Using loads + stores to reshuffle things to be able
+      // to use vld4 doesn't help.
+      /*
+      auto a0 = vld4q_s16(data_in); auto a1 = vld4q_s16(data_in + 32);
+      int16x8x4_t out0;
+      int16x8x4_t out1;
+      out0.val[0] = vuzp1q_s16(a0.val[0], a1.val[0]);
+      out0.val[1] = vuzp1q_s16(a0.val[1], a1.val[1]);
+      out0.val[2] = vuzp1q_s16(a0.val[2], a1.val[2]);
+      out0.val[3] = vuzp1q_s16(a0.val[3], a1.val[3]);
+      out1.val[0] = vuzp2q_s16(a0.val[0], a1.val[0]);
+      out1.val[1] = vuzp2q_s16(a0.val[1], a1.val[1]);
+      out1.val[2] = vuzp2q_s16(a0.val[2], a1.val[2]);
+      out1.val[3] = vuzp2q_s16(a0.val[3], a1.val[3]);
+      vst1q_s16_x4(data_out, out0);
+      vst1q_s16_x4(data_out + 32, out1);
+      */
+      auto a0 = vld1q_s16(data_in + i * stride_in + j);
+      auto a1 = vld1q_s16(data_in + (i + 1) * stride_in + j);
+      auto a2 = vld1q_s16(data_in + (i + 2) * stride_in + j);
+      auto a3 = vld1q_s16(data_in + (i + 3) * stride_in + j);
+
+      auto a01 = vtrnq_s16(a0, a1);
+      auto a23 = vtrnq_s16(a2, a3);
+
+      auto four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.val[0]),
+                             vreinterpretq_s32_s16(a23.val[0]));
+      auto four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.val[1]),
+                             vreinterpretq_s32_s16(a23.val[1]));
+
+      auto a4 = vld1q_s16(data_in + (i + 4) * stride_in + j);
+      auto a5 = vld1q_s16(data_in + (i + 5) * stride_in + j);
+      auto a6 = vld1q_s16(data_in + (i + 6) * stride_in + j);
+      auto a7 = vld1q_s16(data_in + (i + 7) * stride_in + j);
+
+      auto a45 = vtrnq_s16(a4, a5);
+      auto a67 = vtrnq_s16(a6, a7);
+
+      auto four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.val[0]),
+                             vreinterpretq_s32_s16(a67.val[0]));
+      auto four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.val[1]),
+                             vreinterpretq_s32_s16(a67.val[1]));
+
+      auto out0 =
+          vcombine_s32(vget_low_s32(four0.val[0]), vget_low_s32(four2.val[0]));
+      auto out1 =
+          vcombine_s32(vget_low_s32(four1.val[0]), vget_low_s32(four3.val[0]));
+      auto out2 =
+          vcombine_s32(vget_low_s32(four0.val[1]), vget_low_s32(four2.val[1]));
+      auto out3 =
+          vcombine_s32(vget_low_s32(four1.val[1]), vget_low_s32(four3.val[1]));
+      auto out4 = vcombine_s32(vget_high_s32(four0.val[0]),
+                               vget_high_s32(four2.val[0]));
+      auto out5 = vcombine_s32(vget_high_s32(four1.val[0]),
+                               vget_high_s32(four3.val[0]));
+      auto out6 = vcombine_s32(vget_high_s32(four0.val[1]),
+                               vget_high_s32(four2.val[1]));
+      auto out7 = vcombine_s32(vget_high_s32(four1.val[1]),
+                               vget_high_s32(four3.val[1]));
+      vst1q_s16(data_out + j * stride_out + i, vreinterpretq_s16_s32(out0));
+      vst1q_s16(data_out + (j + 1) * stride_out + i,
+                vreinterpretq_s16_s32(out1));
+      vst1q_s16(data_out + (j + 2) * stride_out + i,
+                vreinterpretq_s16_s32(out2));
+      vst1q_s16(data_out + (j + 3) * stride_out + i,
+                vreinterpretq_s16_s32(out3));
+      vst1q_s16(data_out + (j + 4) * stride_out + i,
+                vreinterpretq_s16_s32(out4));
+      vst1q_s16(data_out + (j + 5) * stride_out + i,
+                vreinterpretq_s16_s32(out5));
+      vst1q_s16(data_out + (j + 6) * stride_out + i,
+                vreinterpretq_s16_s32(out6));
+      vst1q_s16(data_out + (j + 7) * stride_out + i,
+                vreinterpretq_s16_s32(out7));
+    }
+  }
+}
+
+template <size_t N>
+struct FastDCTTag {};  // Empty tag type passed to FastIDCT / FastIDCTIntegerBits.
+
+#include "lib/jxl/fast_dct128-inl.h"
+#include "lib/jxl/fast_dct16-inl.h"
+#include "lib/jxl/fast_dct256-inl.h"
+#include "lib/jxl/fast_dct32-inl.h"
+#include "lib/jxl/fast_dct64-inl.h"
+#include "lib/jxl/fast_dct8-inl.h"
+
+template <size_t ROWS, size_t COLS>
+struct ComputeFastScaledIDCT {
+  // int16_t IDCT writing to `to`; `from` is overwritten as working storage.
+  // scratch_space must be aligned, with space for ROWS*COLS int16_ts.
+  HWY_MAYBE_UNUSED void operator()(int16_t* JXL_RESTRICT from, int16_t* to,
+                                   size_t to_stride,
+                                   int16_t* JXL_RESTRICT scratch_space) {
+    // Reverse the steps done in ComputeScaledDCT.
+    if (ROWS < COLS) {  // This branch performs one extra initial transpose.
+      FastTransposeBlock(from, COLS, ROWS, COLS, scratch_space, ROWS);
+      FastIDCT(FastDCTTag<COLS>(), scratch_space, ROWS, from, ROWS, ROWS);
+      FastTransposeBlock(from, ROWS, COLS, ROWS, scratch_space, COLS);
+      FastIDCT(FastDCTTag<ROWS>(), scratch_space, COLS, to, to_stride, COLS);
+    } else {
+      FastIDCT(FastDCTTag<COLS>(), from, ROWS, scratch_space, ROWS, ROWS);
+      FastTransposeBlock(scratch_space, ROWS, COLS, ROWS, from, COLS);
+      FastIDCT(FastDCTTag<ROWS>(), from, COLS, to, to_stride, COLS);
+    }
+  }
+};
+#endif
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFastIDCT() {
+#if HWY_TARGET == HWY_NEON
+  // Benchmark of the int16_t IDCT: float DCT -> quantize -> repeated IDCT.
+  constexpr size_t kArea = N * M;
+  auto pixels_mem = hwy::AllocateAligned<float>(kArea);
+  float* pixels = pixels_mem.get();
+  auto dct_mem = hwy::AllocateAligned<float>(kArea);
+  float* dct = dct_mem.get();
+  auto quantized_mem = hwy::AllocateAligned<int16_t>(kArea);
+  int16_t* quantized = quantized_mem.get();
+  auto input_mem = hwy::AllocateAligned<int16_t>(kArea);
+  int16_t* input = input_mem.get();
+  auto output_mem = hwy::AllocateAligned<int16_t>(kArea);
+  int16_t* output = output_mem.get();
+
+  auto scratch_f_mem = hwy::AllocateAligned<float>(kArea * 2);
+  float* scratch_f = scratch_f_mem.get();
+  auto scratch_i_mem = hwy::AllocateAligned<int16_t>(kArea * 2);
+  int16_t* scratch_i = scratch_i_mem.get();
+
+  Rng rng(0);
+  for (size_t k = 0; k < kArea; k++) pixels[k] = rng.UniformF(-1, 1);
+  ComputeScaledDCT<M, N>()(DCTFrom(pixels, N), dct, scratch_f);
+  const size_t integer_bits = std::max(FastIDCTIntegerBits(FastDCTTag<N>()),
+                                       FastIDCTIntegerBits(FastDCTTag<M>()));
+  // Enough range for [-2, 2] output values.
+  JXL_ASSERT(integer_bits <= 14);
+  const int fraction_bits = 14 - static_cast<int>(integer_bits);
+  const float scale = 1 << fraction_bits;
+  for (size_t k = 0; k < kArea; k++) {
+    quantized[k] = std::round(dct[k] * scale);
+  }
+
+  // Restore the input before every run because the IDCT overwrites it.
+  const size_t num_runs = 40000000 / kArea;
+  for (size_t run = 0; run < num_runs; run++) {
+    memcpy(input, quantized, sizeof(*quantized) * kArea);
+    ComputeFastScaledIDCT<M, N>()(input, output, N, scratch_i);
+  }
+  float max_error = 0;
+  for (size_t k = 0; k < kArea; k++) {
+    const float err = std::abs(output[k] * (1.0f / scale) - pixels[k]);
+    max_error = std::max(max_error, err);
+  }
+  printf("max error: %f mantissa bits: %d\n", max_error, fraction_bits);
+#endif
+}
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFloatIDCT() {
+  // Benchmark: forward DCT of random pixels, then repeated inverse DCTs.
+  auto pixels_mem = hwy::AllocateAligned<float>(N * M);
+  float* pixels = pixels_mem.get();
+  auto dct_mem = hwy::AllocateAligned<float>(N * M);
+  float* dct = dct_mem.get();
+  auto idct_mem = hwy::AllocateAligned<float>(N * M);
+  float* idct = idct_mem.get();
+  // Working copy of the coefficients. It must be its own buffer, not an alias
+  // of `dct`: `dct` is the pristine source restored by memcpy before each run.
+  auto dct_in_mem = hwy::AllocateAligned<float>(N * M);
+  float* dct_in = dct_in_mem.get();
+
+  auto scratch_space_mem = hwy::AllocateAligned<float>(N * M * 2);
+  float* scratch_space = scratch_space_mem.get();
+
+  Rng rng(0);
+  for (size_t i = 0; i < N * M; i++) {
+    pixels[i] = rng.UniformF(-1, 1);
+  }
+  ComputeScaledDCT<M, N>()(DCTFrom(pixels, N), dct, scratch_space);
+
+  for (size_t j = 0; j < 40000000 / (M * N); j++) {
+    memcpy(dct_in, dct, sizeof(*dct) * N * M);
+    ComputeScaledIDCT<M, N>()(dct_in, DCTTo(idct, N), scratch_space);
+  }
+  float max_error = 0;
+  for (size_t i = 0; i < M * N; i++) {
+    const float err = std::abs(idct[i] - pixels[i]);
+    if (err > max_error) max_error = err;
+  }
+  printf("max error: %e\n", max_error);
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_FAST_DCT_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct.cc b/third_party/jpeg-xl/lib/jxl/fast_dct.cc
new file mode 100644
index 0000000000..d796018fd0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/fast_dct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/fast_dct-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {  // Per-target 32x32 instantiations, exported via HWY_EXPORT below.
+void BenchmarkFloatIDCT32x32() { TestFloatIDCT<32, 32>(); }
+void BenchmarkFastIDCT32x32() { TestFastIDCT<32, 32>(); }
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {  // Public entry points; dispatch to the per-target versions.
+HWY_EXPORT(BenchmarkFloatIDCT32x32);  // needed by HWY_DYNAMIC_DISPATCH
+HWY_EXPORT(BenchmarkFastIDCT32x32);  // needed by HWY_DYNAMIC_DISPATCH
+void BenchmarkFloatIDCT32x32() {  // declared in fast_dct.h
+  HWY_DYNAMIC_DISPATCH(BenchmarkFloatIDCT32x32)();
+}
+void BenchmarkFastIDCT32x32() {  // declared in fast_dct.h
+  HWY_DYNAMIC_DISPATCH(BenchmarkFastIDCT32x32)();
+}
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct.h b/third_party/jpeg-xl/lib/jxl/fast_dct.h
new file mode 100644
index 0000000000..641933d8a0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct.h
@@ -0,0 +1,9 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+namespace jxl {
+void BenchmarkFloatIDCT32x32();  // defined in fast_dct.cc
+void BenchmarkFastIDCT32x32();   // defined in fast_dct.cc
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
new file mode 100644
index 0000000000..1a94d3ee92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
@@ -0,0 +1,2137 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; }  // N = 128
+
+void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride,
+              int16_t* out, size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573);
+    int16x8_t v17 = vaddq_s16(v17_tmp, v16);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v19, v16);
+    int16x8_t v21 = vaddq_s16(v17, v20);
+    int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573);
+    int16x8_t v35 = vaddq_s16(v35_tmp, v34);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vaddq_s16(v35, v39);
+    int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v37, v28);
+    int16x8_t v46 = vaddq_s16(v29, v32);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vaddq_s16(v46, v43);
+    int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573);
+    int16x8_t v50 = vaddq_s16(v50_tmp, v49);
+    int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i);
+    int16x8_t v52 = vaddq_s16(v51, v36);
+    int16x8_t v53 = vaddq_s16(v52, v45);
+    int16x8_t v54 = vaddq_s16(v53, v49);
+    int16x8_t v55 = vaddq_s16(v50, v54);
+    int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734);
+    int16x8_t v57 = vaddq_s16(v48, v56);
+    int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+    int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+    int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i);
+    int16x8_t v66 = vaddq_s16(v64, v65);
+    int16x8_t v67 = vaddq_s16(v63, v66);
+    int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i);
+    int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v70 = vaddq_s16(v68, v69);
+    int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573);
+    int16x8_t v71 = vaddq_s16(v71_tmp, v70);
+    int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i);
+    int16x8_t v74 = vaddq_s16(v72, v73);
+    int16x8_t v75 = vaddq_s16(v74, v70);
+    int16x8_t v76 = vaddq_s16(v71, v75);
+    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+    int16x8_t v78 = vaddq_s16(v67, v77);
+    int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v81 = vaddq_s16(v79, v80);
+    int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+    int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+    int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i);
+    int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i);
+    int16x8_t v85 = vaddq_s16(v83, v84);
+    int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i);
+    int16x8_t v88 = vaddq_s16(v86, v87);
+    int16x8_t v89 = vaddq_s16(v85, v88);
+    int16x8_t v90 = vaddq_s16(v82, v89);
+    int16x8_t v91 = vaddq_s16(v88, v81);
+    int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573);
+    int16x8_t v92 = vaddq_s16(v92_tmp, v91);
+    int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i);
+    int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i);
+    int16x8_t v95 = vaddq_s16(v93, v94);
+    int16x8_t v96 = vaddq_s16(v95, v85);
+    int16x8_t v97 = vaddq_s16(v96, v91);
+    int16x8_t v98 = vaddq_s16(v92, v97);
+    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+    int16x8_t v100 = vaddq_s16(v90, v99);
+    int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705);
+    int16x8_t v102 = vaddq_s16(v78, v101);
+    int16x8_t v103 = vaddq_s16(v80, v62);
+    int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573);
+    int16x8_t v104 = vaddq_s16(v104_tmp, v103);
+    int16x8_t v105 = vaddq_s16(v84, v64);
+    int16x8_t v106 = vaddq_s16(v65, v86);
+    int16x8_t v107 = vaddq_s16(v105, v106);
+    int16x8_t v108 = vaddq_s16(v104, v107);
+    int16x8_t v109 = vaddq_s16(v87, v68);
+    int16x8_t v110 = vaddq_s16(v69, v79);
+    int16x8_t v111 = vaddq_s16(v109, v110);
+    int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573);
+    int16x8_t v112 = vaddq_s16(v112_tmp, v111);
+    int16x8_t v113 = vaddq_s16(v94, v72);
+    int16x8_t v114 = vaddq_s16(v73, v83);
+    int16x8_t v115 = vaddq_s16(v113, v114);
+    int16x8_t v116 = vaddq_s16(v115, v111);
+    int16x8_t v117 = vaddq_s16(v112, v116);
+    int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734);
+    int16x8_t v119 = vaddq_s16(v108, v118);
+    int16x8_t v120 = vaddq_s16(v110, v103);
+    int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573);
+    int16x8_t v121 = vaddq_s16(v121_tmp, v120);
+    int16x8_t v122 = vaddq_s16(v114, v105);
+    int16x8_t v123 = vaddq_s16(v106, v109);
+    int16x8_t v124 = vaddq_s16(v122, v123);
+    int16x8_t v125 = vaddq_s16(v121, v124);
+    int16x8_t v126 = vaddq_s16(v123, v120);
+    int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573);
+    int16x8_t v127 = vaddq_s16(v127_tmp, v126);
+    int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i);
+    int16x8_t v129 = vaddq_s16(v128, v93);
+    int16x8_t v130 = vaddq_s16(v129, v113);
+    int16x8_t v131 = vaddq_s16(v130, v122);
+    int16x8_t v132 = vaddq_s16(v131, v126);
+    int16x8_t v133 = vaddq_s16(v127, v132);
+    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+    int16x8_t v135 = vaddq_s16(v125, v134);
+    int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705);
+    int16x8_t v137 = vaddq_s16(v119, v136);
+    int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463);
+    int16x8_t v139 = vaddq_s16(v102, v138);
+    int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404);
+    int16x8_t v141 = vaddq_s16(v61, v140);
+    int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573);
+    int16x8_t v143 = vaddq_s16(v143_tmp, v142);
+    int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i);
+    int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i);
+    int16x8_t v146 = vaddq_s16(v144, v145);
+    int16x8_t v147 = vaddq_s16(v143, v146);
+    int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i);
+    int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v150 = vaddq_s16(v148, v149);
+    int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573);
+    int16x8_t v151 = vaddq_s16(v151_tmp, v150);
+    int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i);
+    int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i);
+    int16x8_t v154 = vaddq_s16(v152, v153);
+    int16x8_t v155 = vaddq_s16(v154, v150);
+    int16x8_t v156 = vaddq_s16(v151, v155);
+    int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734);
+    int16x8_t v158 = vaddq_s16(v147, v157);
+    int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v161 = vaddq_s16(v159, v160);
+    int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573);
+    int16x8_t v162 = vaddq_s16(v162_tmp, v161);
+    int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i);
+    int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i);
+    int16x8_t v165 = vaddq_s16(v163, v164);
+    int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i);
+    int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i);
+    int16x8_t v168 = vaddq_s16(v166, v167);
+    int16x8_t v169 = vaddq_s16(v165, v168);
+    int16x8_t v170 = vaddq_s16(v162, v169);
+    int16x8_t v171 = vaddq_s16(v168, v161);
+    int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573);
+    int16x8_t v172 = vaddq_s16(v172_tmp, v171);
+    int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i);
+    int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i);
+    int16x8_t v175 = vaddq_s16(v173, v174);
+    int16x8_t v176 = vaddq_s16(v175, v165);
+    int16x8_t v177 = vaddq_s16(v176, v171);
+    int16x8_t v178 = vaddq_s16(v172, v177);
+    int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734);
+    int16x8_t v180 = vaddq_s16(v170, v179);
+    int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705);
+    int16x8_t v182 = vaddq_s16(v158, v181);
+    int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v185 = vaddq_s16(v183, v184);
+    int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573);
+    int16x8_t v186 = vaddq_s16(v186_tmp, v185);
+    int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i);
+    int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i);
+    int16x8_t v189 = vaddq_s16(v187, v188);
+    int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i);
+    int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i);
+    int16x8_t v192 = vaddq_s16(v190, v191);
+    int16x8_t v193 = vaddq_s16(v189, v192);
+    int16x8_t v194 = vaddq_s16(v186, v193);
+    int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i);
+    int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i);
+    int16x8_t v197 = vaddq_s16(v195, v196);
+    int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v200 = vaddq_s16(v198, v199);
+    int16x8_t v201 = vaddq_s16(v197, v200);
+    int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573);
+    int16x8_t v202 = vaddq_s16(v202_tmp, v201);
+    int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i);
+    int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i);
+    int16x8_t v205 = vaddq_s16(v203, v204);
+    int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i);
+    int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i);
+    int16x8_t v208 = vaddq_s16(v206, v207);
+    int16x8_t v209 = vaddq_s16(v205, v208);
+    int16x8_t v210 = vaddq_s16(v209, v201);
+    int16x8_t v211 = vaddq_s16(v202, v210);
+    int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734);
+    int16x8_t v213 = vaddq_s16(v194, v212);
+    int16x8_t v214 = vaddq_s16(v200, v185);
+    int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573);
+    int16x8_t v215 = vaddq_s16(v215_tmp, v214);
+    int16x8_t v216 = vaddq_s16(v208, v189);
+    int16x8_t v217 = vaddq_s16(v192, v197);
+    int16x8_t v218 = vaddq_s16(v216, v217);
+    int16x8_t v219 = vaddq_s16(v215, v218);
+    int16x8_t v220 = vaddq_s16(v217, v214);
+    int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573);
+    int16x8_t v221 = vaddq_s16(v221_tmp, v220);
+    int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i);
+    int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i);
+    int16x8_t v224 = vaddq_s16(v222, v223);
+    int16x8_t v225 = vaddq_s16(v224, v205);
+    int16x8_t v226 = vaddq_s16(v225, v216);
+    int16x8_t v227 = vaddq_s16(v226, v220);
+    int16x8_t v228 = vaddq_s16(v221, v227);
+    int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734);
+    int16x8_t v230 = vaddq_s16(v219, v229);
+    int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705);
+    int16x8_t v232 = vaddq_s16(v213, v231);
+    int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463);
+    int16x8_t v234 = vaddq_s16(v182, v233);
+    int16x8_t v235 = vaddq_s16(v184, v142);
+    int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573);
+    int16x8_t v236 = vaddq_s16(v236_tmp, v235);
+    int16x8_t v237 = vaddq_s16(v188, v144);
+    int16x8_t v238 = vaddq_s16(v145, v190);
+    int16x8_t v239 = vaddq_s16(v237, v238);
+    int16x8_t v240 = vaddq_s16(v236, v239);
+    int16x8_t v241 = vaddq_s16(v196, v148);
+    int16x8_t v242 = vaddq_s16(v149, v198);
+    int16x8_t v243 = vaddq_s16(v241, v242);
+    int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573);
+    int16x8_t v244 = vaddq_s16(v244_tmp, v243);
+    int16x8_t v245 = vaddq_s16(v204, v152);
+    int16x8_t v246 = vaddq_s16(v153, v206);
+    int16x8_t v247 = vaddq_s16(v245, v246);
+    int16x8_t v248 = vaddq_s16(v247, v243);
+    int16x8_t v249 = vaddq_s16(v244, v248);
+    int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734);
+    int16x8_t v251 = vaddq_s16(v240, v250);
+    int16x8_t v252 = vaddq_s16(v199, v159);
+    int16x8_t v253 = vaddq_s16(v160, v183);
+    int16x8_t v254 = vaddq_s16(v252, v253);
+    int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573);
+    int16x8_t v255 = vaddq_s16(v255_tmp, v254);
+    int16x8_t v256 = vaddq_s16(v207, v163);
+    int16x8_t v257 = vaddq_s16(v164, v187);
+    int16x8_t v258 = vaddq_s16(v256, v257);
+    int16x8_t v259 = vaddq_s16(v191, v166);
+    int16x8_t v260 = vaddq_s16(v167, v195);
+    int16x8_t v261 = vaddq_s16(v259, v260);
+    int16x8_t v262 = vaddq_s16(v258, v261);
+    int16x8_t v263 = vaddq_s16(v255, v262);
+    int16x8_t v264 = vaddq_s16(v261, v254);
+    int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573);
+    int16x8_t v265 = vaddq_s16(v265_tmp, v264);
+    int16x8_t v266 = vaddq_s16(v223, v173);
+    int16x8_t v267 = vaddq_s16(v174, v203);
+    int16x8_t v268 = vaddq_s16(v266, v267);
+    int16x8_t v269 = vaddq_s16(v268, v258);
+    int16x8_t v270 = vaddq_s16(v269, v264);
+    int16x8_t v271 = vaddq_s16(v265, v270);
+    int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734);
+    int16x8_t v273 = vaddq_s16(v263, v272);
+    int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705);
+    int16x8_t v275 = vaddq_s16(v251, v274);
+    int16x8_t v276 = vaddq_s16(v253, v235);
+    int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573);
+    int16x8_t v277 = vaddq_s16(v277_tmp, v276);
+    int16x8_t v278 = vaddq_s16(v257, v237);
+    int16x8_t v279 = vaddq_s16(v238, v259);
+    int16x8_t v280 = vaddq_s16(v278, v279);
+    int16x8_t v281 = vaddq_s16(v277, v280);
+    int16x8_t v282 = vaddq_s16(v260, v241);
+    int16x8_t v283 = vaddq_s16(v242, v252);
+    int16x8_t v284 = vaddq_s16(v282, v283);
+    int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573);
+    int16x8_t v285 = vaddq_s16(v285_tmp, v284);
+    int16x8_t v286 = vaddq_s16(v267, v245);
+    int16x8_t v287 = vaddq_s16(v246, v256);
+    int16x8_t v288 = vaddq_s16(v286, v287);
+    int16x8_t v289 = vaddq_s16(v288, v284);
+    int16x8_t v290 = vaddq_s16(v285, v289);
+    int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734);
+    int16x8_t v292 = vaddq_s16(v281, v291);
+    int16x8_t v293 = vaddq_s16(v283, v276);
+    int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573);
+    int16x8_t v294 = vaddq_s16(v294_tmp, v293);
+    int16x8_t v295 = vaddq_s16(v287, v278);
+    int16x8_t v296 = vaddq_s16(v279, v282);
+    int16x8_t v297 = vaddq_s16(v295, v296);
+    int16x8_t v298 = vaddq_s16(v294, v297);
+    int16x8_t v299 = vaddq_s16(v296, v293);
+    int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573);
+    int16x8_t v300 = vaddq_s16(v300_tmp, v299);
+    int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i);
+    int16x8_t v302 = vaddq_s16(v301, v222);
+    int16x8_t v303 = vaddq_s16(v302, v266);
+    int16x8_t v304 = vaddq_s16(v303, v286);
+    int16x8_t v305 = vaddq_s16(v304, v295);
+    int16x8_t v306 = vaddq_s16(v305, v299);
+    int16x8_t v307 = vaddq_s16(v300, v306);
+    int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734);
+    int16x8_t v309 = vaddq_s16(v298, v308);
+    int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705);
+    int16x8_t v311 = vaddq_s16(v292, v310);
+    int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463);
+    int16x8_t v313 = vaddq_s16(v275, v312);
+    int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404);
+    int16x8_t v315 = vaddq_s16(v234, v314);
+    int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389);
+    int16x8_t v317 = vaddq_s16(v141, v316);
+    int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573);
+    int16x8_t v319 = vaddq_s16(v319_tmp, v318);
+    int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i);
+    int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i);
+    int16x8_t v322 = vaddq_s16(v320, v321);
+    int16x8_t v323 = vaddq_s16(v319, v322);
+    int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i);
+    int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v326 = vaddq_s16(v324, v325);
+    int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573);
+    int16x8_t v327 = vaddq_s16(v327_tmp, v326);
+    int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i);
+    int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i);
+    int16x8_t v330 = vaddq_s16(v328, v329);
+    int16x8_t v331 = vaddq_s16(v330, v326);
+    int16x8_t v332 = vaddq_s16(v327, v331);
+    int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734);
+    int16x8_t v334 = vaddq_s16(v323, v333);
+    int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v337 = vaddq_s16(v335, v336);
+    int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573);
+    int16x8_t v338 = vaddq_s16(v338_tmp, v337);
+    int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i);
+    int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i);
+    int16x8_t v341 = vaddq_s16(v339, v340);
+    int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i);
+    int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i);
+    int16x8_t v344 = vaddq_s16(v342, v343);
+    int16x8_t v345 = vaddq_s16(v341, v344);
+    int16x8_t v346 = vaddq_s16(v338, v345);
+    int16x8_t v347 = vaddq_s16(v344, v337);
+    int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573);
+    int16x8_t v348 = vaddq_s16(v348_tmp, v347);
+    int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i);
+    int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i);
+    int16x8_t v351 = vaddq_s16(v349, v350);
+    int16x8_t v352 = vaddq_s16(v351, v341);
+    int16x8_t v353 = vaddq_s16(v352, v347);
+    int16x8_t v354 = vaddq_s16(v348, v353);
+    int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734);
+    int16x8_t v356 = vaddq_s16(v346, v355);
+    int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705);
+    int16x8_t v358 = vaddq_s16(v334, v357);
+    int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v361 = vaddq_s16(v359, v360);
+    int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573);
+    int16x8_t v362 = vaddq_s16(v362_tmp, v361);
+    int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i);
+    int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i);
+    int16x8_t v365 = vaddq_s16(v363, v364);
+    int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i);
+    int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i);
+    int16x8_t v368 = vaddq_s16(v366, v367);
+    int16x8_t v369 = vaddq_s16(v365, v368);
+    int16x8_t v370 = vaddq_s16(v362, v369);
+    int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i);
+    int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i);
+    int16x8_t v373 = vaddq_s16(v371, v372);
+    int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v376 = vaddq_s16(v374, v375);
+    int16x8_t v377 = vaddq_s16(v373, v376);
+    int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573);
+    int16x8_t v378 = vaddq_s16(v378_tmp, v377);
+    int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i);
+    int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i);
+    int16x8_t v381 = vaddq_s16(v379, v380);
+    int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i);
+    int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i);
+    int16x8_t v384 = vaddq_s16(v382, v383);
+    int16x8_t v385 = vaddq_s16(v381, v384);
+    int16x8_t v386 = vaddq_s16(v385, v377);
+    int16x8_t v387 = vaddq_s16(v378, v386);
+    int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734);
+    int16x8_t v389 = vaddq_s16(v370, v388);
+    int16x8_t v390 = vaddq_s16(v376, v361);
+    int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573);
+    int16x8_t v391 = vaddq_s16(v391_tmp, v390);
+    int16x8_t v392 = vaddq_s16(v384, v365);
+    int16x8_t v393 = vaddq_s16(v368, v373);
+    int16x8_t v394 = vaddq_s16(v392, v393);
+    int16x8_t v395 = vaddq_s16(v391, v394);
+    int16x8_t v396 = vaddq_s16(v393, v390);
+    int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573);
+    int16x8_t v397 = vaddq_s16(v397_tmp, v396);
+    int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i);
+    int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i);
+    int16x8_t v400 = vaddq_s16(v398, v399);
+    int16x8_t v401 = vaddq_s16(v400, v381);
+    int16x8_t v402 = vaddq_s16(v401, v392);
+    int16x8_t v403 = vaddq_s16(v402, v396);
+    int16x8_t v404 = vaddq_s16(v397, v403);
+    int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734);
+    int16x8_t v406 = vaddq_s16(v395, v405);
+    int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705);
+    int16x8_t v408 = vaddq_s16(v389, v407);
+    int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463);
+    int16x8_t v410 = vaddq_s16(v358, v409);
+    int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v413 = vaddq_s16(v411, v412);
+    int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573);
+    int16x8_t v414 = vaddq_s16(v414_tmp, v413);
+    int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i);
+    int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i);
+    int16x8_t v417 = vaddq_s16(v415, v416);
+    int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i);
+    int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i);
+    int16x8_t v420 = vaddq_s16(v418, v419);
+    int16x8_t v421 = vaddq_s16(v417, v420);
+    int16x8_t v422 = vaddq_s16(v414, v421);
+    int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i);
+    int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i);
+    int16x8_t v425 = vaddq_s16(v423, v424);
+    int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v428 = vaddq_s16(v426, v427);
+    int16x8_t v429 = vaddq_s16(v425, v428);
+    int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573);
+    int16x8_t v430 = vaddq_s16(v430_tmp, v429);
+    int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i);
+    int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i);
+    int16x8_t v433 = vaddq_s16(v431, v432);
+    int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i);
+    int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i);
+    int16x8_t v436 = vaddq_s16(v434, v435);
+    int16x8_t v437 = vaddq_s16(v433, v436);
+    int16x8_t v438 = vaddq_s16(v437, v429);
+    int16x8_t v439 = vaddq_s16(v430, v438);
+    int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734);
+    int16x8_t v441 = vaddq_s16(v422, v440);
+    int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v444 = vaddq_s16(v442, v443);
+    int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v447 = vaddq_s16(v445, v446);
+    int16x8_t v448 = vaddq_s16(v444, v447);
+    int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573);
+    int16x8_t v449 = vaddq_s16(v449_tmp, v448);
+    int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i);
+    int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i);
+    int16x8_t v452 = vaddq_s16(v450, v451);
+    int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i);
+    int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i);
+    int16x8_t v455 = vaddq_s16(v453, v454);
+    int16x8_t v456 = vaddq_s16(v452, v455);
+    int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i);
+    int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i);
+    int16x8_t v459 = vaddq_s16(v457, v458);
+    int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i);
+    int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i);
+    int16x8_t v462 = vaddq_s16(v460, v461);
+    int16x8_t v463 = vaddq_s16(v459, v462);
+    int16x8_t v464 = vaddq_s16(v456, v463);
+    int16x8_t v465 = vaddq_s16(v449, v464);
+    int16x8_t v466 = vaddq_s16(v463, v448);
+    int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573);
+    int16x8_t v467 = vaddq_s16(v467_tmp, v466);
+    int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i);
+    int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i);
+    int16x8_t v470 = vaddq_s16(v468, v469);
+    int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i);
+    int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i);
+    int16x8_t v473 = vaddq_s16(v471, v472);
+    int16x8_t v474 = vaddq_s16(v470, v473);
+    int16x8_t v475 = vaddq_s16(v474, v456);
+    int16x8_t v476 = vaddq_s16(v475, v466);
+    int16x8_t v477 = vaddq_s16(v467, v476);
+    int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734);
+    int16x8_t v479 = vaddq_s16(v465, v478);
+    int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705);
+    int16x8_t v481 = vaddq_s16(v441, v480);
+    int16x8_t v482 = vaddq_s16(v447, v413);
+    int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573);
+    int16x8_t v483 = vaddq_s16(v483_tmp, v482);
+    int16x8_t v484 = vaddq_s16(v455, v417);
+    int16x8_t v485 = vaddq_s16(v420, v459);
+    int16x8_t v486 = vaddq_s16(v484, v485);
+    int16x8_t v487 = vaddq_s16(v483, v486);
+    int16x8_t v488 = vaddq_s16(v462, v425);
+    int16x8_t v489 = vaddq_s16(v428, v444);
+    int16x8_t v490 = vaddq_s16(v488, v489);
+    int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573);
+    int16x8_t v491 = vaddq_s16(v491_tmp, v490);
+    int16x8_t v492 = vaddq_s16(v473, v433);
+    int16x8_t v493 = vaddq_s16(v436, v452);
+    int16x8_t v494 = vaddq_s16(v492, v493);
+    int16x8_t v495 = vaddq_s16(v494, v490);
+    int16x8_t v496 = vaddq_s16(v491, v495);
+    int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734);
+    int16x8_t v498 = vaddq_s16(v487, v497);
+    int16x8_t v499 = vaddq_s16(v489, v482);
+    int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573);
+    int16x8_t v500 = vaddq_s16(v500_tmp, v499);
+    int16x8_t v501 = vaddq_s16(v493, v484);
+    int16x8_t v502 = vaddq_s16(v485, v488);
+    int16x8_t v503 = vaddq_s16(v501, v502);
+    int16x8_t v504 = vaddq_s16(v500, v503);
+    int16x8_t v505 = vaddq_s16(v502, v499);
+    int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573);
+    int16x8_t v506 = vaddq_s16(v506_tmp, v505);
+    int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i);
+    int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i);
+    int16x8_t v509 = vaddq_s16(v507, v508);
+    int16x8_t v510 = vaddq_s16(v509, v470);
+    int16x8_t v511 = vaddq_s16(v510, v492);
+    int16x8_t v512 = vaddq_s16(v511, v501);
+    int16x8_t v513 = vaddq_s16(v512, v505);
+    int16x8_t v514 = vaddq_s16(v506, v513);
+    int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734);
+    int16x8_t v516 = vaddq_s16(v504, v515);
+    int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705);
+    int16x8_t v518 = vaddq_s16(v498, v517);
+    int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463);
+    int16x8_t v520 = vaddq_s16(v481, v519);
+    int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404);
+    int16x8_t v522 = vaddq_s16(v410, v521);
+    int16x8_t v523 = vaddq_s16(v412, v318);
+    int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573);
+    int16x8_t v524 = vaddq_s16(v524_tmp, v523);
+    int16x8_t v525 = vaddq_s16(v416, v320);
+    int16x8_t v526 = vaddq_s16(v321, v418);
+    int16x8_t v527 = vaddq_s16(v525, v526);
+    int16x8_t v528 = vaddq_s16(v524, v527);
+    int16x8_t v529 = vaddq_s16(v424, v324);
+    int16x8_t v530 = vaddq_s16(v325, v426);
+    int16x8_t v531 = vaddq_s16(v529, v530);
+    int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573);
+    int16x8_t v532 = vaddq_s16(v532_tmp, v531);
+    int16x8_t v533 = vaddq_s16(v432, v328);
+    int16x8_t v534 = vaddq_s16(v329, v434);
+    int16x8_t v535 = vaddq_s16(v533, v534);
+    int16x8_t v536 = vaddq_s16(v535, v531);
+    int16x8_t v537 = vaddq_s16(v532, v536);
+    int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734);
+    int16x8_t v539 = vaddq_s16(v528, v538);
+    int16x8_t v540 = vaddq_s16(v443, v335);
+    int16x8_t v541 = vaddq_s16(v336, v445);
+    int16x8_t v542 = vaddq_s16(v540, v541);
+    int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573);
+    int16x8_t v543 = vaddq_s16(v543_tmp, v542);
+    int16x8_t v544 = vaddq_s16(v451, v339);
+    int16x8_t v545 = vaddq_s16(v340, v453);
+    int16x8_t v546 = vaddq_s16(v544, v545);
+    int16x8_t v547 = vaddq_s16(v458, v342);
+    int16x8_t v548 = vaddq_s16(v343, v460);
+    int16x8_t v549 = vaddq_s16(v547, v548);
+    int16x8_t v550 = vaddq_s16(v546, v549);
+    int16x8_t v551 = vaddq_s16(v543, v550);
+    int16x8_t v552 = vaddq_s16(v549, v542);
+    int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573);
+    int16x8_t v553 = vaddq_s16(v553_tmp, v552);
+    int16x8_t v554 = vaddq_s16(v469, v349);
+    int16x8_t v555 = vaddq_s16(v350, v471);
+    int16x8_t v556 = vaddq_s16(v554, v555);
+    int16x8_t v557 = vaddq_s16(v556, v546);
+    int16x8_t v558 = vaddq_s16(v557, v552);
+    int16x8_t v559 = vaddq_s16(v553, v558);
+    int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734);
+    int16x8_t v561 = vaddq_s16(v551, v560);
+    int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705);
+    int16x8_t v563 = vaddq_s16(v539, v562);
+    int16x8_t v564 = vaddq_s16(v446, v359);
+    int16x8_t v565 = vaddq_s16(v360, v411);
+    int16x8_t v566 = vaddq_s16(v564, v565);
+    int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573);
+    int16x8_t v567 = vaddq_s16(v567_tmp, v566);
+    int16x8_t v568 = vaddq_s16(v454, v363);
+    int16x8_t v569 = vaddq_s16(v364, v415);
+    int16x8_t v570 = vaddq_s16(v568, v569);
+    int16x8_t v571 = vaddq_s16(v419, v366);
+    int16x8_t v572 = vaddq_s16(v367, v457);
+    int16x8_t v573 = vaddq_s16(v571, v572);
+    int16x8_t v574 = vaddq_s16(v570, v573);
+    int16x8_t v575 = vaddq_s16(v567, v574);
+    int16x8_t v576 = vaddq_s16(v461, v371);
+    int16x8_t v577 = vaddq_s16(v372, v423);
+    int16x8_t v578 = vaddq_s16(v576, v577);
+    int16x8_t v579 = vaddq_s16(v427, v374);
+    int16x8_t v580 = vaddq_s16(v375, v442);
+    int16x8_t v581 = vaddq_s16(v579, v580);
+    int16x8_t v582 = vaddq_s16(v578, v581);
+    int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573);
+    int16x8_t v583 = vaddq_s16(v583_tmp, v582);
+    int16x8_t v584 = vaddq_s16(v472, v379);
+    int16x8_t v585 = vaddq_s16(v380, v431);
+    int16x8_t v586 = vaddq_s16(v584, v585);
+    int16x8_t v587 = vaddq_s16(v435, v382);
+    int16x8_t v588 = vaddq_s16(v383, v450);
+    int16x8_t v589 = vaddq_s16(v587, v588);
+    int16x8_t v590 = vaddq_s16(v586, v589);
+    int16x8_t v591 = vaddq_s16(v590, v582);
+    int16x8_t v592 = vaddq_s16(v583, v591);
+    int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734);
+    int16x8_t v594 = vaddq_s16(v575, v593);
+    int16x8_t v595 = vaddq_s16(v581, v566);
+    int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573);
+    int16x8_t v596 = vaddq_s16(v596_tmp, v595);
+    int16x8_t v597 = vaddq_s16(v589, v570);
+    int16x8_t v598 = vaddq_s16(v573, v578);
+    int16x8_t v599 = vaddq_s16(v597, v598);
+    int16x8_t v600 = vaddq_s16(v596, v599);
+    int16x8_t v601 = vaddq_s16(v598, v595);
+    int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573);
+    int16x8_t v602 = vaddq_s16(v602_tmp, v601);
+    int16x8_t v603 = vaddq_s16(v508, v398);
+    int16x8_t v604 = vaddq_s16(v399, v468);
+    int16x8_t v605 = vaddq_s16(v603, v604);
+    int16x8_t v606 = vaddq_s16(v605, v586);
+    int16x8_t v607 = vaddq_s16(v606, v597);
+    int16x8_t v608 = vaddq_s16(v607, v601);
+    int16x8_t v609 = vaddq_s16(v602, v608);
+    int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734);
+    int16x8_t v611 = vaddq_s16(v600, v610);
+    int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705);
+    int16x8_t v613 = vaddq_s16(v594, v612);
+    int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463);
+    int16x8_t v615 = vaddq_s16(v563, v614);
+    int16x8_t v616 = vaddq_s16(v565, v523);
+    int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573);
+    int16x8_t v617 = vaddq_s16(v617_tmp, v616);
+    int16x8_t v618 = vaddq_s16(v569, v525);
+    int16x8_t v619 = vaddq_s16(v526, v571);
+    int16x8_t v620 = vaddq_s16(v618, v619);
+    int16x8_t v621 = vaddq_s16(v617, v620);
+    int16x8_t v622 = vaddq_s16(v577, v529);
+    int16x8_t v623 = vaddq_s16(v530, v579);
+    int16x8_t v624 = vaddq_s16(v622, v623);
+    int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573);
+    int16x8_t v625 = vaddq_s16(v625_tmp, v624);
+    int16x8_t v626 = vaddq_s16(v585, v533);
+    int16x8_t v627 = vaddq_s16(v534, v587);
+    int16x8_t v628 = vaddq_s16(v626, v627);
+    int16x8_t v629 = vaddq_s16(v628, v624);
+    int16x8_t v630 = vaddq_s16(v625, v629);
+    int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734);
+    int16x8_t v632 = vaddq_s16(v621, v631);
+    int16x8_t v633 = vaddq_s16(v580, v540);
+    int16x8_t v634 = vaddq_s16(v541, v564);
+    int16x8_t v635 = vaddq_s16(v633, v634);
+    int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573);
+    int16x8_t v636 = vaddq_s16(v636_tmp, v635);
+    int16x8_t v637 = vaddq_s16(v588, v544);
+    int16x8_t v638 = vaddq_s16(v545, v568);
+    int16x8_t v639 = vaddq_s16(v637, v638);
+    int16x8_t v640 = vaddq_s16(v572, v547);
+    int16x8_t v641 = vaddq_s16(v548, v576);
+    int16x8_t v642 = vaddq_s16(v640, v641);
+    int16x8_t v643 = vaddq_s16(v639, v642);
+    int16x8_t v644 = vaddq_s16(v636, v643);
+    int16x8_t v645 = vaddq_s16(v642, v635);
+    int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573);
+    int16x8_t v646 = vaddq_s16(v646_tmp, v645);
+    int16x8_t v647 = vaddq_s16(v604, v554);
+    int16x8_t v648 = vaddq_s16(v555, v584);
+    int16x8_t v649 = vaddq_s16(v647, v648);
+    int16x8_t v650 = vaddq_s16(v649, v639);
+    int16x8_t v651 = vaddq_s16(v650, v645);
+    int16x8_t v652 = vaddq_s16(v646, v651);
+    int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734);
+    int16x8_t v654 = vaddq_s16(v644, v653);
+    int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705);
+    int16x8_t v656 = vaddq_s16(v632, v655);
+    int16x8_t v657 = vaddq_s16(v634, v616);
+    int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573);
+    int16x8_t v658 = vaddq_s16(v658_tmp, v657);
+    int16x8_t v659 = vaddq_s16(v638, v618);
+    int16x8_t v660 = vaddq_s16(v619, v640);
+    int16x8_t v661 = vaddq_s16(v659, v660);
+    int16x8_t v662 = vaddq_s16(v658, v661);
+    int16x8_t v663 = vaddq_s16(v641, v622);
+    int16x8_t v664 = vaddq_s16(v623, v633);
+    int16x8_t v665 = vaddq_s16(v663, v664);
+    int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573);
+    int16x8_t v666 = vaddq_s16(v666_tmp, v665);
+    int16x8_t v667 = vaddq_s16(v648, v626);
+    int16x8_t v668 = vaddq_s16(v627, v637);
+    int16x8_t v669 = vaddq_s16(v667, v668);
+    int16x8_t v670 = vaddq_s16(v669, v665);
+    int16x8_t v671 = vaddq_s16(v666, v670);
+    int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734);
+    int16x8_t v673 = vaddq_s16(v662, v672);
+    int16x8_t v674 = vaddq_s16(v664, v657);
+    int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573);
+    int16x8_t v675 = vaddq_s16(v675_tmp, v674);
+    int16x8_t v676 = vaddq_s16(v668, v659);
+    int16x8_t v677 = vaddq_s16(v660, v663);
+    int16x8_t v678 = vaddq_s16(v676, v677);
+    int16x8_t v679 = vaddq_s16(v675, v678);
+    int16x8_t v680 = vaddq_s16(v677, v674);
+    int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573);
+    int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+    int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i);
+    int16x8_t v683 = vaddq_s16(v682, v507);
+    int16x8_t v684 = vaddq_s16(v683, v603);
+    int16x8_t v685 = vaddq_s16(v684, v647);
+    int16x8_t v686 = vaddq_s16(v685, v667);
+    int16x8_t v687 = vaddq_s16(v686, v676);
+    int16x8_t v688 = vaddq_s16(v687, v680);
+    int16x8_t v689 = vaddq_s16(v681, v688);
+    int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734);
+    int16x8_t v691 = vaddq_s16(v679, v690);
+    int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705);
+    int16x8_t v693 = vaddq_s16(v673, v692);
+    int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463);
+    int16x8_t v695 = vaddq_s16(v656, v694);
+    int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404);
+    int16x8_t v697 = vaddq_s16(v615, v696);
+    int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389);
+    int16x8_t v699 = vaddq_s16(v522, v698);
+    int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385);
+    int16x8_t v701 = vaddq_s16(v317, v700);
+    int16x8_t v702 = vsubq_s16(v0, v1);
+    int16x8_t v703 = vsubq_s16(v4, v6);
+    int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045);
+    int16x8_t v704 = vaddq_s16(v704_tmp, v703);
+    int16x8_t v705 = vaddq_s16(v702, v704);
+    int16x8_t v706 = vsubq_s16(v11, v14);
+    int16x8_t v707 = vsubq_s16(v17, v20);
+    int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045);
+    int16x8_t v708 = vaddq_s16(v708_tmp, v707);
+    int16x8_t v709 = vaddq_s16(v706, v708);
+    int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705);
+    int16x8_t v711 = vaddq_s16(v705, v710);
+    int16x8_t v712 = vsubq_s16(v27, v30);
+    int16x8_t v713 = vsubq_s16(v35, v39);
+    int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045);
+    int16x8_t v714 = vaddq_s16(v714_tmp, v713);
+    int16x8_t v715 = vaddq_s16(v712, v714);
+    int16x8_t v716 = vsubq_s16(v44, v47);
+    int16x8_t v717 = vsubq_s16(v50, v54);
+    int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045);
+    int16x8_t v718 = vaddq_s16(v718_tmp, v717);
+    int16x8_t v719 = vaddq_s16(v716, v718);
+    int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705);
+    int16x8_t v721 = vaddq_s16(v715, v720);
+    int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121);
+    int16x8_t v723 = vaddq_s16(v711, v722);
+    int16x8_t v724 = vsubq_s16(v63, v66);
+    int16x8_t v725 = vsubq_s16(v71, v75);
+    int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045);
+    int16x8_t v726 = vaddq_s16(v726_tmp, v725);
+    int16x8_t v727 = vaddq_s16(v724, v726);
+    int16x8_t v728 = vsubq_s16(v82, v89);
+    int16x8_t v729 = vsubq_s16(v92, v97);
+    int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045);
+    int16x8_t v730 = vaddq_s16(v730_tmp, v729);
+    int16x8_t v731 = vaddq_s16(v728, v730);
+    int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705);
+    int16x8_t v733 = vaddq_s16(v727, v732);
+    int16x8_t v734 = vsubq_s16(v104, v107);
+    int16x8_t v735 = vsubq_s16(v112, v116);
+    int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045);
+    int16x8_t v736 = vaddq_s16(v736_tmp, v735);
+    int16x8_t v737 = vaddq_s16(v734, v736);
+    int16x8_t v738 = vsubq_s16(v121, v124);
+    int16x8_t v739 = vsubq_s16(v127, v132);
+    int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045);
+    int16x8_t v740 = vaddq_s16(v740_tmp, v739);
+    int16x8_t v741 = vaddq_s16(v738, v740);
+    int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705);
+    int16x8_t v743 = vaddq_s16(v737, v742);
+    int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121);
+    int16x8_t v745 = vaddq_s16(v733, v744);
+    int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563);
+    int16x8_t v747 = vaddq_s16(v723, v746);
+    int16x8_t v748 = vsubq_s16(v143, v146);
+    int16x8_t v749 = vsubq_s16(v151, v155);
+    int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045);
+    int16x8_t v750 = vaddq_s16(v750_tmp, v749);
+    int16x8_t v751 = vaddq_s16(v748, v750);
+    int16x8_t v752 = vsubq_s16(v162, v169);
+    int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705);
+    int16x8_t v754 = vsubq_s16(v172, v177);
+    int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746);
+    int16x8_t v756 = vaddq_s16(v753, v755);
+    int16x8_t v757 = vaddq_s16(v751, v756);
+    int16x8_t v758 = vsubq_s16(v186, v193);
+    int16x8_t v759 = vsubq_s16(v202, v210);
+    int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045);
+    int16x8_t v760 = vaddq_s16(v760_tmp, v759);
+    int16x8_t v761 = vaddq_s16(v758, v760);
+    int16x8_t v762 = vsubq_s16(v215, v218);
+    int16x8_t v763 = vsubq_s16(v221, v227);
+    int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045);
+    int16x8_t v764 = vaddq_s16(v764_tmp, v763);
+    int16x8_t v765 = vaddq_s16(v762, v764);
+    int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705);
+    int16x8_t v767 = vaddq_s16(v761, v766);
+    int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121);
+    int16x8_t v769 = vaddq_s16(v757, v768);
+    int16x8_t v770 = vsubq_s16(v236, v239);
+    int16x8_t v771 = vsubq_s16(v244, v248);
+    int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045);
+    int16x8_t v772 = vaddq_s16(v772_tmp, v771);
+    int16x8_t v773 = vaddq_s16(v770, v772);
+    int16x8_t v774 = vsubq_s16(v255, v262);
+    int16x8_t v775 = vsubq_s16(v265, v270);
+    int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045);
+    int16x8_t v776 = vaddq_s16(v776_tmp, v775);
+    int16x8_t v777 = vaddq_s16(v774, v776);
+    int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705);
+    int16x8_t v779 = vaddq_s16(v773, v778);
+    int16x8_t v780 = vsubq_s16(v277, v280);
+    int16x8_t v781 = vsubq_s16(v285, v289);
+    int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045);
+    int16x8_t v782 = vaddq_s16(v782_tmp, v781);
+    int16x8_t v783 = vaddq_s16(v780, v782);
+    int16x8_t v784 = vsubq_s16(v294, v297);
+    int16x8_t v785 = vsubq_s16(v300, v306);
+    int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045);
+    int16x8_t v786 = vaddq_s16(v786_tmp, v785);
+    int16x8_t v787 = vaddq_s16(v784, v786);
+    int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705);
+    int16x8_t v789 = vaddq_s16(v783, v788);
+    int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121);
+    int16x8_t v791 = vaddq_s16(v779, v790);
+    int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563);
+    int16x8_t v793 = vaddq_s16(v769, v792);
+    int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429);
+    int16x8_t v795 = vaddq_s16(v747, v794);
+    int16x8_t v796 = vsubq_s16(v319, v322);
+    int16x8_t v797 = vsubq_s16(v327, v331);
+    int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045);
+    int16x8_t v798 = vaddq_s16(v798_tmp, v797);
+    int16x8_t v799 = vaddq_s16(v796, v798);
+    int16x8_t v800 = vsubq_s16(v338, v345);
+    int16x8_t v801 = vsubq_s16(v348, v353);
+    int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045);
+    int16x8_t v802 = vaddq_s16(v802_tmp, v801);
+    int16x8_t v803 = vaddq_s16(v800, v802);
+    int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705);
+    int16x8_t v805 = vaddq_s16(v799, v804);
+    int16x8_t v806 = vsubq_s16(v362, v369);
+    int16x8_t v807 = vsubq_s16(v378, v386);
+    int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045);
+    int16x8_t v808 = vaddq_s16(v808_tmp, v807);
+    int16x8_t v809 = vaddq_s16(v806, v808);
+    int16x8_t v810 = vsubq_s16(v391, v394);
+    int16x8_t v811 = vsubq_s16(v397, v403);
+    int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045);
+    int16x8_t v812 = vaddq_s16(v812_tmp, v811);
+    int16x8_t v813 = vaddq_s16(v810, v812);
+    int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705);
+    int16x8_t v815 = vaddq_s16(v809, v814);
+    int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121);
+    int16x8_t v817 = vaddq_s16(v805, v816);
+    int16x8_t v818 = vsubq_s16(v414, v421);
+    int16x8_t v819 = vsubq_s16(v430, v438);
+    int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045);
+    int16x8_t v820 = vaddq_s16(v820_tmp, v819);
+    int16x8_t v821 = vaddq_s16(v818, v820);
+    int16x8_t v822 = vsubq_s16(v449, v464);
+    int16x8_t v823 = vsubq_s16(v467, v476);
+    int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045);
+    int16x8_t v824 = vaddq_s16(v824_tmp, v823);
+    int16x8_t v825 = vaddq_s16(v822, v824);
+    int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705);
+    int16x8_t v827 = vaddq_s16(v821, v826);
+    int16x8_t v828 = vsubq_s16(v483, v486);
+    int16x8_t v829 = vsubq_s16(v491, v495);
+    int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045);
+    int16x8_t v830 = vaddq_s16(v830_tmp, v829);
+    int16x8_t v831 = vaddq_s16(v828, v830);
+    int16x8_t v832 = vsubq_s16(v500, v503);
+    int16x8_t v833 = vsubq_s16(v506, v513);
+    int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045);
+    int16x8_t v834 = vaddq_s16(v834_tmp, v833);
+    int16x8_t v835 = vaddq_s16(v832, v834);
+    int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705);
+    int16x8_t v837 = vaddq_s16(v831, v836);
+    int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121);
+    int16x8_t v839 = vaddq_s16(v827, v838);
+    int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563);
+    int16x8_t v841 = vaddq_s16(v817, v840);
+    int16x8_t v842 = vsubq_s16(v524, v527);
+    int16x8_t v843 = vsubq_s16(v532, v536);
+    int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045);
+    int16x8_t v844 = vaddq_s16(v844_tmp, v843);
+    int16x8_t v845 = vaddq_s16(v842, v844);
+    int16x8_t v846 = vsubq_s16(v543, v550);
+    int16x8_t v847 = vsubq_s16(v553, v558);
+    int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045);
+    int16x8_t v848 = vaddq_s16(v848_tmp, v847);
+    int16x8_t v849 = vaddq_s16(v846, v848);
+    int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705);
+    int16x8_t v851 = vaddq_s16(v845, v850);
+    int16x8_t v852 = vsubq_s16(v567, v574);
+    int16x8_t v853 = vsubq_s16(v583, v591);
+    int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045);
+    int16x8_t v854 = vaddq_s16(v854_tmp, v853);
+    int16x8_t v855 = vaddq_s16(v852, v854);
+    int16x8_t v856 = vsubq_s16(v596, v599);
+    int16x8_t v857 = vsubq_s16(v602, v608);
+    int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045);
+    int16x8_t v858 = vaddq_s16(v858_tmp, v857);
+    int16x8_t v859 = vaddq_s16(v856, v858);
+    int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705);
+    int16x8_t v861 = vaddq_s16(v855, v860);
+    int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121);
+    int16x8_t v863 = vaddq_s16(v851, v862);
+    int16x8_t v864 = vsubq_s16(v617, v620);
+    int16x8_t v865 = vsubq_s16(v625, v629);
+    int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045);
+    int16x8_t v866 = vaddq_s16(v866_tmp, v865);
+    int16x8_t v867 = vaddq_s16(v864, v866);
+    int16x8_t v868 = vsubq_s16(v636, v643);
+    int16x8_t v869 = vsubq_s16(v646, v651);
+    int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045);
+    int16x8_t v870 = vaddq_s16(v870_tmp, v869);
+    int16x8_t v871 = vaddq_s16(v868, v870);
+    int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705);
+    int16x8_t v873 = vaddq_s16(v867, v872);
+    int16x8_t v874 = vsubq_s16(v658, v661);
+    int16x8_t v875 = vsubq_s16(v666, v670);
+    int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045);
+    int16x8_t v876 = vaddq_s16(v876_tmp, v875);
+    int16x8_t v877 = vaddq_s16(v874, v876);
+    int16x8_t v878 = vsubq_s16(v675, v678);
+    int16x8_t v879 = vsubq_s16(v681, v688);
+    int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045);
+    int16x8_t v880 = vaddq_s16(v880_tmp, v879);
+    int16x8_t v881 = vaddq_s16(v878, v880);
+    int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705);
+    int16x8_t v883 = vaddq_s16(v877, v882);
+    int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121);
+    int16x8_t v885 = vaddq_s16(v873, v884);
+    int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563);
+    int16x8_t v887 = vaddq_s16(v863, v886);
+    int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429);
+    int16x8_t v889 = vaddq_s16(v841, v888);
+    int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395);
+    int16x8_t v891 = vaddq_s16(v795, v890);
+    int16x8_t v892 = vsubq_s16(v702, v704);
+    int16x8_t v893 = vsubq_s16(v706, v708);
+    int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490);
+    int16x8_t v895 = vaddq_s16(v892, v894);
+    int16x8_t v896 = vsubq_s16(v712, v714);
+    int16x8_t v897 = vsubq_s16(v716, v718);
+    int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490);
+    int16x8_t v899 = vaddq_s16(v896, v898);
+    int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578);
+    int16x8_t v901 = vaddq_s16(v895, v900);
+    int16x8_t v902 = vsubq_s16(v724, v726);
+    int16x8_t v903 = vsubq_s16(v728, v730);
+    int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490);
+    int16x8_t v905 = vaddq_s16(v902, v904);
+    int16x8_t v906 = vsubq_s16(v734, v736);
+    int16x8_t v907 = vsubq_s16(v738, v740);
+    int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490);
+    int16x8_t v909 = vaddq_s16(v906, v908);
+    int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578);
+    int16x8_t v911 = vaddq_s16(v905, v910);
+    int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890);
+    int16x8_t v913 = vaddq_s16(v901, v912);
+    int16x8_t v914 = vsubq_s16(v748, v750);
+    int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045);
+    int16x8_t v915 = vaddq_s16(v915_tmp, v754);
+    int16x8_t v916 = vsubq_s16(v752, v915);
+    int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490);
+    int16x8_t v918 = vaddq_s16(v914, v917);
+    int16x8_t v919 = vsubq_s16(v758, v760);
+    int16x8_t v920 = vsubq_s16(v762, v764);
+    int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490);
+    int16x8_t v922 = vaddq_s16(v919, v921);
+    int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578);
+    int16x8_t v924 = vaddq_s16(v918, v923);
+    int16x8_t v925 = vsubq_s16(v770, v772);
+    int16x8_t v926 = vsubq_s16(v774, v776);
+    int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490);
+    int16x8_t v928 = vaddq_s16(v925, v927);
+    int16x8_t v929 = vsubq_s16(v780, v782);
+    int16x8_t v930 = vsubq_s16(v784, v786);
+    int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490);
+    int16x8_t v932 = vaddq_s16(v929, v931);
+    int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578);
+    int16x8_t v934 = vaddq_s16(v928, v933);
+    int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890);
+    int16x8_t v936 = vaddq_s16(v924, v935);
+    int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508);
+    int16x8_t v938 = vaddq_s16(v913, v937);
+    int16x8_t v939 = vsubq_s16(v796, v798);
+    int16x8_t v940 = vsubq_s16(v800, v802);
+    int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490);
+    int16x8_t v942 = vaddq_s16(v939, v941);
+    int16x8_t v943 = vsubq_s16(v806, v808);
+    int16x8_t v944 = vsubq_s16(v810, v812);
+    int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490);
+    int16x8_t v946 = vaddq_s16(v943, v945);
+    int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578);
+    int16x8_t v948 = vaddq_s16(v942, v947);
+    int16x8_t v949 = vsubq_s16(v818, v820);
+    int16x8_t v950 = vsubq_s16(v822, v824);
+    int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490);
+    int16x8_t v952 = vaddq_s16(v949, v951);
+    int16x8_t v953 = vsubq_s16(v828, v830);
+    int16x8_t v954 = vsubq_s16(v832, v834);
+    int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490);
+    int16x8_t v956 = vaddq_s16(v953, v955);
+    int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578);
+    int16x8_t v958 = vaddq_s16(v952, v957);
+    int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890);
+    int16x8_t v960 = vaddq_s16(v948, v959);
+    int16x8_t v961 = vsubq_s16(v842, v844);
+    int16x8_t v962 = vsubq_s16(v846, v848);
+    int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490);
+    int16x8_t v964 = vaddq_s16(v961, v963);
+    int16x8_t v965 = vsubq_s16(v852, v854);
+    int16x8_t v966 = vsubq_s16(v856, v858);
+    int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490);
+    int16x8_t v968 = vaddq_s16(v965, v967);
+    int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578);
+    int16x8_t v970 = vaddq_s16(v964, v969);
+    int16x8_t v971 = vsubq_s16(v864, v866);
+    int16x8_t v972 = vsubq_s16(v868, v870);
+    int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490);
+    int16x8_t v974 = vaddq_s16(v971, v973);
+    int16x8_t v975 = vsubq_s16(v874, v876);
+    int16x8_t v976 = vsubq_s16(v878, v880);
+    int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490);
+    int16x8_t v978 = vaddq_s16(v975, v977);
+    int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578);
+    int16x8_t v980 = vaddq_s16(v974, v979);
+    int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890);
+    int16x8_t v982 = vaddq_s16(v970, v981);
+    int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508);
+    int16x8_t v984 = vaddq_s16(v960, v983);
+    int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415);
+    int16x8_t v986 = vaddq_s16(v938, v985);
+    int16x8_t v987 = vsubq_s16(v2, v8);
+    int16x8_t v988 = vsubq_s16(v15, v22);
+    int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446);
+    int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2);
+    int16x8_t v990 = vaddq_s16(v987, v989);
+    int16x8_t v991 = vsubq_s16(v31, v41);
+    int16x8_t v992 = vsubq_s16(v48, v56);
+    int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446);
+    int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2);
+    int16x8_t v994 = vaddq_s16(v991, v993);
+    int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195);
+    int16x8_t v996 = vaddq_s16(v990, v995);
+    int16x8_t v997 = vsubq_s16(v67, v77);
+    int16x8_t v998 = vsubq_s16(v90, v99);
+    int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446);
+    int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2);
+    int16x8_t v1000 = vaddq_s16(v997, v999);
+    int16x8_t v1001 = vsubq_s16(v108, v118);
+    int16x8_t v1002 = vsubq_s16(v125, v134);
+    int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446);
+    int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2);
+    int16x8_t v1004 = vaddq_s16(v1001, v1003);
+    int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195);
+    int16x8_t v1006 = vaddq_s16(v1000, v1005);
+    int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401);
+    int16x8_t v1008 = vaddq_s16(v996, v1007);
+    int16x8_t v1009 = vsubq_s16(v147, v157);
+    int16x8_t v1010 = vsubq_s16(v170, v179);
+    int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446);
+    int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2);
+    int16x8_t v1012 = vaddq_s16(v1009, v1011);
+    int16x8_t v1013 = vsubq_s16(v194, v212);
+    int16x8_t v1014 = vsubq_s16(v219, v229);
+    int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446);
+    int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2);
+    int16x8_t v1016 = vaddq_s16(v1013, v1015);
+    int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195);
+    int16x8_t v1018 = vaddq_s16(v1012, v1017);
+    int16x8_t v1019 = vsubq_s16(v240, v250);
+    int16x8_t v1020 = vsubq_s16(v263, v272);
+    int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446);
+    int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2);
+    int16x8_t v1022 = vaddq_s16(v1019, v1021);
+    int16x8_t v1023 = vsubq_s16(v281, v291);
+    int16x8_t v1024 = vsubq_s16(v298, v308);
+    int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446);
+    int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2);
+    int16x8_t v1026 = vaddq_s16(v1023, v1025);
+    int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195);
+    int16x8_t v1028 = vaddq_s16(v1022, v1027);
+    int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401);
+    int16x8_t v1030 = vaddq_s16(v1018, v1029);
+    int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629);
+    int16x8_t v1032 = vaddq_s16(v1008, v1031);
+    int16x8_t v1033 = vsubq_s16(v323, v333);
+    int16x8_t v1034 = vsubq_s16(v346, v355);
+    int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446);
+    int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2);
+    int16x8_t v1036 = vaddq_s16(v1033, v1035);
+    int16x8_t v1037 = vsubq_s16(v370, v388);
+    int16x8_t v1038 = vsubq_s16(v395, v405);
+    int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446);
+    int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2);
+    int16x8_t v1040 = vaddq_s16(v1037, v1039);
+    int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195);
+    int16x8_t v1042 = vaddq_s16(v1036, v1041);
+    int16x8_t v1043 = vsubq_s16(v422, v440);
+    int16x8_t v1044 = vsubq_s16(v465, v478);
+    int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446);
+    int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2);
+    int16x8_t v1046 = vaddq_s16(v1043, v1045);
+    int16x8_t v1047 = vsubq_s16(v487, v497);
+    int16x8_t v1048 = vsubq_s16(v504, v515);
+    int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446);
+    int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2);
+    int16x8_t v1050 = vaddq_s16(v1047, v1049);
+    int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195);
+    int16x8_t v1052 = vaddq_s16(v1046, v1051);
+    int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401);
+    int16x8_t v1054 = vaddq_s16(v1042, v1053);
+    int16x8_t v1055 = vsubq_s16(v528, v538);
+    int16x8_t v1056 = vsubq_s16(v551, v560);
+    int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446);
+    int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2);
+    int16x8_t v1058 = vaddq_s16(v1055, v1057);
+    int16x8_t v1059 = vsubq_s16(v575, v593);
+    int16x8_t v1060 = vsubq_s16(v600, v610);
+    int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446);
+    int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2);
+    int16x8_t v1062 = vaddq_s16(v1059, v1061);
+    int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195);
+    int16x8_t v1064 = vaddq_s16(v1058, v1063);
+    int16x8_t v1065 = vsubq_s16(v621, v631);
+    int16x8_t v1066 = vsubq_s16(v644, v653);
+    int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446);
+    int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2);
+    int16x8_t v1068 = vaddq_s16(v1065, v1067);
+    int16x8_t v1069 = vsubq_s16(v662, v672);
+    int16x8_t v1070 = vsubq_s16(v679, v690);
+    int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446);
+    int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2);
+    int16x8_t v1072 = vaddq_s16(v1069, v1071);
+    int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195);
+    int16x8_t v1074 = vaddq_s16(v1068, v1073);
+    int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401);
+    int16x8_t v1076 = vaddq_s16(v1064, v1075);
+    int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629);
+    int16x8_t v1078 = vaddq_s16(v1054, v1077);
+    int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445);
+    int16x8_t v1080 = vaddq_s16(v1032, v1079);
+    int16x8_t v1081 = vsubq_s16(v987, v989);
+    int16x8_t v1082 = vsubq_s16(v991, v993);
+    int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826);
+    int16x8_t v1084 = vaddq_s16(v1081, v1083);
+    int16x8_t v1085 = vsubq_s16(v997, v999);
+    int16x8_t v1086 = vsubq_s16(v1001, v1003);
+    int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826);
+    int16x8_t v1088 = vaddq_s16(v1085, v1087);
+    int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124);
+    int16x8_t v1090 = vaddq_s16(v1084, v1089);
+    int16x8_t v1091 = vsubq_s16(v1009, v1011);
+    int16x8_t v1092 = vsubq_s16(v1013, v1015);
+    int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826);
+    int16x8_t v1094 = vaddq_s16(v1091, v1093);
+    int16x8_t v1095 = vsubq_s16(v1019, v1021);
+    int16x8_t v1096 = vsubq_s16(v1023, v1025);
+    int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826);
+    int16x8_t v1098 = vaddq_s16(v1095, v1097);
+    int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124);
+    int16x8_t v1100 = vaddq_s16(v1094, v1099);
+    int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792);
+    int16x8_t v1102 = vaddq_s16(v1090, v1101);
+    int16x8_t v1103 = vsubq_s16(v1033, v1035);
+    int16x8_t v1104 = vsubq_s16(v1037, v1039);
+    int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826);
+    int16x8_t v1106 = vaddq_s16(v1103, v1105);
+    int16x8_t v1107 = vsubq_s16(v1043, v1045);
+    int16x8_t v1108 = vsubq_s16(v1047, v1049);
+    int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826);
+    int16x8_t v1110 = vaddq_s16(v1107, v1109);
+    int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124);
+    int16x8_t v1112 = vaddq_s16(v1106, v1111);
+    int16x8_t v1113 = vsubq_s16(v1055, v1057);
+    int16x8_t v1114 = vsubq_s16(v1059, v1061);
+    int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826);
+    int16x8_t v1116 = vaddq_s16(v1113, v1115);
+    int16x8_t v1117 = vsubq_s16(v1065, v1067);
+    int16x8_t v1118 = vsubq_s16(v1069, v1071);
+    int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826);
+    int16x8_t v1120 = vaddq_s16(v1117, v1119);
+    int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124);
+    int16x8_t v1122 = vaddq_s16(v1116, v1121);
+    int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792);
+    int16x8_t v1124 = vaddq_s16(v1112, v1123);
+    int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484);
+    int16x8_t v1126 = vaddq_s16(v1102, v1125);
+    int16x8_t v1127 = vsubq_s16(v892, v894);
+    int16x8_t v1128 = vsubq_s16(v896, v898);
+    int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988);
+    int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128);
+    int16x8_t v1130 = vaddq_s16(v1127, v1129);
+    int16x8_t v1131 = vsubq_s16(v902, v904);
+    int16x8_t v1132 = vsubq_s16(v906, v908);
+    int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988);
+    int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132);
+    int16x8_t v1134 = vaddq_s16(v1131, v1133);
+    int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102);
+    int16x8_t v1136 = vaddq_s16(v1130, v1135);
+    int16x8_t v1137 = vsubq_s16(v914, v917);
+    int16x8_t v1138 = vsubq_s16(v919, v921);
+    int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988);
+    int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138);
+    int16x8_t v1140 = vaddq_s16(v1137, v1139);
+    int16x8_t v1141 = vsubq_s16(v925, v927);
+    int16x8_t v1142 = vsubq_s16(v929, v931);
+    int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988);
+    int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142);
+    int16x8_t v1144 = vaddq_s16(v1141, v1143);
+    int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102);
+    int16x8_t v1146 = vaddq_s16(v1140, v1145);
+    int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000);
+    int16x8_t v1148 = vaddq_s16(v1136, v1147);
+    int16x8_t v1149 = vsubq_s16(v939, v941);
+    int16x8_t v1150 = vsubq_s16(v943, v945);
+    int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988);
+    int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150);
+    int16x8_t v1152 = vaddq_s16(v1149, v1151);
+    int16x8_t v1153 = vsubq_s16(v949, v951);
+    int16x8_t v1154 = vsubq_s16(v953, v955);
+    int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988);
+    int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154);
+    int16x8_t v1156 = vaddq_s16(v1153, v1155);
+    int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102);
+    int16x8_t v1158 = vaddq_s16(v1152, v1157);
+    int16x8_t v1159 = vsubq_s16(v961, v963);
+    int16x8_t v1160 = vsubq_s16(v965, v967);
+    int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988);
+    int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160);
+    int16x8_t v1162 = vaddq_s16(v1159, v1161);
+    int16x8_t v1163 = vsubq_s16(v971, v973);
+    int16x8_t v1164 = vsubq_s16(v975, v977);
+    int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988);
+    int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164);
+    int16x8_t v1166 = vaddq_s16(v1163, v1165);
+    int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102);
+    int16x8_t v1168 = vaddq_s16(v1162, v1167);
+    int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000);
+    int16x8_t v1170 = vaddq_s16(v1158, v1169);
+    int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534);
+    int16x8_t v1172 = vaddq_s16(v1148, v1171);
+    int16x8_t v1173 = vsubq_s16(v705, v710);
+    int16x8_t v1174 = vsubq_s16(v715, v720);
+    int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673);
+    int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174);
+    int16x8_t v1176 = vaddq_s16(v1173, v1175);
+    int16x8_t v1177 = vsubq_s16(v727, v732);
+    int16x8_t v1178 = vsubq_s16(v737, v742);
+    int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673);
+    int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178);
+    int16x8_t v1180 = vaddq_s16(v1177, v1179);
+    int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398);
+    int16x8_t v1182 = vaddq_s16(v1176, v1181);
+    int16x8_t v1183 = vsubq_s16(v751, v756);
+    int16x8_t v1184 = vsubq_s16(v761, v766);
+    int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673);
+    int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184);
+    int16x8_t v1186 = vaddq_s16(v1183, v1185);
+    int16x8_t v1187 = vsubq_s16(v773, v778);
+    int16x8_t v1188 = vsubq_s16(v783, v788);
+    int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673);
+    int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188);
+    int16x8_t v1190 = vaddq_s16(v1187, v1189);
+    int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398);
+    int16x8_t v1192 = vaddq_s16(v1186, v1191);
+    int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255);
+    int16x8_t v1194 = vaddq_s16(v1182, v1193);
+    int16x8_t v1195 = vsubq_s16(v799, v804);
+    int16x8_t v1196 = vsubq_s16(v809, v814);
+    int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673);
+    int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196);
+    int16x8_t v1198 = vaddq_s16(v1195, v1197);
+    int16x8_t v1199 = vsubq_s16(v821, v826);
+    int16x8_t v1200 = vsubq_s16(v831, v836);
+    int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673);
+    int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200);
+    int16x8_t v1202 = vaddq_s16(v1199, v1201);
+    int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398);
+    int16x8_t v1204 = vaddq_s16(v1198, v1203);
+    int16x8_t v1205 = vsubq_s16(v845, v850);
+    int16x8_t v1206 = vsubq_s16(v855, v860);
+    int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673);
+    int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206);
+    int16x8_t v1208 = vaddq_s16(v1205, v1207);
+    int16x8_t v1209 = vsubq_s16(v867, v872);
+    int16x8_t v1210 = vsubq_s16(v877, v882);
+    int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673);
+    int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210);
+    int16x8_t v1212 = vaddq_s16(v1209, v1211);
+    int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398);
+    int16x8_t v1214 = vaddq_s16(v1208, v1213);
+    int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255);
+    int16x8_t v1216 = vaddq_s16(v1204, v1215);
+    int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595);
+    int16x8_t v1218 = vaddq_s16(v1194, v1217);
+    int16x8_t v1219 = vsubq_s16(v9, v24);
+    int16x8_t v1220 = vsubq_s16(v42, v58);
+    int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314);
+    int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5);
+    int16x8_t v1222 = vaddq_s16(v1219, v1221);
+    int16x8_t v1223 = vsubq_s16(v78, v101);
+    int16x8_t v1224 = vsubq_s16(v119, v136);
+    int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314);
+    int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5);
+    int16x8_t v1226 = vaddq_s16(v1223, v1225);
+    int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112);
+    int16x8_t v1228 = vaddq_s16(v1222, v1227);
+    int16x8_t v1229 = vsubq_s16(v158, v181);
+    int16x8_t v1230 = vsubq_s16(v213, v231);
+    int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314);
+    int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5);
+    int16x8_t v1232 = vaddq_s16(v1229, v1231);
+    int16x8_t v1233 = vsubq_s16(v251, v274);
+    int16x8_t v1234 = vsubq_s16(v292, v310);
+    int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314);
+    int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5);
+    int16x8_t v1236 = vaddq_s16(v1233, v1235);
+    int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112);
+    int16x8_t v1238 = vaddq_s16(v1232, v1237);
+    int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561);
+    int16x8_t v1240 = vaddq_s16(v1228, v1239);
+    int16x8_t v1241 = vsubq_s16(v334, v357);
+    int16x8_t v1242 = vsubq_s16(v389, v407);
+    int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314);
+    int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5);
+    int16x8_t v1244 = vaddq_s16(v1241, v1243);
+    int16x8_t v1245 = vsubq_s16(v441, v480);
+    int16x8_t v1246 = vsubq_s16(v498, v517);
+    int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314);
+    int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5);
+    int16x8_t v1248 = vaddq_s16(v1245, v1247);
+    int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112);
+    int16x8_t v1250 = vaddq_s16(v1244, v1249);
+    int16x8_t v1251 = vsubq_s16(v539, v562);
+    int16x8_t v1252 = vsubq_s16(v594, v612);
+    int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314);
+    int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5);
+    int16x8_t v1254 = vaddq_s16(v1251, v1253);
+    int16x8_t v1255 = vsubq_s16(v632, v655);
+    int16x8_t v1256 = vsubq_s16(v673, v692);
+    int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314);
+    int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5);
+    int16x8_t v1258 = vaddq_s16(v1255, v1257);
+    int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112);
+    int16x8_t v1260 = vaddq_s16(v1254, v1259);
+    int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561);
+    int16x8_t v1262 = vaddq_s16(v1250, v1261);
+    int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666);
+    int16x8_t v1264 = vaddq_s16(v1240, v1263);
+    int16x8_t v1265 = vsubq_s16(v1219, v1221);
+    int16x8_t v1266 = vsubq_s16(v1223, v1225);
+    int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397);
+    int16x8_t v1268 = vaddq_s16(v1265, v1267);
+    int16x8_t v1269 = vsubq_s16(v1229, v1231);
+    int16x8_t v1270 = vsubq_s16(v1233, v1235);
+    int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397);
+    int16x8_t v1272 = vaddq_s16(v1269, v1271);
+    int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921);
+    int16x8_t v1274 = vaddq_s16(v1268, v1273);
+    int16x8_t v1275 = vsubq_s16(v1241, v1243);
+    int16x8_t v1276 = vsubq_s16(v1245, v1247);
+    int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397);
+    int16x8_t v1278 = vaddq_s16(v1275, v1277);
+    int16x8_t v1279 = vsubq_s16(v1251, v1253);
+    int16x8_t v1280 = vsubq_s16(v1255, v1257);
+    int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397);
+    int16x8_t v1282 = vaddq_s16(v1279, v1281);
+    int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921);
+    int16x8_t v1284 = vaddq_s16(v1278, v1283);
+    int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747);
+    int16x8_t v1286 = vaddq_s16(v1274, v1285);
+    int16x8_t v1287 = vsubq_s16(v1173, v1175);
+    int16x8_t v1288 = vsubq_s16(v1177, v1179);
+    int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504);
+    int16x8_t v1290 = vaddq_s16(v1287, v1289);
+    int16x8_t v1291 = vsubq_s16(v1183, v1185);
+    int16x8_t v1292 = vsubq_s16(v1187, v1189);
+    int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504);
+    int16x8_t v1294 = vaddq_s16(v1291, v1293);
+    int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343);
+    int16x8_t v1296 = vaddq_s16(v1290, v1295);
+    int16x8_t v1297 = vsubq_s16(v1195, v1197);
+    int16x8_t v1298 = vsubq_s16(v1199, v1201);
+    int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504);
+    int16x8_t v1300 = vaddq_s16(v1297, v1299);
+    int16x8_t v1301 = vsubq_s16(v1205, v1207);
+    int16x8_t v1302 = vsubq_s16(v1209, v1211);
+    int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504);
+    int16x8_t v1304 = vaddq_s16(v1301, v1303);
+    int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343);
+    int16x8_t v1306 = vaddq_s16(v1300, v1305);
+    int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840);
+    int16x8_t v1308 = vaddq_s16(v1296, v1307);
+    int16x8_t v1309 = vsubq_s16(v1127, v1129);
+    int16x8_t v1310 = vsubq_s16(v1131, v1133);
+    int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869);
+    int16x8_t v1312 = vaddq_s16(v1309, v1311);
+    int16x8_t v1313 = vsubq_s16(v1137, v1139);
+    int16x8_t v1314 = vsubq_s16(v1141, v1143);
+    int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869);
+    int16x8_t v1316 = vaddq_s16(v1313, v1315);
+    int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830);
+    int16x8_t v1318 = vaddq_s16(v1312, v1317);
+    int16x8_t v1319 = vsubq_s16(v1149, v1151);
+    int16x8_t v1320 = vsubq_s16(v1153, v1155);
+    int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869);
+    int16x8_t v1322 = vaddq_s16(v1319, v1321);
+    int16x8_t v1323 = vsubq_s16(v1159, v1161);
+    int16x8_t v1324 = vsubq_s16(v1163, v1165);
+    int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869);
+    int16x8_t v1326 = vaddq_s16(v1323, v1325);
+    int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830);
+    int16x8_t v1328 = vaddq_s16(v1322, v1327);
+    int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944);
+    int16x8_t v1330 = vaddq_s16(v1318, v1329);
+    int16x8_t v1331 = vsubq_s16(v1081, v1083);
+    int16x8_t v1332 = vsubq_s16(v1085, v1087);
+    int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552);
+    int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332);
+    int16x8_t v1334 = vaddq_s16(v1331, v1333);
+    int16x8_t v1335 = vsubq_s16(v1091, v1093);
+    int16x8_t v1336 = vsubq_s16(v1095, v1097);
+    int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552);
+    int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336);
+    int16x8_t v1338 = vaddq_s16(v1335, v1337);
+    int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393);
+    int16x8_t v1340 = vaddq_s16(v1334, v1339);
+    int16x8_t v1341 = vsubq_s16(v1103, v1105);
+    int16x8_t v1342 = vsubq_s16(v1107, v1109);
+    int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552);
+    int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342);
+    int16x8_t v1344 = vaddq_s16(v1341, v1343);
+    int16x8_t v1345 = vsubq_s16(v1113, v1115);
+    int16x8_t v1346 = vsubq_s16(v1117, v1119);
+    int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552);
+    int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346);
+    int16x8_t v1348 = vaddq_s16(v1345, v1347);
+    int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393);
+    int16x8_t v1350 = vaddq_s16(v1344, v1349);
+    int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059);
+    int16x8_t v1352 = vaddq_s16(v1340, v1351);
+    int16x8_t v1353 = vsubq_s16(v990, v995);
+    int16x8_t v1354 = vsubq_s16(v1000, v1005);
+    int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865);
+    int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354);
+    int16x8_t v1356 = vaddq_s16(v1353, v1355);
+    int16x8_t v1357 = vsubq_s16(v1012, v1017);
+    int16x8_t v1358 = vsubq_s16(v1022, v1027);
+    int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865);
+    int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358);
+    int16x8_t v1360 = vaddq_s16(v1357, v1359);
+    int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040);
+    int16x8_t v1362 = vaddq_s16(v1356, v1361);
+    int16x8_t v1363 = vsubq_s16(v1036, v1041);
+    int16x8_t v1364 = vsubq_s16(v1046, v1051);
+    int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865);
+    int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364);
+    int16x8_t v1366 = vaddq_s16(v1363, v1365);
+    int16x8_t v1367 = vsubq_s16(v1058, v1063);
+    int16x8_t v1368 = vsubq_s16(v1068, v1073);
+    int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865);
+    int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368);
+    int16x8_t v1370 = vaddq_s16(v1367, v1369);
+    int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040);
+    int16x8_t v1372 = vaddq_s16(v1366, v1371);
+    int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187);
+    int16x8_t v1374 = vaddq_s16(v1362, v1373);
+    int16x8_t v1375 = vsubq_s16(v895, v900);
+    int16x8_t v1376 = vsubq_s16(v905, v910);
+    int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893);
+    int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2);
+    int16x8_t v1378 = vaddq_s16(v1375, v1377);
+    int16x8_t v1379 = vsubq_s16(v918, v923);
+    int16x8_t v1380 = vsubq_s16(v928, v933);
+    int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893);
+    int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2);
+    int16x8_t v1382 = vaddq_s16(v1379, v1381);
+    int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783);
+    int16x8_t v1384 = vaddq_s16(v1378, v1383);
+    int16x8_t v1385 = vsubq_s16(v942, v947);
+    int16x8_t v1386 = vsubq_s16(v952, v957);
+    int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893);
+    int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2);
+    int16x8_t v1388 = vaddq_s16(v1385, v1387);
+    int16x8_t v1389 = vsubq_s16(v964, v969);
+    int16x8_t v1390 = vsubq_s16(v974, v979);
+    int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893);
+    int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2);
+    int16x8_t v1392 = vaddq_s16(v1389, v1391);
+    int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783);
+    int16x8_t v1394 = vaddq_s16(v1388, v1393);
+    int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326);
+    int16x8_t v1396 = vaddq_s16(v1384, v1395);
+    int16x8_t v1397 = vsubq_s16(v711, v722);
+    int16x8_t v1398 = vsubq_s16(v733, v744);
+    int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357);
+    int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3);
+    int16x8_t v1400 = vaddq_s16(v1397, v1399);
+    int16x8_t v1401 = vsubq_s16(v757, v768);
+    int16x8_t v1402 = vsubq_s16(v779, v790);
+    int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357);
+    int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3);
+    int16x8_t v1404 = vaddq_s16(v1401, v1403);
+    int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637);
+    int16x8_t v1406 = vaddq_s16(v1400, v1405);
+    int16x8_t v1407 = vsubq_s16(v805, v816);
+    int16x8_t v1408 = vsubq_s16(v827, v838);
+    int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357);
+    int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3);
+    int16x8_t v1410 = vaddq_s16(v1407, v1409);
+    int16x8_t v1411 = vsubq_s16(v851, v862);
+    int16x8_t v1412 = vsubq_s16(v873, v884);
+    int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357);
+    int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3);
+    int16x8_t v1414 = vaddq_s16(v1411, v1413);
+    int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637);
+    int16x8_t v1416 = vaddq_s16(v1410, v1415);
+    int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479);
+    int16x8_t v1418 = vaddq_s16(v1406, v1417);
+    int16x8_t v1419 = vsubq_s16(v25, v60);
+    int16x8_t v1420 = vsubq_s16(v102, v138);
+    int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226);
+    int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10);
+    int16x8_t v1422 = vaddq_s16(v1419, v1421);
+    int16x8_t v1423 = vsubq_s16(v182, v233);
+    int16x8_t v1424 = vsubq_s16(v275, v312);
+    int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226);
+    int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10);
+    int16x8_t v1426 = vaddq_s16(v1423, v1425);
+    int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622);
+    int16x8_t v1428 = vaddq_s16(v1422, v1427);
+    int16x8_t v1429 = vsubq_s16(v358, v409);
+    int16x8_t v1430 = vsubq_s16(v481, v519);
+    int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226);
+    int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10);
+    int16x8_t v1432 = vaddq_s16(v1429, v1431);
+    int16x8_t v1433 = vsubq_s16(v563, v614);
+    int16x8_t v1434 = vsubq_s16(v656, v694);
+    int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226);
+    int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10);
+    int16x8_t v1436 = vaddq_s16(v1433, v1435);
+    int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622);
+    int16x8_t v1438 = vaddq_s16(v1432, v1437);
+    int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646);
+    int16x8_t v1440 = vaddq_s16(v1428, v1439);
+    int16x8_t v1441 = vsubq_s16(v1419, v1421);
+    int16x8_t v1442 = vsubq_s16(v1423, v1425);
+    int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761);
+    int16x8_t v1444 = vaddq_s16(v1441, v1443);
+    int16x8_t v1445 = vsubq_s16(v1429, v1431);
+    int16x8_t v1446 = vsubq_s16(v1433, v1435);
+    int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761);
+    int16x8_t v1448 = vaddq_s16(v1445, v1447);
+    int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826);
+    int16x8_t v1450 = vaddq_s16(v1444, v1449);
+    int16x8_t v1451 = vsubq_s16(v1397, v1399);
+    int16x8_t v1452 = vsubq_s16(v1401, v1403);
+    int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084);
+    int16x8_t v1454 = vaddq_s16(v1451, v1453);
+    int16x8_t v1455 = vsubq_s16(v1407, v1409);
+    int16x8_t v1456 = vsubq_s16(v1411, v1413);
+    int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084);
+    int16x8_t v1458 = vaddq_s16(v1455, v1457);
+    int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021);
+    int16x8_t v1460 = vaddq_s16(v1454, v1459);
+    int16x8_t v1461 = vsubq_s16(v1375, v1377);
+    int16x8_t v1462 = vsubq_s16(v1379, v1381);
+    int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631);
+    int16x8_t v1464 = vaddq_s16(v1461, v1463);
+    int16x8_t v1465 = vsubq_s16(v1385, v1387);
+    int16x8_t v1466 = vsubq_s16(v1389, v1391);
+    int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631);
+    int16x8_t v1468 = vaddq_s16(v1465, v1467);
+    int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231);
+    int16x8_t v1470 = vaddq_s16(v1464, v1469);
+    int16x8_t v1471 = vsubq_s16(v1353, v1355);
+    int16x8_t v1472 = vsubq_s16(v1357, v1359);
+    int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454);
+    int16x8_t v1474 = vaddq_s16(v1471, v1473);
+    int16x8_t v1475 = vsubq_s16(v1363, v1365);
+    int16x8_t v1476 = vsubq_s16(v1367, v1369);
+    int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454);
+    int16x8_t v1478 = vaddq_s16(v1475, v1477);
+    int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458);
+    int16x8_t v1480 = vaddq_s16(v1474, v1479);
+    int16x8_t v1481 = vsubq_s16(v1331, v1333);
+    int16x8_t v1482 = vsubq_s16(v1335, v1337);
+    int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624);
+    int16x8_t v1484 = vaddq_s16(v1481, v1483);
+    int16x8_t v1485 = vsubq_s16(v1341, v1343);
+    int16x8_t v1486 = vsubq_s16(v1345, v1347);
+    int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624);
+    int16x8_t v1488 = vaddq_s16(v1485, v1487);
+    int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702);
+    int16x8_t v1490 = vaddq_s16(v1484, v1489);
+    int16x8_t v1491 = vsubq_s16(v1309, v1311);
+    int16x8_t v1492 = vsubq_s16(v1313, v1315);
+    int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472);
+    int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492);
+    int16x8_t v1494 = vaddq_s16(v1491, v1493);
+    int16x8_t v1495 = vsubq_s16(v1319, v1321);
+    int16x8_t v1496 = vsubq_s16(v1323, v1325);
+    int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472);
+    int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496);
+    int16x8_t v1498 = vaddq_s16(v1495, v1497);
+    int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964);
+    int16x8_t v1500 = vaddq_s16(v1494, v1499);
+    int16x8_t v1501 = vsubq_s16(v1287, v1289);
+    int16x8_t v1502 = vsubq_s16(v1291, v1293);
+    int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672);
+    int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502);
+    int16x8_t v1504 = vaddq_s16(v1501, v1503);
+    int16x8_t v1505 = vsubq_s16(v1297, v1299);
+    int16x8_t v1506 = vsubq_s16(v1301, v1303);
+    int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672);
+    int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506);
+    int16x8_t v1508 = vaddq_s16(v1505, v1507);
+    int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245);
+    int16x8_t v1510 = vaddq_s16(v1504, v1509);
+    int16x8_t v1511 = vsubq_s16(v1265, v1267);
+    int16x8_t v1512 = vsubq_s16(v1269, v1271);
+    int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662);
+    int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512);
+    int16x8_t v1514 = vaddq_s16(v1511, v1513);
+    int16x8_t v1515 = vsubq_s16(v1275, v1277);
+    int16x8_t v1516 = vsubq_s16(v1279, v1281);
+    int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662);
+    int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516);
+    int16x8_t v1518 = vaddq_s16(v1515, v1517);
+    int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546);
+    int16x8_t v1520 = vaddq_s16(v1514, v1519);
+    int16x8_t v1521 = vsubq_s16(v1222, v1227);
+    int16x8_t v1522 = vsubq_s16(v1232, v1237);
+    int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756);
+    int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522);
+    int16x8_t v1524 = vaddq_s16(v1521, v1523);
+    int16x8_t v1525 = vsubq_s16(v1244, v1249);
+    int16x8_t v1526 = vsubq_s16(v1254, v1259);
+    int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756);
+    int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526);
+    int16x8_t v1528 = vaddq_s16(v1525, v1527);
+    int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869);
+    int16x8_t v1530 = vaddq_s16(v1524, v1529);
+    int16x8_t v1531 = vsubq_s16(v1176, v1181);
+    int16x8_t v1532 = vsubq_s16(v1186, v1191);
+    int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463);
+    int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532);
+    int16x8_t v1534 = vaddq_s16(v1531, v1533);
+    int16x8_t v1535 = vsubq_s16(v1198, v1203);
+    int16x8_t v1536 = vsubq_s16(v1208, v1213);
+    int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463);
+    int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536);
+    int16x8_t v1538 = vaddq_s16(v1535, v1537);
+    int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216);
+    int16x8_t v1540 = vaddq_s16(v1534, v1539);
+    int16x8_t v1541 = vsubq_s16(v1130, v1135);
+    int16x8_t v1542 = vsubq_s16(v1140, v1145);
+    int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661);
+    int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542);
+    int16x8_t v1544 = vaddq_s16(v1541, v1543);
+    int16x8_t v1545 = vsubq_s16(v1152, v1157);
+    int16x8_t v1546 = vsubq_s16(v1162, v1167);
+    int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661);
+    int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546);
+    int16x8_t v1548 = vaddq_s16(v1545, v1547);
+    int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587);
+    int16x8_t v1550 = vaddq_s16(v1544, v1549);
+    int16x8_t v1551 = vsubq_s16(v1084, v1089);
+    int16x8_t v1552 = vsubq_s16(v1094, v1099);
+    int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242);
+    int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2);
+    int16x8_t v1554 = vaddq_s16(v1551, v1553);
+    int16x8_t v1555 = vsubq_s16(v1106, v1111);
+    int16x8_t v1556 = vsubq_s16(v1116, v1121);
+    int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242);
+    int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2);
+    int16x8_t v1558 = vaddq_s16(v1555, v1557);
+    int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985);
+    int16x8_t v1560 = vaddq_s16(v1554, v1559);
+    int16x8_t v1561 = vsubq_s16(v996, v1007);
+    int16x8_t v1562 = vsubq_s16(v1018, v1029);
+    int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298);
+    int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2);
+    int16x8_t v1564 = vaddq_s16(v1561, v1563);
+    int16x8_t v1565 = vsubq_s16(v1042, v1053);
+    int16x8_t v1566 = vsubq_s16(v1064, v1075);
+    int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298);
+    int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2);
+    int16x8_t v1568 = vaddq_s16(v1565, v1567);
+    int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412);
+    int16x8_t v1570 = vaddq_s16(v1564, v1569);
+    int16x8_t v1571 = vsubq_s16(v901, v912);
+    int16x8_t v1572 = vsubq_s16(v924, v935);
+    int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773);
+    int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4);
+    int16x8_t v1574 = vaddq_s16(v1571, v1573);
+    int16x8_t v1575 = vsubq_s16(v948, v959);
+    int16x8_t v1576 = vsubq_s16(v970, v981);
+    int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773);
+    int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4);
+    int16x8_t v1578 = vaddq_s16(v1575, v1577);
+    int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871);
+    int16x8_t v1580 = vaddq_s16(v1574, v1579);
+    int16x8_t v1581 = vsubq_s16(v723, v746);
+    int16x8_t v1582 = vsubq_s16(v769, v792);
+    int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108);
+    int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6);
+    int16x8_t v1584 = vaddq_s16(v1581, v1583);
+    int16x8_t v1585 = vsubq_s16(v817, v840);
+    int16x8_t v1586 = vsubq_s16(v863, v886);
+    int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108);
+    int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6);
+    int16x8_t v1588 = vaddq_s16(v1585, v1587);
+    int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363);
+    int16x8_t v1590 = vaddq_s16(v1584, v1589);
+    int16x8_t v1591 = vsubq_s16(v61, v140);
+    int16x8_t v1592 = vsubq_s16(v234, v314);
+    int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251);
+    int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20);
+    int16x8_t v1594 = vaddq_s16(v1591, v1593);
+    int16x8_t v1595 = vsubq_s16(v410, v521);
+    int16x8_t v1596 = vsubq_s16(v615, v696);
+    int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251);
+    int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20);
+    int16x8_t v1598 = vaddq_s16(v1595, v1597);
+    int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891);
+    int16x8_t v1600 = vaddq_s16(v1594, v1599);
+    int16x8_t v1601 = vsubq_s16(v1591, v1593);
+    int16x8_t v1602 = vsubq_s16(v1595, v1597);
+    int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460);
+    int16x8_t v1604 = vaddq_s16(v1601, v1603);
+    int16x8_t v1605 = vsubq_s16(v1581, v1583);
+    int16x8_t v1606 = vsubq_s16(v1585, v1587);
+    int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073);
+    int16x8_t v1608 = vaddq_s16(v1605, v1607);
+    int16x8_t v1609 = vsubq_s16(v1571, v1573);
+    int16x8_t v1610 = vsubq_s16(v1575, v1577);
+    int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734);
+    int16x8_t v1612 = vaddq_s16(v1609, v1611);
+    int16x8_t v1613 = vsubq_s16(v1561, v1563);
+    int16x8_t v1614 = vsubq_s16(v1565, v1567);
+    int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448);
+    int16x8_t v1616 = vaddq_s16(v1613, v1615);
+    int16x8_t v1617 = vsubq_s16(v1551, v1553);
+    int16x8_t v1618 = vsubq_s16(v1555, v1557);
+    int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220);
+    int16x8_t v1620 = vaddq_s16(v1617, v1619);
+    int16x8_t v1621 = vsubq_s16(v1541, v1543);
+    int16x8_t v1622 = vsubq_s16(v1545, v1547);
+    int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058);
+    int16x8_t v1624 = vaddq_s16(v1621, v1623);
+    int16x8_t v1625 = vsubq_s16(v1531, v1533);
+    int16x8_t v1626 = vsubq_s16(v1535, v1537);
+    int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969);
+    int16x8_t v1628 = vaddq_s16(v1625, v1627);
+    int16x8_t v1629 = vsubq_s16(v1521, v1523);
+    int16x8_t v1630 = vsubq_s16(v1525, v1527);
+    int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961);
+    int16x8_t v1632 = vaddq_s16(v1629, v1631);
+    int16x8_t v1633 = vsubq_s16(v1511, v1513);
+    int16x8_t v1634 = vsubq_s16(v1515, v1517);
+    int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044);
+    int16x8_t v1636 = vaddq_s16(v1633, v1635);
+    int16x8_t v1637 = vsubq_s16(v1501, v1503);
+    int16x8_t v1638 = vsubq_s16(v1505, v1507);
+    int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232);
+    int16x8_t v1640 = vaddq_s16(v1637, v1639);
+    int16x8_t v1641 = vsubq_s16(v1491, v1493);
+    int16x8_t v1642 = vsubq_s16(v1495, v1497);
+    int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538);
+    int16x8_t v1644 = vaddq_s16(v1641, v1643);
+    int16x8_t v1645 = vsubq_s16(v1481, v1483);
+    int16x8_t v1646 = vsubq_s16(v1485, v1487);
+    int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211);
+    int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646);
+    int16x8_t v1648 = vaddq_s16(v1645, v1647);
+    int16x8_t v1649 = vsubq_s16(v1471, v1473);
+    int16x8_t v1650 = vsubq_s16(v1475, v1477);
+    int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808);
+    int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650);
+    int16x8_t v1652 = vaddq_s16(v1649, v1651);
+    int16x8_t v1653 = vsubq_s16(v1461, v1463);
+    int16x8_t v1654 = vsubq_s16(v1465, v1467);
+    int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586);
+    int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654);
+    int16x8_t v1656 = vaddq_s16(v1653, v1655);
+    int16x8_t v1657 = vsubq_s16(v1451, v1453);
+    int16x8_t v1658 = vsubq_s16(v1455, v1457);
+    int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576);
+    int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658);
+    int16x8_t v1660 = vaddq_s16(v1657, v1659);
+    int16x8_t v1661 = vsubq_s16(v1441, v1443);
+    int16x8_t v1662 = vsubq_s16(v1445, v1447);
+    int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817);
+    int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662);
+    int16x8_t v1664 = vaddq_s16(v1661, v1663);
+    int16x8_t v1665 = vsubq_s16(v1422, v1427);
+    int16x8_t v1666 = vsubq_s16(v1432, v1437);
+    int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356);
+    int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666);
+    int16x8_t v1668 = vaddq_s16(v1665, v1667);
+    int16x8_t v1669 = vsubq_s16(v1400, v1405);
+    int16x8_t v1670 = vsubq_s16(v1410, v1415);
+    int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256);
+    int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670);
+    int16x8_t v1672 = vaddq_s16(v1669, v1671);
+    int16x8_t v1673 = vsubq_s16(v1378, v1383);
+    int16x8_t v1674 = vsubq_s16(v1388, v1393);
+    int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596);
+    int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674);
+    int16x8_t v1676 = vaddq_s16(v1673, v1675);
+    int16x8_t v1677 = vsubq_s16(v1356, v1361);
+    int16x8_t v1678 = vsubq_s16(v1366, v1371);
+    int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483);
+    int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678);
+    int16x8_t v1680 = vaddq_s16(v1677, v1679);
+    int16x8_t v1681 = vsubq_s16(v1334, v1339);
+    int16x8_t v1682 = vsubq_s16(v1344, v1349);
+    int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057);
+    int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682);
+    int16x8_t v1684 = vaddq_s16(v1681, v1683);
+    int16x8_t v1685 = vsubq_s16(v1312, v1317);
+    int16x8_t v1686 = vsubq_s16(v1322, v1327);
+    int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517);
+    int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686);
+    int16x8_t v1688 = vaddq_s16(v1685, v1687);
+    int16x8_t v1689 = vsubq_s16(v1290, v1295);
+    int16x8_t v1690 = vsubq_s16(v1300, v1305);
+    int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373);
+    int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2);
+    int16x8_t v1692 = vaddq_s16(v1689, v1691);
+    int16x8_t v1693 = vsubq_s16(v1268, v1273);
+    int16x8_t v1694 = vsubq_s16(v1278, v1283);
+    int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571);
+    int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2);
+    int16x8_t v1696 = vaddq_s16(v1693, v1695);
+    int16x8_t v1697 = vsubq_s16(v1228, v1239);
+    int16x8_t v1698 = vsubq_s16(v1250, v1261);
+    int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975);
+    int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2);
+    int16x8_t v1700 = vaddq_s16(v1697, v1699);
+    int16x8_t v1701 = vsubq_s16(v1182, v1193);
+    int16x8_t v1702 = vsubq_s16(v1204, v1215);
+    int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832);
+    int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3);
+    int16x8_t v1704 = vaddq_s16(v1701, v1703);
+    int16x8_t v1705 = vsubq_s16(v1136, v1147);
+    int16x8_t v1706 = vsubq_s16(v1158, v1169);
+    int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437);
+    int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3);
+    int16x8_t v1708 = vaddq_s16(v1705, v1707);
+    int16x8_t v1709 = vsubq_s16(v1090, v1101);
+    int16x8_t v1710 = vsubq_s16(v1112, v1123);
+    int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573);
+    int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4);
+    int16x8_t v1712 = vaddq_s16(v1709, v1711);
+    int16x8_t v1713 = vsubq_s16(v1008, v1031);
+    int16x8_t v1714 = vsubq_s16(v1054, v1077);
+    int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122);
+    int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5);
+    int16x8_t v1716 = vaddq_s16(v1713, v1715);
+    int16x8_t v1717 = vsubq_s16(v913, v937);
+    int16x8_t v1718 = vsubq_s16(v960, v983);
+    int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041);
+    int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8);
+    int16x8_t v1720 = vaddq_s16(v1717, v1719);
+    int16x8_t v1721 = vsubq_s16(v747, v794);
+    int16x8_t v1722 = vsubq_s16(v841, v888);
+    int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146);
+    int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13);
+    int16x8_t v1724 = vaddq_s16(v1721, v1723);
+    int16x8_t v1725 = vsubq_s16(v141, v316);
+    int16x8_t v1726 = vsubq_s16(v522, v698);
+    int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402);
+    int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40);
+    int16x8_t v1728 = vaddq_s16(v1725, v1727);
+    int16x8_t v1729 = vsubq_s16(v1725, v1727);
+    int16x8_t v1730 = vsubq_s16(v1721, v1723);
+    int16x8_t v1731 = vsubq_s16(v1717, v1719);
+    int16x8_t v1732 = vsubq_s16(v1713, v1715);
+    int16x8_t v1733 = vsubq_s16(v1709, v1711);
+    int16x8_t v1734 = vsubq_s16(v1705, v1707);
+    int16x8_t v1735 = vsubq_s16(v1701, v1703);
+    int16x8_t v1736 = vsubq_s16(v1697, v1699);
+    int16x8_t v1737 = vsubq_s16(v1693, v1695);
+    int16x8_t v1738 = vsubq_s16(v1689, v1691);
+    int16x8_t v1739 = vsubq_s16(v1685, v1687);
+    int16x8_t v1740 = vsubq_s16(v1681, v1683);
+    int16x8_t v1741 = vsubq_s16(v1677, v1679);
+    int16x8_t v1742 = vsubq_s16(v1673, v1675);
+    int16x8_t v1743 = vsubq_s16(v1669, v1671);
+    int16x8_t v1744 = vsubq_s16(v1665, v1667);
+    int16x8_t v1745 = vsubq_s16(v1661, v1663);
+    int16x8_t v1746 = vsubq_s16(v1657, v1659);
+    int16x8_t v1747 = vsubq_s16(v1653, v1655);
+    int16x8_t v1748 = vsubq_s16(v1649, v1651);
+    int16x8_t v1749 = vsubq_s16(v1645, v1647);
+    int16x8_t v1750 = vsubq_s16(v1641, v1643);
+    int16x8_t v1751 = vsubq_s16(v1637, v1639);
+    int16x8_t v1752 = vsubq_s16(v1633, v1635);
+    int16x8_t v1753 = vsubq_s16(v1629, v1631);
+    int16x8_t v1754 = vsubq_s16(v1625, v1627);
+    int16x8_t v1755 = vsubq_s16(v1621, v1623);
+    int16x8_t v1756 = vsubq_s16(v1617, v1619);
+    int16x8_t v1757 = vsubq_s16(v1613, v1615);
+    int16x8_t v1758 = vsubq_s16(v1609, v1611);
+    int16x8_t v1759 = vsubq_s16(v1605, v1607);
+    int16x8_t v1760 = vsubq_s16(v1601, v1603);
+    int16x8_t v1761 = vsubq_s16(v1594, v1599);
+    int16x8_t v1762 = vsubq_s16(v1584, v1589);
+    int16x8_t v1763 = vsubq_s16(v1574, v1579);
+    int16x8_t v1764 = vsubq_s16(v1564, v1569);
+    int16x8_t v1765 = vsubq_s16(v1554, v1559);
+    int16x8_t v1766 = vsubq_s16(v1544, v1549);
+    int16x8_t v1767 = vsubq_s16(v1534, v1539);
+    int16x8_t v1768 = vsubq_s16(v1524, v1529);
+    int16x8_t v1769 = vsubq_s16(v1514, v1519);
+    int16x8_t v1770 = vsubq_s16(v1504, v1509);
+    int16x8_t v1771 = vsubq_s16(v1494, v1499);
+    int16x8_t v1772 = vsubq_s16(v1484, v1489);
+    int16x8_t v1773 = vsubq_s16(v1474, v1479);
+    int16x8_t v1774 = vsubq_s16(v1464, v1469);
+    int16x8_t v1775 = vsubq_s16(v1454, v1459);
+    int16x8_t v1776 = vsubq_s16(v1444, v1449);
+    int16x8_t v1777 = vsubq_s16(v1428, v1439);
+    int16x8_t v1778 = vsubq_s16(v1406, v1417);
+    int16x8_t v1779 = vsubq_s16(v1384, v1395);
+    int16x8_t v1780 = vsubq_s16(v1362, v1373);
+    int16x8_t v1781 = vsubq_s16(v1340, v1351);
+    int16x8_t v1782 = vsubq_s16(v1318, v1329);
+    int16x8_t v1783 = vsubq_s16(v1296, v1307);
+    int16x8_t v1784 = vsubq_s16(v1274, v1285);
+    int16x8_t v1785 = vsubq_s16(v1240, v1263);
+    int16x8_t v1786 = vsubq_s16(v1194, v1217);
+    int16x8_t v1787 = vsubq_s16(v1148, v1171);
+    int16x8_t v1788 = vsubq_s16(v1102, v1125);
+    int16x8_t v1789 = vsubq_s16(v1032, v1079);
+    int16x8_t v1790 = vsubq_s16(v938, v985);
+    int16x8_t v1791 = vsubq_s16(v795, v890);
+    int16x8_t v1792 = vsubq_s16(v317, v700);
+    vst1q_s16(out + out_stride * 0 + i, v701);
+    vst1q_s16(out + out_stride * 1 + i, v891);
+    vst1q_s16(out + out_stride * 2 + i, v986);
+    vst1q_s16(out + out_stride * 3 + i, v1080);
+    vst1q_s16(out + out_stride * 4 + i, v1126);
+    vst1q_s16(out + out_stride * 5 + i, v1172);
+    vst1q_s16(out + out_stride * 6 + i, v1218);
+    vst1q_s16(out + out_stride * 7 + i, v1264);
+    vst1q_s16(out + out_stride * 8 + i, v1286);
+    vst1q_s16(out + out_stride * 9 + i, v1308);
+    vst1q_s16(out + out_stride * 10 + i, v1330);
+    vst1q_s16(out + out_stride * 11 + i, v1352);
+    vst1q_s16(out + out_stride * 12 + i, v1374);
+    vst1q_s16(out + out_stride * 13 + i, v1396);
+    vst1q_s16(out + out_stride * 14 + i, v1418);
+    vst1q_s16(out + out_stride * 15 + i, v1440);
+    vst1q_s16(out + out_stride * 16 + i, v1450);
+    vst1q_s16(out + out_stride * 17 + i, v1460);
+    vst1q_s16(out + out_stride * 18 + i, v1470);
+    vst1q_s16(out + out_stride * 19 + i, v1480);
+    vst1q_s16(out + out_stride * 20 + i, v1490);
+    vst1q_s16(out + out_stride * 21 + i, v1500);
+    vst1q_s16(out + out_stride * 22 + i, v1510);
+    vst1q_s16(out + out_stride * 23 + i, v1520);
+    vst1q_s16(out + out_stride * 24 + i, v1530);
+    vst1q_s16(out + out_stride * 25 + i, v1540);
+    vst1q_s16(out + out_stride * 26 + i, v1550);
+    vst1q_s16(out + out_stride * 27 + i, v1560);
+    vst1q_s16(out + out_stride * 28 + i, v1570);
+    vst1q_s16(out + out_stride * 29 + i, v1580);
+    vst1q_s16(out + out_stride * 30 + i, v1590);
+    vst1q_s16(out + out_stride * 31 + i, v1600);
+    vst1q_s16(out + out_stride * 32 + i, v1604);
+    vst1q_s16(out + out_stride * 33 + i, v1608);
+    vst1q_s16(out + out_stride * 34 + i, v1612);
+    vst1q_s16(out + out_stride * 35 + i, v1616);
+    vst1q_s16(out + out_stride * 36 + i, v1620);
+    vst1q_s16(out + out_stride * 37 + i, v1624);
+    vst1q_s16(out + out_stride * 38 + i, v1628);
+    vst1q_s16(out + out_stride * 39 + i, v1632);
+    vst1q_s16(out + out_stride * 40 + i, v1636);
+    vst1q_s16(out + out_stride * 41 + i, v1640);
+    vst1q_s16(out + out_stride * 42 + i, v1644);
+    vst1q_s16(out + out_stride * 43 + i, v1648);
+    vst1q_s16(out + out_stride * 44 + i, v1652);
+    vst1q_s16(out + out_stride * 45 + i, v1656);
+    vst1q_s16(out + out_stride * 46 + i, v1660);
+    vst1q_s16(out + out_stride * 47 + i, v1664);
+    vst1q_s16(out + out_stride * 48 + i, v1668);
+    vst1q_s16(out + out_stride * 49 + i, v1672);
+    vst1q_s16(out + out_stride * 50 + i, v1676);
+    vst1q_s16(out + out_stride * 51 + i, v1680);
+    vst1q_s16(out + out_stride * 52 + i, v1684);
+    vst1q_s16(out + out_stride * 53 + i, v1688);
+    vst1q_s16(out + out_stride * 54 + i, v1692);
+    vst1q_s16(out + out_stride * 55 + i, v1696);
+    vst1q_s16(out + out_stride * 56 + i, v1700);
+    vst1q_s16(out + out_stride * 57 + i, v1704);
+    vst1q_s16(out + out_stride * 58 + i, v1708);
+    vst1q_s16(out + out_stride * 59 + i, v1712);
+    vst1q_s16(out + out_stride * 60 + i, v1716);
+    vst1q_s16(out + out_stride * 61 + i, v1720);
+    vst1q_s16(out + out_stride * 62 + i, v1724);
+    vst1q_s16(out + out_stride * 63 + i, v1728);
+    vst1q_s16(out + out_stride * 64 + i, v1729);
+    vst1q_s16(out + out_stride * 65 + i, v1730);
+    vst1q_s16(out + out_stride * 66 + i, v1731);
+    vst1q_s16(out + out_stride * 67 + i, v1732);
+    vst1q_s16(out + out_stride * 68 + i, v1733);
+    vst1q_s16(out + out_stride * 69 + i, v1734);
+    vst1q_s16(out + out_stride * 70 + i, v1735);
+    vst1q_s16(out + out_stride * 71 + i, v1736);
+    vst1q_s16(out + out_stride * 72 + i, v1737);
+    vst1q_s16(out + out_stride * 73 + i, v1738);
+    vst1q_s16(out + out_stride * 74 + i, v1739);
+    vst1q_s16(out + out_stride * 75 + i, v1740);
+    vst1q_s16(out + out_stride * 76 + i, v1741);
+    vst1q_s16(out + out_stride * 77 + i, v1742);
+    vst1q_s16(out + out_stride * 78 + i, v1743);
+    vst1q_s16(out + out_stride * 79 + i, v1744);
+    vst1q_s16(out + out_stride * 80 + i, v1745);
+    vst1q_s16(out + out_stride * 81 + i, v1746);
+    vst1q_s16(out + out_stride * 82 + i, v1747);
+    vst1q_s16(out + out_stride * 83 + i, v1748);
+    vst1q_s16(out + out_stride * 84 + i, v1749);
+    vst1q_s16(out + out_stride * 85 + i, v1750);
+    vst1q_s16(out + out_stride * 86 + i, v1751);
+    vst1q_s16(out + out_stride * 87 + i, v1752);
+    vst1q_s16(out + out_stride * 88 + i, v1753);
+    vst1q_s16(out + out_stride * 89 + i, v1754);
+    vst1q_s16(out + out_stride * 90 + i, v1755);
+    vst1q_s16(out + out_stride * 91 + i, v1756);
+    vst1q_s16(out + out_stride * 92 + i, v1757);
+    vst1q_s16(out + out_stride * 93 + i, v1758);
+    vst1q_s16(out + out_stride * 94 + i, v1759);
+    vst1q_s16(out + out_stride * 95 + i, v1760);
+    vst1q_s16(out + out_stride * 96 + i, v1761);
+    vst1q_s16(out + out_stride * 97 + i, v1762);
+    vst1q_s16(out + out_stride * 98 + i, v1763);
+    vst1q_s16(out + out_stride * 99 + i, v1764);
+    vst1q_s16(out + out_stride * 100 + i, v1765);
+    vst1q_s16(out + out_stride * 101 + i, v1766);
+    vst1q_s16(out + out_stride * 102 + i, v1767);
+    vst1q_s16(out + out_stride * 103 + i, v1768);
+    vst1q_s16(out + out_stride * 104 + i, v1769);
+    vst1q_s16(out + out_stride * 105 + i, v1770);
+    vst1q_s16(out + out_stride * 106 + i, v1771);
+    vst1q_s16(out + out_stride * 107 + i, v1772);
+    vst1q_s16(out + out_stride * 108 + i, v1773);
+    vst1q_s16(out + out_stride * 109 + i, v1774);
+    vst1q_s16(out + out_stride * 110 + i, v1775);
+    vst1q_s16(out + out_stride * 111 + i, v1776);
+    vst1q_s16(out + out_stride * 112 + i, v1777);
+    vst1q_s16(out + out_stride * 113 + i, v1778);
+    vst1q_s16(out + out_stride * 114 + i, v1779);
+    vst1q_s16(out + out_stride * 115 + i, v1780);
+    vst1q_s16(out + out_stride * 116 + i, v1781);
+    vst1q_s16(out + out_stride * 117 + i, v1782);
+    vst1q_s16(out + out_stride * 118 + i, v1783);
+    vst1q_s16(out + out_stride * 119 + i, v1784);
+    vst1q_s16(out + out_stride * 120 + i, v1785);
+    vst1q_s16(out + out_stride * 121 + i, v1786);
+    vst1q_s16(out + out_stride * 122 + i, v1787);
+    vst1q_s16(out + out_stride * 123 + i, v1788);
+    vst1q_s16(out + out_stride * 124 + i, v1789);
+    vst1q_s16(out + out_stride * 125 + i, v1790);
+    vst1q_s16(out + out_stride * 126 + i, v1791);
+    vst1q_s16(out + out_stride * 127 + i, v1792);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h
new file mode 100644
index 0000000000..472ec20d42
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h
@@ -0,0 +1,180 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<16>) { return 1; }  // Compile-time constant 1 for the 16-point tag; presumably the integer-bit headroom of FastIDCT<16>'s fixed-point data -- TODO confirm against fast_dct-inl.h.
+
+void FastIDCT(FastDCTTag<16>, const int16_t* in, size_t in_stride, int16_t* out,  // 16-point fixed-point IDCT (per its name): reads 16 int16 rows from `in`, writes 16 rows to `out`.
+              size_t out_stride, size_t count) {  // Strides are in int16_t elements (pointer arithmetic on int16_t*); `count` columns are processed per row.
+  JXL_ASSERT(count % 8 == 0);  // Columns are consumed in whole 8-lane vectors; there is no tail handling.
+  for (size_t i = 0; i < count; i += 8) {  // i = first column of the current 8-wide group.
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);  // Row k lives at in + in_stride * k + i; all rows k = 0..15 are loaded once below.
+    int16x8_t v1 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);  // vaddq_s16 / vsubq_s16 are plain (wrapping, non-saturating) lane-wise add / subtract.
+    int16x8_t v3 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);  // vqrdmulhq_n_s16(x, c) ~= round(x * c / 32768), saturating (Q15 multiply).
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);  // tmp + x => x * (1 + c / 32768); here 1 + 13573 / 32768 ~= 1.41421.
+    int16x8_t v5 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);  // Bare Q15 scale (factor < 1): x * 17734 / 32768 ~= x * 0.5412.
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v16, v19);
+    int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734);
+    int16x8_t v22 = vaddq_s16(v17, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);  // v25 combines only even-indexed input rows; it feeds output rows 0 and 15.
+    int16x8_t v26 = vld1q_s16(in + in_stride * 15 + i);  // From here on the odd-indexed input rows are loaded.
+    int16x8_t v27 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v28 = vaddq_s16(v26, v27);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v30 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v31 = vaddq_s16(v29, v30);
+    int16x8_t v32 = vaddq_s16(v28, v31);
+    int16x8_t v33 = vqrdmulhq_n_s16(v32, 17734);
+    int16x8_t v34 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v35 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v36 = vaddq_s16(v34, v35);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v38 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v39 = vaddq_s16(v37, v38);
+    int16x8_t v40 = vaddq_s16(v36, v39);
+    int16x8_t v41_tmp = vqrdmulhq_n_s16(v40, 10045);
+    int16x8_t v41 = vaddq_s16(v41_tmp, v40);
+    int16x8_t v42 = vaddq_s16(v33, v41);
+    int16x8_t v43 = vqrdmulhq_n_s16(v42, 16705);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v36, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v36);
+    int16x8_t v45 = vaddq_s16(v39, v31);
+    int16x8_t v46 = vaddq_s16(v44, v45);
+    int16x8_t v47 = vqrdmulhq_n_s16(v46, 16705);
+    int16x8_t v48 = vaddq_s16(v43, v47);
+    int16x8_t v49_tmp = vqrdmulhq_n_s16(v35, 13573);
+    int16x8_t v49 = vaddq_s16(v49_tmp, v35);
+    int16x8_t v50 = vaddq_s16(v30, v37);
+    int16x8_t v51 = vaddq_s16(v49, v50);
+    int16x8_t v52 = vaddq_s16(v38, v34);
+    int16x8_t v53 = vaddq_s16(v27, v29);
+    int16x8_t v54 = vaddq_s16(v52, v53);
+    int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734);
+    int16x8_t v56 = vqrdmulhq_n_s16(v52, 25080);
+    int16x8_t v57 = vaddq_s16(v55, v56);
+    int16x8_t v58 = vaddq_s16(v51, v57);
+    int16x8_t v59 = vaddq_s16(v48, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);  // Output row 0 (its mirror, row 15, is v25 - v60 below).
+    int16x8_t v62 = vsubq_s16(v0, v1);
+    int16x8_t v63 = vsubq_s16(v4, v6);
+    int16x8_t v64_tmp = vqrdmulhq_n_s16(v63, 10045);
+    int16x8_t v64 = vaddq_s16(v64_tmp, v63);
+    int16x8_t v65 = vaddq_s16(v62, v64);
+    int16x8_t v66 = vsubq_s16(v11, v14);
+    int16x8_t v67 = vqrdmulhq_n_s16(v16, 17734);
+    int16x8_t v68_tmp = vqrdmulhq_n_s16(v19, 10045);
+    int16x8_t v68 = vaddq_s16(v68_tmp, v19);
+    int16x8_t v69 = vsubq_s16(v67, v68);
+    int16x8_t v70 = vaddq_s16(v66, v69);
+    int16x8_t v71 = vqrdmulhq_n_s16(v70, 19705);
+    int16x8_t v72 = vaddq_s16(v65, v71);
+    int16x8_t v73 = vsubq_s16(v49, v50);
+    int16x8_t v74 = vqrdmulhq_n_s16(v52, 17734);
+    int16x8_t v75_tmp = vqrdmulhq_n_s16(v53, 10045);
+    int16x8_t v75 = vaddq_s16(v75_tmp, v53);
+    int16x8_t v76 = vsubq_s16(v74, v75);
+    int16x8_t v77 = vaddq_s16(v73, v76);
+    int16x8_t v78 = vsubq_s16(v44, v45);
+    int16x8_t v79 = vqrdmulhq_n_s16(v78, 19705);
+    int16x8_t v80 = vqrdmulhq_n_s16(v40, 13573);
+    int16x8_t v81 = vsubq_s16(v80, v32);
+    int16x8_t v82 = vqrdmulhq_n_s16(v81, 25746);
+    int16x8_t v83 = vaddq_s16(v79, v82);
+    int16x8_t v84 = vaddq_s16(v77, v83);
+    int16x8_t v85 = vqrdmulhq_n_s16(v84, 17121);
+    int16x8_t v86 = vaddq_s16(v72, v85);  // Output row 1.
+    int16x8_t v87 = vsubq_s16(v62, v64);
+    int16x8_t v88 = vsubq_s16(v66, v69);
+    int16x8_t v89 = vqrdmulhq_n_s16(v88, 29490);
+    int16x8_t v90 = vaddq_s16(v87, v89);
+    int16x8_t v91 = vsubq_s16(v73, v76);
+    int16x8_t v92 = vqrdmulhq_n_s16(v78, 29490);
+    int16x8_t v93_tmp = vqrdmulhq_n_s16(v81, 5763);
+    int16x8_t v93 = vaddq_s16(v93_tmp, v81);
+    int16x8_t v94 = vsubq_s16(v92, v93);
+    int16x8_t v95 = vaddq_s16(v91, v94);
+    int16x8_t v96 = vqrdmulhq_n_s16(v95, 18578);
+    int16x8_t v97 = vaddq_s16(v90, v96);  // Output row 2.
+    int16x8_t v98 = vsubq_s16(v46, v42);
+    int16x8_t v99_tmp = vqrdmulhq_n_s16(v98, 18446);
+    int16x8_t v99 = vmlaq_n_s16(v99_tmp, v98, 2);  // vmlaq_n_s16(tmp, x, n) = tmp + x * n => x * (2 + 18446 / 32768) ~= x * 2.5629; used when the factor is >= 2.
+    int16x8_t v100 = vsubq_s16(v51, v57);
+    int16x8_t v101 = vaddq_s16(v99, v100);
+    int16x8_t v102 = vqrdmulhq_n_s16(v101, 21195);
+    int16x8_t v103 = vsubq_s16(v2, v8);
+    int16x8_t v104 = vsubq_s16(v15, v22);
+    int16x8_t v105_tmp = vqrdmulhq_n_s16(v104, 18446);
+    int16x8_t v105 = vmlaq_n_s16(v105_tmp, v104, 2);
+    int16x8_t v106 = vaddq_s16(v103, v105);
+    int16x8_t v107 = vaddq_s16(v102, v106);  // Output row 3.
+    int16x8_t v108 = vsubq_s16(v103, v105);
+    int16x8_t v109 = vsubq_s16(v100, v99);
+    int16x8_t v110 = vqrdmulhq_n_s16(v109, 25826);
+    int16x8_t v111 = vaddq_s16(v108, v110);  // Output row 4.
+    int16x8_t v112 = vsubq_s16(v87, v89);
+    int16x8_t v113 = vsubq_s16(v91, v94);
+    int16x8_t v114_tmp = vqrdmulhq_n_s16(v113, 1988);
+    int16x8_t v114 = vaddq_s16(v114_tmp, v113);
+    int16x8_t v115 = vaddq_s16(v112, v114);  // Output row 5.
+    int16x8_t v116 = vsubq_s16(v65, v71);
+    int16x8_t v117 = vsubq_s16(v77, v83);
+    int16x8_t v118_tmp = vqrdmulhq_n_s16(v117, 23673);
+    int16x8_t v118 = vaddq_s16(v118_tmp, v117);
+    int16x8_t v119 = vaddq_s16(v116, v118);  // Output row 6.
+    int16x8_t v120 = vsubq_s16(v58, v48);
+    int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 3314);
+    int16x8_t v121 = vmlaq_n_s16(v121_tmp, v120, 5);  // x * (5 + 3314 / 32768) ~= x * 5.1011.
+    int16x8_t v122 = vsubq_s16(v9, v24);
+    int16x8_t v123 = vaddq_s16(v121, v122);  // Output row 7.
+    int16x8_t v124 = vsubq_s16(v122, v121);  // Rows 8..15 mirror rows 7..0: each reuses the same two operands with a subtract instead of an add.
+    int16x8_t v125 = vsubq_s16(v116, v118);
+    int16x8_t v126 = vsubq_s16(v112, v114);
+    int16x8_t v127 = vsubq_s16(v108, v110);
+    int16x8_t v128 = vsubq_s16(v106, v102);
+    int16x8_t v129 = vsubq_s16(v90, v96);
+    int16x8_t v130 = vsubq_s16(v72, v85);
+    int16x8_t v131 = vsubq_s16(v25, v60);
+    vst1q_s16(out + out_stride * 0 + i, v61);  // Store the 16 output rows for this column group: row k goes to out + out_stride * k + i.
+    vst1q_s16(out + out_stride * 1 + i, v86);
+    vst1q_s16(out + out_stride * 2 + i, v97);
+    vst1q_s16(out + out_stride * 3 + i, v107);
+    vst1q_s16(out + out_stride * 4 + i, v111);
+    vst1q_s16(out + out_stride * 5 + i, v115);
+    vst1q_s16(out + out_stride * 6 + i, v119);
+    vst1q_s16(out + out_stride * 7 + i, v123);
+    vst1q_s16(out + out_stride * 8 + i, v124);
+    vst1q_s16(out + out_stride * 9 + i, v125);
+    vst1q_s16(out + out_stride * 10 + i, v126);
+    vst1q_s16(out + out_stride * 11 + i, v127);
+    vst1q_s16(out + out_stride * 12 + i, v128);
+    vst1q_s16(out + out_stride * 13 + i, v129);
+    vst1q_s16(out + out_stride * 14 + i, v130);
+    vst1q_s16(out + out_stride * 15 + i, v131);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h
new file mode 100644
index 0000000000..a823440af2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h
@@ -0,0 +1,4811 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<256>) { return 3; }  // Compile-time constant 3 for the 256-point tag; presumably the integer-bit headroom of FastIDCT<256>'s fixed-point data -- TODO confirm against fast_dct-inl.h.
+
+void FastIDCT(FastDCTTag<256>, const int16_t* in, size_t in_stride,
+              int16_t* out, size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 128 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 64 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 192 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 32 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 160 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 96 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573);
+    int16x8_t v17 = vaddq_s16(v17_tmp, v16);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 224 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v19, v16);
+    int16x8_t v21 = vaddq_s16(v17, v20);
+    int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 144 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 112 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 80 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 48 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573);
+    int16x8_t v35 = vaddq_s16(v35_tmp, v34);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 208 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 176 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vaddq_s16(v35, v39);
+    int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v37, v28);
+    int16x8_t v46 = vaddq_s16(v29, v32);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vaddq_s16(v46, v43);
+    int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573);
+    int16x8_t v50 = vaddq_s16(v50_tmp, v49);
+    int16x8_t v51 = vld1q_s16(in + in_stride * 240 + i);
+    int16x8_t v52 = vaddq_s16(v51, v36);
+    int16x8_t v53 = vaddq_s16(v52, v45);
+    int16x8_t v54 = vaddq_s16(v53, v49);
+    int16x8_t v55 = vaddq_s16(v50, v54);
+    int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734);
+    int16x8_t v57 = vaddq_s16(v48, v56);
+    int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+    int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+    int16x8_t v64 = vld1q_s16(in + in_stride * 136 + i);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 120 + i);
+    int16x8_t v66 = vaddq_s16(v64, v65);
+    int16x8_t v67 = vaddq_s16(v63, v66);
+    int16x8_t v68 = vld1q_s16(in + in_stride * 72 + i);
+    int16x8_t v69 = vld1q_s16(in + in_stride * 56 + i);
+    int16x8_t v70 = vaddq_s16(v68, v69);
+    int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573);
+    int16x8_t v71 = vaddq_s16(v71_tmp, v70);
+    int16x8_t v72 = vld1q_s16(in + in_stride * 200 + i);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 184 + i);
+    int16x8_t v74 = vaddq_s16(v72, v73);
+    int16x8_t v75 = vaddq_s16(v74, v70);
+    int16x8_t v76 = vaddq_s16(v71, v75);
+    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+    int16x8_t v78 = vaddq_s16(v67, v77);
+    int16x8_t v79 = vld1q_s16(in + in_stride * 40 + i);
+    int16x8_t v80 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v81 = vaddq_s16(v79, v80);
+    int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+    int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+    int16x8_t v83 = vld1q_s16(in + in_stride * 168 + i);
+    int16x8_t v84 = vld1q_s16(in + in_stride * 152 + i);
+    int16x8_t v85 = vaddq_s16(v83, v84);
+    int16x8_t v86 = vld1q_s16(in + in_stride * 104 + i);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 88 + i);
+    int16x8_t v88 = vaddq_s16(v86, v87);
+    int16x8_t v89 = vaddq_s16(v85, v88);
+    int16x8_t v90 = vaddq_s16(v82, v89);
+    int16x8_t v91 = vaddq_s16(v88, v81);
+    int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573);
+    int16x8_t v92 = vaddq_s16(v92_tmp, v91);
+    int16x8_t v93 = vld1q_s16(in + in_stride * 232 + i);
+    int16x8_t v94 = vld1q_s16(in + in_stride * 216 + i);
+    int16x8_t v95 = vaddq_s16(v93, v94);
+    int16x8_t v96 = vaddq_s16(v95, v85);
+    int16x8_t v97 = vaddq_s16(v96, v91);
+    int16x8_t v98 = vaddq_s16(v92, v97);
+    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+    int16x8_t v100 = vaddq_s16(v90, v99);
+    int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705);
+    int16x8_t v102 = vaddq_s16(v78, v101);
+    int16x8_t v103 = vaddq_s16(v80, v62);
+    int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573);
+    int16x8_t v104 = vaddq_s16(v104_tmp, v103);
+    int16x8_t v105 = vaddq_s16(v84, v64);
+    int16x8_t v106 = vaddq_s16(v65, v86);
+    int16x8_t v107 = vaddq_s16(v105, v106);
+    int16x8_t v108 = vaddq_s16(v104, v107);
+    int16x8_t v109 = vaddq_s16(v87, v68);
+    int16x8_t v110 = vaddq_s16(v69, v79);
+    int16x8_t v111 = vaddq_s16(v109, v110);
+    int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573);
+    int16x8_t v112 = vaddq_s16(v112_tmp, v111);
+    int16x8_t v113 = vaddq_s16(v94, v72);
+    int16x8_t v114 = vaddq_s16(v73, v83);
+    int16x8_t v115 = vaddq_s16(v113, v114);
+    int16x8_t v116 = vaddq_s16(v115, v111);
+    int16x8_t v117 = vaddq_s16(v112, v116);
+    int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734);
+    int16x8_t v119 = vaddq_s16(v108, v118);
+    int16x8_t v120 = vaddq_s16(v110, v103);
+    int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573);
+    int16x8_t v121 = vaddq_s16(v121_tmp, v120);
+    int16x8_t v122 = vaddq_s16(v114, v105);
+    int16x8_t v123 = vaddq_s16(v106, v109);
+    int16x8_t v124 = vaddq_s16(v122, v123);
+    int16x8_t v125 = vaddq_s16(v121, v124);
+    int16x8_t v126 = vaddq_s16(v123, v120);
+    int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573);
+    int16x8_t v127 = vaddq_s16(v127_tmp, v126);
+    int16x8_t v128 = vld1q_s16(in + in_stride * 248 + i);
+    int16x8_t v129 = vaddq_s16(v128, v93);
+    int16x8_t v130 = vaddq_s16(v129, v113);
+    int16x8_t v131 = vaddq_s16(v130, v122);
+    int16x8_t v132 = vaddq_s16(v131, v126);
+    int16x8_t v133 = vaddq_s16(v127, v132);
+    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+    int16x8_t v135 = vaddq_s16(v125, v134);
+    int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705);
+    int16x8_t v137 = vaddq_s16(v119, v136);
+    int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463);
+    int16x8_t v139 = vaddq_s16(v102, v138);
+    int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404);
+    int16x8_t v141 = vaddq_s16(v61, v140);
+    int16x8_t v142 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573);
+    int16x8_t v143 = vaddq_s16(v143_tmp, v142);
+    int16x8_t v144 = vld1q_s16(in + in_stride * 132 + i);
+    int16x8_t v145 = vld1q_s16(in + in_stride * 124 + i);
+    int16x8_t v146 = vaddq_s16(v144, v145);
+    int16x8_t v147 = vaddq_s16(v143, v146);
+    int16x8_t v148 = vld1q_s16(in + in_stride * 68 + i);
+    int16x8_t v149 = vld1q_s16(in + in_stride * 60 + i);
+    int16x8_t v150 = vaddq_s16(v148, v149);
+    int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573);
+    int16x8_t v151 = vaddq_s16(v151_tmp, v150);
+    int16x8_t v152 = vld1q_s16(in + in_stride * 196 + i);
+    int16x8_t v153 = vld1q_s16(in + in_stride * 188 + i);
+    int16x8_t v154 = vaddq_s16(v152, v153);
+    int16x8_t v155 = vaddq_s16(v154, v150);
+    int16x8_t v156 = vaddq_s16(v151, v155);
+    int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734);
+    int16x8_t v158 = vaddq_s16(v147, v157);
+    int16x8_t v159 = vld1q_s16(in + in_stride * 36 + i);
+    int16x8_t v160 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v161 = vaddq_s16(v159, v160);
+    int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573);
+    int16x8_t v162 = vaddq_s16(v162_tmp, v161);
+    int16x8_t v163 = vld1q_s16(in + in_stride * 164 + i);
+    int16x8_t v164 = vld1q_s16(in + in_stride * 156 + i);
+    int16x8_t v165 = vaddq_s16(v163, v164);
+    int16x8_t v166 = vld1q_s16(in + in_stride * 100 + i);
+    int16x8_t v167 = vld1q_s16(in + in_stride * 92 + i);
+    int16x8_t v168 = vaddq_s16(v166, v167);
+    int16x8_t v169 = vaddq_s16(v165, v168);
+    int16x8_t v170 = vaddq_s16(v162, v169);
+    int16x8_t v171 = vaddq_s16(v168, v161);
+    int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573);
+    int16x8_t v172 = vaddq_s16(v172_tmp, v171);
+    int16x8_t v173 = vld1q_s16(in + in_stride * 228 + i);
+    int16x8_t v174 = vld1q_s16(in + in_stride * 220 + i);
+    int16x8_t v175 = vaddq_s16(v173, v174);
+    int16x8_t v176 = vaddq_s16(v175, v165);
+    int16x8_t v177 = vaddq_s16(v176, v171);
+    int16x8_t v178 = vaddq_s16(v172, v177);
+    int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734);
+    int16x8_t v180 = vaddq_s16(v170, v179);
+    int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705);
+    int16x8_t v182 = vaddq_s16(v158, v181);
+    int16x8_t v183 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v184 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v185 = vaddq_s16(v183, v184);
+    int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573);
+    int16x8_t v186 = vaddq_s16(v186_tmp, v185);
+    int16x8_t v187 = vld1q_s16(in + in_stride * 148 + i);
+    int16x8_t v188 = vld1q_s16(in + in_stride * 140 + i);
+    int16x8_t v189 = vaddq_s16(v187, v188);
+    int16x8_t v190 = vld1q_s16(in + in_stride * 116 + i);
+    int16x8_t v191 = vld1q_s16(in + in_stride * 108 + i);
+    int16x8_t v192 = vaddq_s16(v190, v191);
+    int16x8_t v193 = vaddq_s16(v189, v192);
+    int16x8_t v194 = vaddq_s16(v186, v193);
+    int16x8_t v195 = vld1q_s16(in + in_stride * 84 + i);
+    int16x8_t v196 = vld1q_s16(in + in_stride * 76 + i);
+    int16x8_t v197 = vaddq_s16(v195, v196);
+    int16x8_t v198 = vld1q_s16(in + in_stride * 52 + i);
+    int16x8_t v199 = vld1q_s16(in + in_stride * 44 + i);
+    int16x8_t v200 = vaddq_s16(v198, v199);
+    int16x8_t v201 = vaddq_s16(v197, v200);
+    int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573);
+    int16x8_t v202 = vaddq_s16(v202_tmp, v201);
+    int16x8_t v203 = vld1q_s16(in + in_stride * 212 + i);
+    int16x8_t v204 = vld1q_s16(in + in_stride * 204 + i);
+    int16x8_t v205 = vaddq_s16(v203, v204);
+    int16x8_t v206 = vld1q_s16(in + in_stride * 180 + i);
+    int16x8_t v207 = vld1q_s16(in + in_stride * 172 + i);
+    int16x8_t v208 = vaddq_s16(v206, v207);
+    int16x8_t v209 = vaddq_s16(v205, v208);
+    int16x8_t v210 = vaddq_s16(v209, v201);
+    int16x8_t v211 = vaddq_s16(v202, v210);
+    int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734);
+    int16x8_t v213 = vaddq_s16(v194, v212);
+    int16x8_t v214 = vaddq_s16(v200, v185);
+    int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573);
+    int16x8_t v215 = vaddq_s16(v215_tmp, v214);
+    int16x8_t v216 = vaddq_s16(v208, v189);
+    int16x8_t v217 = vaddq_s16(v192, v197);
+    int16x8_t v218 = vaddq_s16(v216, v217);
+    int16x8_t v219 = vaddq_s16(v215, v218);
+    int16x8_t v220 = vaddq_s16(v217, v214);
+    int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573);
+    int16x8_t v221 = vaddq_s16(v221_tmp, v220);
+    int16x8_t v222 = vld1q_s16(in + in_stride * 244 + i);
+    int16x8_t v223 = vld1q_s16(in + in_stride * 236 + i);
+    int16x8_t v224 = vaddq_s16(v222, v223);
+    int16x8_t v225 = vaddq_s16(v224, v205);
+    int16x8_t v226 = vaddq_s16(v225, v216);
+    int16x8_t v227 = vaddq_s16(v226, v220);
+    int16x8_t v228 = vaddq_s16(v221, v227);
+    int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734);
+    int16x8_t v230 = vaddq_s16(v219, v229);
+    int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705);
+    int16x8_t v232 = vaddq_s16(v213, v231);
+    int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463);
+    int16x8_t v234 = vaddq_s16(v182, v233);
+    int16x8_t v235 = vaddq_s16(v184, v142);
+    int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573);
+    int16x8_t v236 = vaddq_s16(v236_tmp, v235);
+    int16x8_t v237 = vaddq_s16(v188, v144);
+    int16x8_t v238 = vaddq_s16(v145, v190);
+    int16x8_t v239 = vaddq_s16(v237, v238);
+    int16x8_t v240 = vaddq_s16(v236, v239);
+    int16x8_t v241 = vaddq_s16(v196, v148);
+    int16x8_t v242 = vaddq_s16(v149, v198);
+    int16x8_t v243 = vaddq_s16(v241, v242);
+    int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573);
+    int16x8_t v244 = vaddq_s16(v244_tmp, v243);
+    int16x8_t v245 = vaddq_s16(v204, v152);
+    int16x8_t v246 = vaddq_s16(v153, v206);
+    int16x8_t v247 = vaddq_s16(v245, v246);
+    int16x8_t v248 = vaddq_s16(v247, v243);
+    int16x8_t v249 = vaddq_s16(v244, v248);
+    int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734);
+    int16x8_t v251 = vaddq_s16(v240, v250);
+    int16x8_t v252 = vaddq_s16(v199, v159);
+    int16x8_t v253 = vaddq_s16(v160, v183);
+    int16x8_t v254 = vaddq_s16(v252, v253);
+    int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573);
+    int16x8_t v255 = vaddq_s16(v255_tmp, v254);
+    int16x8_t v256 = vaddq_s16(v207, v163);
+    int16x8_t v257 = vaddq_s16(v164, v187);
+    int16x8_t v258 = vaddq_s16(v256, v257);
+    int16x8_t v259 = vaddq_s16(v191, v166);
+    int16x8_t v260 = vaddq_s16(v167, v195);
+    int16x8_t v261 = vaddq_s16(v259, v260);
+    int16x8_t v262 = vaddq_s16(v258, v261);
+    int16x8_t v263 = vaddq_s16(v255, v262);
+    int16x8_t v264 = vaddq_s16(v261, v254);
+    int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573);
+    int16x8_t v265 = vaddq_s16(v265_tmp, v264);
+    int16x8_t v266 = vaddq_s16(v223, v173);
+    int16x8_t v267 = vaddq_s16(v174, v203);
+    int16x8_t v268 = vaddq_s16(v266, v267);
+    int16x8_t v269 = vaddq_s16(v268, v258);
+    int16x8_t v270 = vaddq_s16(v269, v264);
+    int16x8_t v271 = vaddq_s16(v265, v270);
+    int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734);
+    int16x8_t v273 = vaddq_s16(v263, v272);
+    int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705);
+    int16x8_t v275 = vaddq_s16(v251, v274);
+    int16x8_t v276 = vaddq_s16(v253, v235);
+    int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573);
+    int16x8_t v277 = vaddq_s16(v277_tmp, v276);
+    int16x8_t v278 = vaddq_s16(v257, v237);
+    int16x8_t v279 = vaddq_s16(v238, v259);
+    int16x8_t v280 = vaddq_s16(v278, v279);
+    int16x8_t v281 = vaddq_s16(v277, v280);
+    int16x8_t v282 = vaddq_s16(v260, v241);
+    int16x8_t v283 = vaddq_s16(v242, v252);
+    int16x8_t v284 = vaddq_s16(v282, v283);
+    int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573);
+    int16x8_t v285 = vaddq_s16(v285_tmp, v284);
+    int16x8_t v286 = vaddq_s16(v267, v245);
+    int16x8_t v287 = vaddq_s16(v246, v256);
+    int16x8_t v288 = vaddq_s16(v286, v287);
+    int16x8_t v289 = vaddq_s16(v288, v284);
+    int16x8_t v290 = vaddq_s16(v285, v289);
+    int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734);
+    int16x8_t v292 = vaddq_s16(v281, v291);
+    int16x8_t v293 = vaddq_s16(v283, v276);
+    int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573);
+    int16x8_t v294 = vaddq_s16(v294_tmp, v293);
+    int16x8_t v295 = vaddq_s16(v287, v278);
+    int16x8_t v296 = vaddq_s16(v279, v282);
+    int16x8_t v297 = vaddq_s16(v295, v296);
+    int16x8_t v298 = vaddq_s16(v294, v297);
+    int16x8_t v299 = vaddq_s16(v296, v293);
+    int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573);
+    int16x8_t v300 = vaddq_s16(v300_tmp, v299);
+    int16x8_t v301 = vld1q_s16(in + in_stride * 252 + i);
+    int16x8_t v302 = vaddq_s16(v301, v222);
+    int16x8_t v303 = vaddq_s16(v302, v266);
+    int16x8_t v304 = vaddq_s16(v303, v286);
+    int16x8_t v305 = vaddq_s16(v304, v295);
+    int16x8_t v306 = vaddq_s16(v305, v299);
+    int16x8_t v307 = vaddq_s16(v300, v306);
+    int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734);
+    int16x8_t v309 = vaddq_s16(v298, v308);
+    int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705);
+    int16x8_t v311 = vaddq_s16(v292, v310);
+    int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463);
+    int16x8_t v313 = vaddq_s16(v275, v312);
+    int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404);
+    int16x8_t v315 = vaddq_s16(v234, v314);
+    int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389);
+    int16x8_t v317 = vaddq_s16(v141, v316);
+    int16x8_t v318 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573);
+    int16x8_t v319 = vaddq_s16(v319_tmp, v318);
+    int16x8_t v320 = vld1q_s16(in + in_stride * 130 + i);
+    int16x8_t v321 = vld1q_s16(in + in_stride * 126 + i);
+    int16x8_t v322 = vaddq_s16(v320, v321);
+    int16x8_t v323 = vaddq_s16(v319, v322);
+    int16x8_t v324 = vld1q_s16(in + in_stride * 66 + i);
+    int16x8_t v325 = vld1q_s16(in + in_stride * 62 + i);
+    int16x8_t v326 = vaddq_s16(v324, v325);
+    int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573);
+    int16x8_t v327 = vaddq_s16(v327_tmp, v326);
+    int16x8_t v328 = vld1q_s16(in + in_stride * 194 + i);
+    int16x8_t v329 = vld1q_s16(in + in_stride * 190 + i);
+    int16x8_t v330 = vaddq_s16(v328, v329);
+    int16x8_t v331 = vaddq_s16(v330, v326);
+    int16x8_t v332 = vaddq_s16(v327, v331);
+    int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734);
+    int16x8_t v334 = vaddq_s16(v323, v333);
+    int16x8_t v335 = vld1q_s16(in + in_stride * 34 + i);
+    int16x8_t v336 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v337 = vaddq_s16(v335, v336);
+    int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573);
+    int16x8_t v338 = vaddq_s16(v338_tmp, v337);
+    int16x8_t v339 = vld1q_s16(in + in_stride * 162 + i);
+    int16x8_t v340 = vld1q_s16(in + in_stride * 158 + i);
+    int16x8_t v341 = vaddq_s16(v339, v340);
+    int16x8_t v342 = vld1q_s16(in + in_stride * 98 + i);
+    int16x8_t v343 = vld1q_s16(in + in_stride * 94 + i);
+    int16x8_t v344 = vaddq_s16(v342, v343);
+    int16x8_t v345 = vaddq_s16(v341, v344);
+    int16x8_t v346 = vaddq_s16(v338, v345);
+    int16x8_t v347 = vaddq_s16(v344, v337);
+    int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573);
+    int16x8_t v348 = vaddq_s16(v348_tmp, v347);
+    int16x8_t v349 = vld1q_s16(in + in_stride * 226 + i);
+    int16x8_t v350 = vld1q_s16(in + in_stride * 222 + i);
+    int16x8_t v351 = vaddq_s16(v349, v350);
+    int16x8_t v352 = vaddq_s16(v351, v341);
+    int16x8_t v353 = vaddq_s16(v352, v347);
+    int16x8_t v354 = vaddq_s16(v348, v353);
+    int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734);
+    int16x8_t v356 = vaddq_s16(v346, v355);
+    int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705);
+    int16x8_t v358 = vaddq_s16(v334, v357);
+    int16x8_t v359 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v360 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v361 = vaddq_s16(v359, v360);
+    int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573);
+    int16x8_t v362 = vaddq_s16(v362_tmp, v361);
+    int16x8_t v363 = vld1q_s16(in + in_stride * 146 + i);
+    int16x8_t v364 = vld1q_s16(in + in_stride * 142 + i);
+    int16x8_t v365 = vaddq_s16(v363, v364);
+    int16x8_t v366 = vld1q_s16(in + in_stride * 114 + i);
+    int16x8_t v367 = vld1q_s16(in + in_stride * 110 + i);
+    int16x8_t v368 = vaddq_s16(v366, v367);
+    int16x8_t v369 = vaddq_s16(v365, v368);
+    int16x8_t v370 = vaddq_s16(v362, v369);
+    int16x8_t v371 = vld1q_s16(in + in_stride * 82 + i);
+    int16x8_t v372 = vld1q_s16(in + in_stride * 78 + i);
+    int16x8_t v373 = vaddq_s16(v371, v372);
+    int16x8_t v374 = vld1q_s16(in + in_stride * 50 + i);
+    int16x8_t v375 = vld1q_s16(in + in_stride * 46 + i);
+    int16x8_t v376 = vaddq_s16(v374, v375);
+    int16x8_t v377 = vaddq_s16(v373, v376);
+    int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573);
+    int16x8_t v378 = vaddq_s16(v378_tmp, v377);
+    int16x8_t v379 = vld1q_s16(in + in_stride * 210 + i);
+    int16x8_t v380 = vld1q_s16(in + in_stride * 206 + i);
+    int16x8_t v381 = vaddq_s16(v379, v380);
+    int16x8_t v382 = vld1q_s16(in + in_stride * 178 + i);
+    int16x8_t v383 = vld1q_s16(in + in_stride * 174 + i);
+    int16x8_t v384 = vaddq_s16(v382, v383);
+    int16x8_t v385 = vaddq_s16(v381, v384);
+    int16x8_t v386 = vaddq_s16(v385, v377);
+    int16x8_t v387 = vaddq_s16(v378, v386);
+    int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734);
+    int16x8_t v389 = vaddq_s16(v370, v388);
+    int16x8_t v390 = vaddq_s16(v376, v361);
+    int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573);
+    int16x8_t v391 = vaddq_s16(v391_tmp, v390);
+    int16x8_t v392 = vaddq_s16(v384, v365);
+    int16x8_t v393 = vaddq_s16(v368, v373);
+    int16x8_t v394 = vaddq_s16(v392, v393);
+    int16x8_t v395 = vaddq_s16(v391, v394);
+    int16x8_t v396 = vaddq_s16(v393, v390);
+    int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573);
+    int16x8_t v397 = vaddq_s16(v397_tmp, v396);
+    int16x8_t v398 = vld1q_s16(in + in_stride * 242 + i);
+    int16x8_t v399 = vld1q_s16(in + in_stride * 238 + i);
+    int16x8_t v400 = vaddq_s16(v398, v399);
+    int16x8_t v401 = vaddq_s16(v400, v381);
+    int16x8_t v402 = vaddq_s16(v401, v392);
+    int16x8_t v403 = vaddq_s16(v402, v396);
+    int16x8_t v404 = vaddq_s16(v397, v403);
+    int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734);
+    int16x8_t v406 = vaddq_s16(v395, v405);
+    int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705);
+    int16x8_t v408 = vaddq_s16(v389, v407);
+    int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463);
+    int16x8_t v410 = vaddq_s16(v358, v409);
+    int16x8_t v411 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v412 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v413 = vaddq_s16(v411, v412);
+    int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573);
+    int16x8_t v414 = vaddq_s16(v414_tmp, v413);
+    int16x8_t v415 = vld1q_s16(in + in_stride * 138 + i);
+    int16x8_t v416 = vld1q_s16(in + in_stride * 134 + i);
+    int16x8_t v417 = vaddq_s16(v415, v416);
+    int16x8_t v418 = vld1q_s16(in + in_stride * 122 + i);
+    int16x8_t v419 = vld1q_s16(in + in_stride * 118 + i);
+    int16x8_t v420 = vaddq_s16(v418, v419);
+    int16x8_t v421 = vaddq_s16(v417, v420);
+    int16x8_t v422 = vaddq_s16(v414, v421);
+    int16x8_t v423 = vld1q_s16(in + in_stride * 74 + i);
+    int16x8_t v424 = vld1q_s16(in + in_stride * 70 + i);
+    int16x8_t v425 = vaddq_s16(v423, v424);
+    int16x8_t v426 = vld1q_s16(in + in_stride * 58 + i);
+    int16x8_t v427 = vld1q_s16(in + in_stride * 54 + i);
+    int16x8_t v428 = vaddq_s16(v426, v427);
+    int16x8_t v429 = vaddq_s16(v425, v428);
+    int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573);
+    int16x8_t v430 = vaddq_s16(v430_tmp, v429);
+    int16x8_t v431 = vld1q_s16(in + in_stride * 202 + i);
+    int16x8_t v432 = vld1q_s16(in + in_stride * 198 + i);
+    int16x8_t v433 = vaddq_s16(v431, v432);
+    int16x8_t v434 = vld1q_s16(in + in_stride * 186 + i);
+    int16x8_t v435 = vld1q_s16(in + in_stride * 182 + i);
+    int16x8_t v436 = vaddq_s16(v434, v435);
+    int16x8_t v437 = vaddq_s16(v433, v436);
+    int16x8_t v438 = vaddq_s16(v437, v429);
+    int16x8_t v439 = vaddq_s16(v430, v438);
+    int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734);
+    int16x8_t v441 = vaddq_s16(v422, v440);
+    int16x8_t v442 = vld1q_s16(in + in_stride * 42 + i);
+    int16x8_t v443 = vld1q_s16(in + in_stride * 38 + i);
+    int16x8_t v444 = vaddq_s16(v442, v443);
+    int16x8_t v445 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v446 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v447 = vaddq_s16(v445, v446);
+    int16x8_t v448 = vaddq_s16(v444, v447);
+    int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573);
+    int16x8_t v449 = vaddq_s16(v449_tmp, v448);
+    int16x8_t v450 = vld1q_s16(in + in_stride * 170 + i);
+    int16x8_t v451 = vld1q_s16(in + in_stride * 166 + i);
+    int16x8_t v452 = vaddq_s16(v450, v451);
+    int16x8_t v453 = vld1q_s16(in + in_stride * 154 + i);
+    int16x8_t v454 = vld1q_s16(in + in_stride * 150 + i);
+    int16x8_t v455 = vaddq_s16(v453, v454);
+    int16x8_t v456 = vaddq_s16(v452, v455);
+    int16x8_t v457 = vld1q_s16(in + in_stride * 106 + i);
+    int16x8_t v458 = vld1q_s16(in + in_stride * 102 + i);
+    int16x8_t v459 = vaddq_s16(v457, v458);
+    int16x8_t v460 = vld1q_s16(in + in_stride * 90 + i);
+    int16x8_t v461 = vld1q_s16(in + in_stride * 86 + i);
+    int16x8_t v462 = vaddq_s16(v460, v461);
+    int16x8_t v463 = vaddq_s16(v459, v462);
+    int16x8_t v464 = vaddq_s16(v456, v463);
+    int16x8_t v465 = vaddq_s16(v449, v464);
+    int16x8_t v466 = vaddq_s16(v463, v448);
+    int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573);
+    int16x8_t v467 = vaddq_s16(v467_tmp, v466);
+    int16x8_t v468 = vld1q_s16(in + in_stride * 234 + i);
+    int16x8_t v469 = vld1q_s16(in + in_stride * 230 + i);
+    int16x8_t v470 = vaddq_s16(v468, v469);
+    int16x8_t v471 = vld1q_s16(in + in_stride * 218 + i);
+    int16x8_t v472 = vld1q_s16(in + in_stride * 214 + i);
+    int16x8_t v473 = vaddq_s16(v471, v472);
+    int16x8_t v474 = vaddq_s16(v470, v473);
+    int16x8_t v475 = vaddq_s16(v474, v456);
+    int16x8_t v476 = vaddq_s16(v475, v466);
+    int16x8_t v477 = vaddq_s16(v467, v476);
+    int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734);
+    int16x8_t v479 = vaddq_s16(v465, v478);
+    int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705);
+    int16x8_t v481 = vaddq_s16(v441, v480);
+    int16x8_t v482 = vaddq_s16(v447, v413);
+    int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573);
+    int16x8_t v483 = vaddq_s16(v483_tmp, v482);
+    int16x8_t v484 = vaddq_s16(v455, v417);
+    int16x8_t v485 = vaddq_s16(v420, v459);
+    int16x8_t v486 = vaddq_s16(v484, v485);
+    int16x8_t v487 = vaddq_s16(v483, v486);
+    int16x8_t v488 = vaddq_s16(v462, v425);
+    int16x8_t v489 = vaddq_s16(v428, v444);
+    int16x8_t v490 = vaddq_s16(v488, v489);
+    int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573);
+    int16x8_t v491 = vaddq_s16(v491_tmp, v490);
+    int16x8_t v492 = vaddq_s16(v473, v433);
+    int16x8_t v493 = vaddq_s16(v436, v452);
+    int16x8_t v494 = vaddq_s16(v492, v493);
+    int16x8_t v495 = vaddq_s16(v494, v490);
+    int16x8_t v496 = vaddq_s16(v491, v495);
+    int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734);
+    int16x8_t v498 = vaddq_s16(v487, v497);
+    int16x8_t v499 = vaddq_s16(v489, v482);
+    int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573);
+    int16x8_t v500 = vaddq_s16(v500_tmp, v499);
+    int16x8_t v501 = vaddq_s16(v493, v484);
+    int16x8_t v502 = vaddq_s16(v485, v488);
+    int16x8_t v503 = vaddq_s16(v501, v502);
+    int16x8_t v504 = vaddq_s16(v500, v503);
+    int16x8_t v505 = vaddq_s16(v502, v499);
+    int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573);
+    int16x8_t v506 = vaddq_s16(v506_tmp, v505);
+    int16x8_t v507 = vld1q_s16(in + in_stride * 250 + i);
+    int16x8_t v508 = vld1q_s16(in + in_stride * 246 + i);
+    int16x8_t v509 = vaddq_s16(v507, v508);
+    int16x8_t v510 = vaddq_s16(v509, v470);
+    int16x8_t v511 = vaddq_s16(v510, v492);
+    int16x8_t v512 = vaddq_s16(v511, v501);
+    int16x8_t v513 = vaddq_s16(v512, v505);
+    int16x8_t v514 = vaddq_s16(v506, v513);
+    int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734);
+    int16x8_t v516 = vaddq_s16(v504, v515);
+    int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705);
+    int16x8_t v518 = vaddq_s16(v498, v517);
+    int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463);
+    int16x8_t v520 = vaddq_s16(v481, v519);
+    int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404);
+    int16x8_t v522 = vaddq_s16(v410, v521);
+    int16x8_t v523 = vaddq_s16(v412, v318);
+    int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573);
+    int16x8_t v524 = vaddq_s16(v524_tmp, v523);
+    int16x8_t v525 = vaddq_s16(v416, v320);
+    int16x8_t v526 = vaddq_s16(v321, v418);
+    int16x8_t v527 = vaddq_s16(v525, v526);
+    int16x8_t v528 = vaddq_s16(v524, v527);
+    int16x8_t v529 = vaddq_s16(v424, v324);
+    int16x8_t v530 = vaddq_s16(v325, v426);
+    int16x8_t v531 = vaddq_s16(v529, v530);
+    int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573);
+    int16x8_t v532 = vaddq_s16(v532_tmp, v531);
+    int16x8_t v533 = vaddq_s16(v432, v328);
+    int16x8_t v534 = vaddq_s16(v329, v434);
+    int16x8_t v535 = vaddq_s16(v533, v534);
+    int16x8_t v536 = vaddq_s16(v535, v531);
+    int16x8_t v537 = vaddq_s16(v532, v536);
+    int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734);
+    int16x8_t v539 = vaddq_s16(v528, v538);
+    int16x8_t v540 = vaddq_s16(v443, v335);
+    int16x8_t v541 = vaddq_s16(v336, v445);
+    int16x8_t v542 = vaddq_s16(v540, v541);
+    int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573);
+    int16x8_t v543 = vaddq_s16(v543_tmp, v542);
+    int16x8_t v544 = vaddq_s16(v451, v339);
+    int16x8_t v545 = vaddq_s16(v340, v453);
+    int16x8_t v546 = vaddq_s16(v544, v545);
+    int16x8_t v547 = vaddq_s16(v458, v342);
+    int16x8_t v548 = vaddq_s16(v343, v460);
+    int16x8_t v549 = vaddq_s16(v547, v548);
+    int16x8_t v550 = vaddq_s16(v546, v549);
+    int16x8_t v551 = vaddq_s16(v543, v550);
+    int16x8_t v552 = vaddq_s16(v549, v542);
+    int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573);
+    int16x8_t v553 = vaddq_s16(v553_tmp, v552);
+    int16x8_t v554 = vaddq_s16(v469, v349);
+    int16x8_t v555 = vaddq_s16(v350, v471);
+    int16x8_t v556 = vaddq_s16(v554, v555);
+    int16x8_t v557 = vaddq_s16(v556, v546);
+    int16x8_t v558 = vaddq_s16(v557, v552);
+    int16x8_t v559 = vaddq_s16(v553, v558);
+    int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734);
+    int16x8_t v561 = vaddq_s16(v551, v560);
+    int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705);
+    int16x8_t v563 = vaddq_s16(v539, v562);
+    int16x8_t v564 = vaddq_s16(v446, v359);
+    int16x8_t v565 = vaddq_s16(v360, v411);
+    int16x8_t v566 = vaddq_s16(v564, v565);
+    int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573);
+    int16x8_t v567 = vaddq_s16(v567_tmp, v566);
+    int16x8_t v568 = vaddq_s16(v454, v363);
+    int16x8_t v569 = vaddq_s16(v364, v415);
+    int16x8_t v570 = vaddq_s16(v568, v569);
+    int16x8_t v571 = vaddq_s16(v419, v366);
+    int16x8_t v572 = vaddq_s16(v367, v457);
+    int16x8_t v573 = vaddq_s16(v571, v572);
+    int16x8_t v574 = vaddq_s16(v570, v573);
+    int16x8_t v575 = vaddq_s16(v567, v574);
+    int16x8_t v576 = vaddq_s16(v461, v371);
+    int16x8_t v577 = vaddq_s16(v372, v423);
+    int16x8_t v578 = vaddq_s16(v576, v577);
+    int16x8_t v579 = vaddq_s16(v427, v374);
+    int16x8_t v580 = vaddq_s16(v375, v442);
+    int16x8_t v581 = vaddq_s16(v579, v580);
+    int16x8_t v582 = vaddq_s16(v578, v581);
+    int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573);
+    int16x8_t v583 = vaddq_s16(v583_tmp, v582);
+    int16x8_t v584 = vaddq_s16(v472, v379);
+    int16x8_t v585 = vaddq_s16(v380, v431);
+    int16x8_t v586 = vaddq_s16(v584, v585);
+    int16x8_t v587 = vaddq_s16(v435, v382);
+    int16x8_t v588 = vaddq_s16(v383, v450);
+    int16x8_t v589 = vaddq_s16(v587, v588);
+    int16x8_t v590 = vaddq_s16(v586, v589);
+    int16x8_t v591 = vaddq_s16(v590, v582);
+    int16x8_t v592 = vaddq_s16(v583, v591);
+    int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734);
+    int16x8_t v594 = vaddq_s16(v575, v593);
+    int16x8_t v595 = vaddq_s16(v581, v566);
+    int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573);
+    int16x8_t v596 = vaddq_s16(v596_tmp, v595);
+    int16x8_t v597 = vaddq_s16(v589, v570);
+    int16x8_t v598 = vaddq_s16(v573, v578);
+    int16x8_t v599 = vaddq_s16(v597, v598);
+    int16x8_t v600 = vaddq_s16(v596, v599);
+    int16x8_t v601 = vaddq_s16(v598, v595);
+    int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573);
+    int16x8_t v602 = vaddq_s16(v602_tmp, v601);
+    int16x8_t v603 = vaddq_s16(v508, v398);
+    int16x8_t v604 = vaddq_s16(v399, v468);
+    int16x8_t v605 = vaddq_s16(v603, v604);
+    int16x8_t v606 = vaddq_s16(v605, v586);
+    int16x8_t v607 = vaddq_s16(v606, v597);
+    int16x8_t v608 = vaddq_s16(v607, v601);
+    int16x8_t v609 = vaddq_s16(v602, v608);
+    int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734);
+    int16x8_t v611 = vaddq_s16(v600, v610);
+    int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705);
+    int16x8_t v613 = vaddq_s16(v594, v612);
+    int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463);
+    int16x8_t v615 = vaddq_s16(v563, v614);
+    int16x8_t v616 = vaddq_s16(v565, v523);
+    int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573);
+    int16x8_t v617 = vaddq_s16(v617_tmp, v616);
+    int16x8_t v618 = vaddq_s16(v569, v525);
+    int16x8_t v619 = vaddq_s16(v526, v571);
+    int16x8_t v620 = vaddq_s16(v618, v619);
+    int16x8_t v621 = vaddq_s16(v617, v620);
+    int16x8_t v622 = vaddq_s16(v577, v529);
+    int16x8_t v623 = vaddq_s16(v530, v579);
+    int16x8_t v624 = vaddq_s16(v622, v623);
+    int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573);
+    int16x8_t v625 = vaddq_s16(v625_tmp, v624);
+    int16x8_t v626 = vaddq_s16(v585, v533);
+    int16x8_t v627 = vaddq_s16(v534, v587);
+    int16x8_t v628 = vaddq_s16(v626, v627);
+    int16x8_t v629 = vaddq_s16(v628, v624);
+    int16x8_t v630 = vaddq_s16(v625, v629);
+    int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734);
+    int16x8_t v632 = vaddq_s16(v621, v631);
+    int16x8_t v633 = vaddq_s16(v580, v540);
+    int16x8_t v634 = vaddq_s16(v541, v564);
+    int16x8_t v635 = vaddq_s16(v633, v634);
+    int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573);
+    int16x8_t v636 = vaddq_s16(v636_tmp, v635);
+    int16x8_t v637 = vaddq_s16(v588, v544);
+    int16x8_t v638 = vaddq_s16(v545, v568);
+    int16x8_t v639 = vaddq_s16(v637, v638);
+    int16x8_t v640 = vaddq_s16(v572, v547);
+    int16x8_t v641 = vaddq_s16(v548, v576);
+    int16x8_t v642 = vaddq_s16(v640, v641);
+    int16x8_t v643 = vaddq_s16(v639, v642);
+    int16x8_t v644 = vaddq_s16(v636, v643);
+    int16x8_t v645 = vaddq_s16(v642, v635);
+    int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573);
+    int16x8_t v646 = vaddq_s16(v646_tmp, v645);
+    int16x8_t v647 = vaddq_s16(v604, v554);
+    int16x8_t v648 = vaddq_s16(v555, v584);
+    int16x8_t v649 = vaddq_s16(v647, v648);
+    int16x8_t v650 = vaddq_s16(v649, v639);
+    int16x8_t v651 = vaddq_s16(v650, v645);
+    int16x8_t v652 = vaddq_s16(v646, v651);
+    int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734);
+    int16x8_t v654 = vaddq_s16(v644, v653);
+    int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705);
+    int16x8_t v656 = vaddq_s16(v632, v655);
+    int16x8_t v657 = vaddq_s16(v634, v616);
+    int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573);
+    int16x8_t v658 = vaddq_s16(v658_tmp, v657);
+    int16x8_t v659 = vaddq_s16(v638, v618);
+    int16x8_t v660 = vaddq_s16(v619, v640);
+    int16x8_t v661 = vaddq_s16(v659, v660);
+    int16x8_t v662 = vaddq_s16(v658, v661);
+    int16x8_t v663 = vaddq_s16(v641, v622);
+    int16x8_t v664 = vaddq_s16(v623, v633);
+    int16x8_t v665 = vaddq_s16(v663, v664);
+    int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573);
+    int16x8_t v666 = vaddq_s16(v666_tmp, v665);
+    int16x8_t v667 = vaddq_s16(v648, v626);
+    int16x8_t v668 = vaddq_s16(v627, v637);
+    int16x8_t v669 = vaddq_s16(v667, v668);
+    int16x8_t v670 = vaddq_s16(v669, v665);
+    int16x8_t v671 = vaddq_s16(v666, v670);
+    int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734);
+    int16x8_t v673 = vaddq_s16(v662, v672);
+    int16x8_t v674 = vaddq_s16(v664, v657);
+    int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573);
+    int16x8_t v675 = vaddq_s16(v675_tmp, v674);
+    int16x8_t v676 = vaddq_s16(v668, v659);
+    int16x8_t v677 = vaddq_s16(v660, v663);
+    int16x8_t v678 = vaddq_s16(v676, v677);
+    int16x8_t v679 = vaddq_s16(v675, v678);
+    int16x8_t v680 = vaddq_s16(v677, v674);
+    int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573);
+    int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+    int16x8_t v682 = vld1q_s16(in + in_stride * 254 + i);
+    int16x8_t v683 = vaddq_s16(v682, v507);
+    int16x8_t v684 = vaddq_s16(v683, v603);
+    int16x8_t v685 = vaddq_s16(v684, v647);
+    int16x8_t v686 = vaddq_s16(v685, v667);
+    int16x8_t v687 = vaddq_s16(v686, v676);
+    int16x8_t v688 = vaddq_s16(v687, v680);
+    int16x8_t v689 = vaddq_s16(v681, v688);
+    int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734);
+    int16x8_t v691 = vaddq_s16(v679, v690);
+    int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705);
+    int16x8_t v693 = vaddq_s16(v673, v692);
+    int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463);
+    int16x8_t v695 = vaddq_s16(v656, v694);
+    int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404);
+    int16x8_t v697 = vaddq_s16(v615, v696);
+    int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389);
+    int16x8_t v699 = vaddq_s16(v522, v698);
+    int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385);
+    int16x8_t v701 = vaddq_s16(v317, v700);
+    int16x8_t v702 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 13573);
+    int16x8_t v703 = vaddq_s16(v703_tmp, v702);
+    int16x8_t v704 = vld1q_s16(in + in_stride * 129 + i);
+    int16x8_t v705 = vld1q_s16(in + in_stride * 127 + i);
+    int16x8_t v706 = vaddq_s16(v704, v705);
+    int16x8_t v707 = vaddq_s16(v703, v706);
+    int16x8_t v708 = vld1q_s16(in + in_stride * 65 + i);
+    int16x8_t v709 = vld1q_s16(in + in_stride * 63 + i);
+    int16x8_t v710 = vaddq_s16(v708, v709);
+    int16x8_t v711_tmp = vqrdmulhq_n_s16(v710, 13573);
+    int16x8_t v711 = vaddq_s16(v711_tmp, v710);
+    int16x8_t v712 = vld1q_s16(in + in_stride * 193 + i);
+    int16x8_t v713 = vld1q_s16(in + in_stride * 191 + i);
+    int16x8_t v714 = vaddq_s16(v712, v713);
+    int16x8_t v715 = vaddq_s16(v714, v710);
+    int16x8_t v716 = vaddq_s16(v711, v715);
+    int16x8_t v717 = vqrdmulhq_n_s16(v716, 17734);
+    int16x8_t v718 = vaddq_s16(v707, v717);
+    int16x8_t v719 = vld1q_s16(in + in_stride * 33 + i);
+    int16x8_t v720 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v721 = vaddq_s16(v719, v720);
+    int16x8_t v722_tmp = vqrdmulhq_n_s16(v721, 13573);
+    int16x8_t v722 = vaddq_s16(v722_tmp, v721);
+    int16x8_t v723 = vld1q_s16(in + in_stride * 161 + i);
+    int16x8_t v724 = vld1q_s16(in + in_stride * 159 + i);
+    int16x8_t v725 = vaddq_s16(v723, v724);
+    int16x8_t v726 = vld1q_s16(in + in_stride * 97 + i);
+    int16x8_t v727 = vld1q_s16(in + in_stride * 95 + i);
+    int16x8_t v728 = vaddq_s16(v726, v727);
+    int16x8_t v729 = vaddq_s16(v725, v728);
+    int16x8_t v730 = vaddq_s16(v722, v729);
+    int16x8_t v731 = vaddq_s16(v728, v721);
+    int16x8_t v732_tmp = vqrdmulhq_n_s16(v731, 13573);
+    int16x8_t v732 = vaddq_s16(v732_tmp, v731);
+    int16x8_t v733 = vld1q_s16(in + in_stride * 225 + i);
+    int16x8_t v734 = vld1q_s16(in + in_stride * 223 + i);
+    int16x8_t v735 = vaddq_s16(v733, v734);
+    int16x8_t v736 = vaddq_s16(v735, v725);
+    int16x8_t v737 = vaddq_s16(v736, v731);
+    int16x8_t v738 = vaddq_s16(v732, v737);
+    int16x8_t v739 = vqrdmulhq_n_s16(v738, 17734);
+    int16x8_t v740 = vaddq_s16(v730, v739);
+    int16x8_t v741 = vqrdmulhq_n_s16(v740, 16705);
+    int16x8_t v742 = vaddq_s16(v718, v741);
+    int16x8_t v743 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v744 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v745 = vaddq_s16(v743, v744);
+    int16x8_t v746_tmp = vqrdmulhq_n_s16(v745, 13573);
+    int16x8_t v746 = vaddq_s16(v746_tmp, v745);
+    int16x8_t v747 = vld1q_s16(in + in_stride * 145 + i);
+    int16x8_t v748 = vld1q_s16(in + in_stride * 143 + i);
+    int16x8_t v749 = vaddq_s16(v747, v748);
+    int16x8_t v750 = vld1q_s16(in + in_stride * 113 + i);
+    int16x8_t v751 = vld1q_s16(in + in_stride * 111 + i);
+    int16x8_t v752 = vaddq_s16(v750, v751);
+    int16x8_t v753 = vaddq_s16(v749, v752);
+    int16x8_t v754 = vaddq_s16(v746, v753);
+    int16x8_t v755 = vld1q_s16(in + in_stride * 81 + i);
+    int16x8_t v756 = vld1q_s16(in + in_stride * 79 + i);
+    int16x8_t v757 = vaddq_s16(v755, v756);
+    int16x8_t v758 = vld1q_s16(in + in_stride * 49 + i);
+    int16x8_t v759 = vld1q_s16(in + in_stride * 47 + i);
+    int16x8_t v760 = vaddq_s16(v758, v759);
+    int16x8_t v761 = vaddq_s16(v757, v760);
+    int16x8_t v762_tmp = vqrdmulhq_n_s16(v761, 13573);
+    int16x8_t v762 = vaddq_s16(v762_tmp, v761);
+    int16x8_t v763 = vld1q_s16(in + in_stride * 209 + i);
+    int16x8_t v764 = vld1q_s16(in + in_stride * 207 + i);
+    int16x8_t v765 = vaddq_s16(v763, v764);
+    int16x8_t v766 = vld1q_s16(in + in_stride * 177 + i);
+    int16x8_t v767 = vld1q_s16(in + in_stride * 175 + i);
+    int16x8_t v768 = vaddq_s16(v766, v767);
+    int16x8_t v769 = vaddq_s16(v765, v768);
+    int16x8_t v770 = vaddq_s16(v769, v761);
+    int16x8_t v771 = vaddq_s16(v762, v770);
+    int16x8_t v772 = vqrdmulhq_n_s16(v771, 17734);
+    int16x8_t v773 = vaddq_s16(v754, v772);
+    int16x8_t v774 = vaddq_s16(v760, v745);
+    int16x8_t v775_tmp = vqrdmulhq_n_s16(v774, 13573);
+    int16x8_t v775 = vaddq_s16(v775_tmp, v774);
+    int16x8_t v776 = vaddq_s16(v768, v749);
+    int16x8_t v777 = vaddq_s16(v752, v757);
+    int16x8_t v778 = vaddq_s16(v776, v777);
+    int16x8_t v779 = vaddq_s16(v775, v778);
+    int16x8_t v780 = vaddq_s16(v777, v774);
+    int16x8_t v781_tmp = vqrdmulhq_n_s16(v780, 13573);
+    int16x8_t v781 = vaddq_s16(v781_tmp, v780);
+    int16x8_t v782 = vld1q_s16(in + in_stride * 241 + i);
+    int16x8_t v783 = vld1q_s16(in + in_stride * 239 + i);
+    int16x8_t v784 = vaddq_s16(v782, v783);
+    int16x8_t v785 = vaddq_s16(v784, v765);
+    int16x8_t v786 = vaddq_s16(v785, v776);
+    int16x8_t v787 = vaddq_s16(v786, v780);
+    int16x8_t v788 = vaddq_s16(v781, v787);
+    int16x8_t v789 = vqrdmulhq_n_s16(v788, 17734);
+    int16x8_t v790 = vaddq_s16(v779, v789);
+    int16x8_t v791 = vqrdmulhq_n_s16(v790, 16705);
+    int16x8_t v792 = vaddq_s16(v773, v791);
+    int16x8_t v793 = vqrdmulhq_n_s16(v792, 16463);
+    int16x8_t v794 = vaddq_s16(v742, v793);
+    int16x8_t v795 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v796 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v797 = vaddq_s16(v795, v796);
+    int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 13573);
+    int16x8_t v798 = vaddq_s16(v798_tmp, v797);
+    int16x8_t v799 = vld1q_s16(in + in_stride * 137 + i);
+    int16x8_t v800 = vld1q_s16(in + in_stride * 135 + i);
+    int16x8_t v801 = vaddq_s16(v799, v800);
+    int16x8_t v802 = vld1q_s16(in + in_stride * 121 + i);
+    int16x8_t v803 = vld1q_s16(in + in_stride * 119 + i);
+    int16x8_t v804 = vaddq_s16(v802, v803);
+    int16x8_t v805 = vaddq_s16(v801, v804);
+    int16x8_t v806 = vaddq_s16(v798, v805);
+    int16x8_t v807 = vld1q_s16(in + in_stride * 73 + i);
+    int16x8_t v808 = vld1q_s16(in + in_stride * 71 + i);
+    int16x8_t v809 = vaddq_s16(v807, v808);
+    int16x8_t v810 = vld1q_s16(in + in_stride * 57 + i);
+    int16x8_t v811 = vld1q_s16(in + in_stride * 55 + i);
+    int16x8_t v812 = vaddq_s16(v810, v811);
+    int16x8_t v813 = vaddq_s16(v809, v812);
+    int16x8_t v814_tmp = vqrdmulhq_n_s16(v813, 13573);
+    int16x8_t v814 = vaddq_s16(v814_tmp, v813);
+    int16x8_t v815 = vld1q_s16(in + in_stride * 201 + i);
+    int16x8_t v816 = vld1q_s16(in + in_stride * 199 + i);
+    int16x8_t v817 = vaddq_s16(v815, v816);
+    int16x8_t v818 = vld1q_s16(in + in_stride * 185 + i);
+    int16x8_t v819 = vld1q_s16(in + in_stride * 183 + i);
+    int16x8_t v820 = vaddq_s16(v818, v819);
+    int16x8_t v821 = vaddq_s16(v817, v820);
+    int16x8_t v822 = vaddq_s16(v821, v813);
+    int16x8_t v823 = vaddq_s16(v814, v822);
+    int16x8_t v824 = vqrdmulhq_n_s16(v823, 17734);
+    int16x8_t v825 = vaddq_s16(v806, v824);
+    int16x8_t v826 = vld1q_s16(in + in_stride * 41 + i);
+    int16x8_t v827 = vld1q_s16(in + in_stride * 39 + i);
+    int16x8_t v828 = vaddq_s16(v826, v827);
+    int16x8_t v829 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v830 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v831 = vaddq_s16(v829, v830);
+    int16x8_t v832 = vaddq_s16(v828, v831);
+    int16x8_t v833_tmp = vqrdmulhq_n_s16(v832, 13573);
+    int16x8_t v833 = vaddq_s16(v833_tmp, v832);
+    int16x8_t v834 = vld1q_s16(in + in_stride * 169 + i);
+    int16x8_t v835 = vld1q_s16(in + in_stride * 167 + i);
+    int16x8_t v836 = vaddq_s16(v834, v835);
+    int16x8_t v837 = vld1q_s16(in + in_stride * 153 + i);
+    int16x8_t v838 = vld1q_s16(in + in_stride * 151 + i);
+    int16x8_t v839 = vaddq_s16(v837, v838);
+    int16x8_t v840 = vaddq_s16(v836, v839);
+    int16x8_t v841 = vld1q_s16(in + in_stride * 105 + i);
+    int16x8_t v842 = vld1q_s16(in + in_stride * 103 + i);
+    int16x8_t v843 = vaddq_s16(v841, v842);
+    int16x8_t v844 = vld1q_s16(in + in_stride * 89 + i);
+    int16x8_t v845 = vld1q_s16(in + in_stride * 87 + i);
+    int16x8_t v846 = vaddq_s16(v844, v845);
+    int16x8_t v847 = vaddq_s16(v843, v846);
+    int16x8_t v848 = vaddq_s16(v840, v847);
+    int16x8_t v849 = vaddq_s16(v833, v848);
+    int16x8_t v850 = vaddq_s16(v847, v832);
+    int16x8_t v851_tmp = vqrdmulhq_n_s16(v850, 13573);
+    int16x8_t v851 = vaddq_s16(v851_tmp, v850);
+    int16x8_t v852 = vld1q_s16(in + in_stride * 233 + i);
+    int16x8_t v853 = vld1q_s16(in + in_stride * 231 + i);
+    int16x8_t v854 = vaddq_s16(v852, v853);
+    int16x8_t v855 = vld1q_s16(in + in_stride * 217 + i);
+    int16x8_t v856 = vld1q_s16(in + in_stride * 215 + i);
+    int16x8_t v857 = vaddq_s16(v855, v856);
+    int16x8_t v858 = vaddq_s16(v854, v857);
+    int16x8_t v859 = vaddq_s16(v858, v840);
+    int16x8_t v860 = vaddq_s16(v859, v850);
+    int16x8_t v861 = vaddq_s16(v851, v860);
+    int16x8_t v862 = vqrdmulhq_n_s16(v861, 17734);
+    int16x8_t v863 = vaddq_s16(v849, v862);
+    int16x8_t v864 = vqrdmulhq_n_s16(v863, 16705);
+    int16x8_t v865 = vaddq_s16(v825, v864);
+    int16x8_t v866 = vaddq_s16(v831, v797);
+    int16x8_t v867_tmp = vqrdmulhq_n_s16(v866, 13573);
+    int16x8_t v867 = vaddq_s16(v867_tmp, v866);
+    int16x8_t v868 = vaddq_s16(v839, v801);
+    int16x8_t v869 = vaddq_s16(v804, v843);
+    int16x8_t v870 = vaddq_s16(v868, v869);
+    int16x8_t v871 = vaddq_s16(v867, v870);
+    int16x8_t v872 = vaddq_s16(v846, v809);
+    int16x8_t v873 = vaddq_s16(v812, v828);
+    int16x8_t v874 = vaddq_s16(v872, v873);
+    int16x8_t v875_tmp = vqrdmulhq_n_s16(v874, 13573);
+    int16x8_t v875 = vaddq_s16(v875_tmp, v874);
+    int16x8_t v876 = vaddq_s16(v857, v817);
+    int16x8_t v877 = vaddq_s16(v820, v836);
+    int16x8_t v878 = vaddq_s16(v876, v877);
+    int16x8_t v879 = vaddq_s16(v878, v874);
+    int16x8_t v880 = vaddq_s16(v875, v879);
+    int16x8_t v881 = vqrdmulhq_n_s16(v880, 17734);
+    int16x8_t v882 = vaddq_s16(v871, v881);
+    int16x8_t v883 = vaddq_s16(v873, v866);
+    int16x8_t v884_tmp = vqrdmulhq_n_s16(v883, 13573);
+    int16x8_t v884 = vaddq_s16(v884_tmp, v883);
+    int16x8_t v885 = vaddq_s16(v877, v868);
+    int16x8_t v886 = vaddq_s16(v869, v872);
+    int16x8_t v887 = vaddq_s16(v885, v886);
+    int16x8_t v888 = vaddq_s16(v884, v887);
+    int16x8_t v889 = vaddq_s16(v886, v883);
+    int16x8_t v890_tmp = vqrdmulhq_n_s16(v889, 13573);
+    int16x8_t v890 = vaddq_s16(v890_tmp, v889);
+    int16x8_t v891 = vld1q_s16(in + in_stride * 249 + i);
+    int16x8_t v892 = vld1q_s16(in + in_stride * 247 + i);
+    int16x8_t v893 = vaddq_s16(v891, v892);
+    int16x8_t v894 = vaddq_s16(v893, v854);
+    int16x8_t v895 = vaddq_s16(v894, v876);
+    int16x8_t v896 = vaddq_s16(v895, v885);
+    int16x8_t v897 = vaddq_s16(v896, v889);
+    int16x8_t v898 = vaddq_s16(v890, v897);
+    int16x8_t v899 = vqrdmulhq_n_s16(v898, 17734);
+    int16x8_t v900 = vaddq_s16(v888, v899);
+    int16x8_t v901 = vqrdmulhq_n_s16(v900, 16705);
+    int16x8_t v902 = vaddq_s16(v882, v901);
+    int16x8_t v903 = vqrdmulhq_n_s16(v902, 16463);
+    int16x8_t v904 = vaddq_s16(v865, v903);
+    int16x8_t v905 = vqrdmulhq_n_s16(v904, 16404);
+    int16x8_t v906 = vaddq_s16(v794, v905);
+    int16x8_t v907 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v908 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v909 = vaddq_s16(v907, v908);
+    int16x8_t v910_tmp = vqrdmulhq_n_s16(v909, 13573);
+    int16x8_t v910 = vaddq_s16(v910_tmp, v909);
+    int16x8_t v911 = vld1q_s16(in + in_stride * 133 + i);
+    int16x8_t v912 = vld1q_s16(in + in_stride * 131 + i);
+    int16x8_t v913 = vaddq_s16(v911, v912);
+    int16x8_t v914 = vld1q_s16(in + in_stride * 125 + i);
+    int16x8_t v915 = vld1q_s16(in + in_stride * 123 + i);
+    int16x8_t v916 = vaddq_s16(v914, v915);
+    int16x8_t v917 = vaddq_s16(v913, v916);
+    int16x8_t v918 = vaddq_s16(v910, v917);
+    int16x8_t v919 = vld1q_s16(in + in_stride * 69 + i);
+    int16x8_t v920 = vld1q_s16(in + in_stride * 67 + i);
+    int16x8_t v921 = vaddq_s16(v919, v920);
+    int16x8_t v922 = vld1q_s16(in + in_stride * 61 + i);
+    int16x8_t v923 = vld1q_s16(in + in_stride * 59 + i);
+    int16x8_t v924 = vaddq_s16(v922, v923);
+    int16x8_t v925 = vaddq_s16(v921, v924);
+    int16x8_t v926_tmp = vqrdmulhq_n_s16(v925, 13573);
+    int16x8_t v926 = vaddq_s16(v926_tmp, v925);
+    int16x8_t v927 = vld1q_s16(in + in_stride * 197 + i);
+    int16x8_t v928 = vld1q_s16(in + in_stride * 195 + i);
+    int16x8_t v929 = vaddq_s16(v927, v928);
+    int16x8_t v930 = vld1q_s16(in + in_stride * 189 + i);
+    int16x8_t v931 = vld1q_s16(in + in_stride * 187 + i);
+    int16x8_t v932 = vaddq_s16(v930, v931);
+    int16x8_t v933 = vaddq_s16(v929, v932);
+    int16x8_t v934 = vaddq_s16(v933, v925);
+    int16x8_t v935 = vaddq_s16(v926, v934);
+    int16x8_t v936 = vqrdmulhq_n_s16(v935, 17734);
+    int16x8_t v937 = vaddq_s16(v918, v936);
+    int16x8_t v938 = vld1q_s16(in + in_stride * 37 + i);
+    int16x8_t v939 = vld1q_s16(in + in_stride * 35 + i);
+    int16x8_t v940 = vaddq_s16(v938, v939);
+    int16x8_t v941 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v942 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v943 = vaddq_s16(v941, v942);
+    int16x8_t v944 = vaddq_s16(v940, v943);
+    int16x8_t v945_tmp = vqrdmulhq_n_s16(v944, 13573);
+    int16x8_t v945 = vaddq_s16(v945_tmp, v944);
+    int16x8_t v946 = vld1q_s16(in + in_stride * 165 + i);
+    int16x8_t v947 = vld1q_s16(in + in_stride * 163 + i);
+    int16x8_t v948 = vaddq_s16(v946, v947);
+    int16x8_t v949 = vld1q_s16(in + in_stride * 157 + i);
+    int16x8_t v950 = vld1q_s16(in + in_stride * 155 + i);
+    int16x8_t v951 = vaddq_s16(v949, v950);
+    int16x8_t v952 = vaddq_s16(v948, v951);
+    int16x8_t v953 = vld1q_s16(in + in_stride * 101 + i);
+    int16x8_t v954 = vld1q_s16(in + in_stride * 99 + i);
+    int16x8_t v955 = vaddq_s16(v953, v954);
+    int16x8_t v956 = vld1q_s16(in + in_stride * 93 + i);
+    int16x8_t v957 = vld1q_s16(in + in_stride * 91 + i);
+    int16x8_t v958 = vaddq_s16(v956, v957);
+    int16x8_t v959 = vaddq_s16(v955, v958);
+    int16x8_t v960 = vaddq_s16(v952, v959);
+    int16x8_t v961 = vaddq_s16(v945, v960);
+    int16x8_t v962 = vaddq_s16(v959, v944);
+    int16x8_t v963_tmp = vqrdmulhq_n_s16(v962, 13573);
+    int16x8_t v963 = vaddq_s16(v963_tmp, v962);
+    int16x8_t v964 = vld1q_s16(in + in_stride * 229 + i);
+    int16x8_t v965 = vld1q_s16(in + in_stride * 227 + i);
+    int16x8_t v966 = vaddq_s16(v964, v965);
+    int16x8_t v967 = vld1q_s16(in + in_stride * 221 + i);
+    int16x8_t v968 = vld1q_s16(in + in_stride * 219 + i);
+    int16x8_t v969 = vaddq_s16(v967, v968);
+    int16x8_t v970 = vaddq_s16(v966, v969);
+    int16x8_t v971 = vaddq_s16(v970, v952);
+    int16x8_t v972 = vaddq_s16(v971, v962);
+    int16x8_t v973 = vaddq_s16(v963, v972);
+    int16x8_t v974 = vqrdmulhq_n_s16(v973, 17734);
+    int16x8_t v975 = vaddq_s16(v961, v974);
+    int16x8_t v976 = vqrdmulhq_n_s16(v975, 16705);
+    int16x8_t v977 = vaddq_s16(v937, v976);
+    int16x8_t v978 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v979 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v980 = vaddq_s16(v978, v979);
+    int16x8_t v981 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v982 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v983 = vaddq_s16(v981, v982);
+    int16x8_t v984 = vaddq_s16(v980, v983);
+    int16x8_t v985_tmp = vqrdmulhq_n_s16(v984, 13573);
+    int16x8_t v985 = vaddq_s16(v985_tmp, v984);
+    int16x8_t v986 = vld1q_s16(in + in_stride * 149 + i);
+    int16x8_t v987 = vld1q_s16(in + in_stride * 147 + i);
+    int16x8_t v988 = vaddq_s16(v986, v987);
+    int16x8_t v989 = vld1q_s16(in + in_stride * 141 + i);
+    int16x8_t v990 = vld1q_s16(in + in_stride * 139 + i);
+    int16x8_t v991 = vaddq_s16(v989, v990);
+    int16x8_t v992 = vaddq_s16(v988, v991);
+    int16x8_t v993 = vld1q_s16(in + in_stride * 117 + i);
+    int16x8_t v994 = vld1q_s16(in + in_stride * 115 + i);
+    int16x8_t v995 = vaddq_s16(v993, v994);
+    int16x8_t v996 = vld1q_s16(in + in_stride * 109 + i);
+    int16x8_t v997 = vld1q_s16(in + in_stride * 107 + i);
+    int16x8_t v998 = vaddq_s16(v996, v997);
+    int16x8_t v999 = vaddq_s16(v995, v998);
+    int16x8_t v1000 = vaddq_s16(v992, v999);
+    int16x8_t v1001 = vaddq_s16(v985, v1000);
+    int16x8_t v1002 = vld1q_s16(in + in_stride * 85 + i);
+    int16x8_t v1003 = vld1q_s16(in + in_stride * 83 + i);
+    int16x8_t v1004 = vaddq_s16(v1002, v1003);
+    int16x8_t v1005 = vld1q_s16(in + in_stride * 77 + i);
+    int16x8_t v1006 = vld1q_s16(in + in_stride * 75 + i);
+    int16x8_t v1007 = vaddq_s16(v1005, v1006);
+    int16x8_t v1008 = vaddq_s16(v1004, v1007);
+    int16x8_t v1009 = vld1q_s16(in + in_stride * 53 + i);
+    int16x8_t v1010 = vld1q_s16(in + in_stride * 51 + i);
+    int16x8_t v1011 = vaddq_s16(v1009, v1010);
+    int16x8_t v1012 = vld1q_s16(in + in_stride * 45 + i);
+    int16x8_t v1013 = vld1q_s16(in + in_stride * 43 + i);
+    int16x8_t v1014 = vaddq_s16(v1012, v1013);
+    int16x8_t v1015 = vaddq_s16(v1011, v1014);
+    int16x8_t v1016 = vaddq_s16(v1008, v1015);
+    int16x8_t v1017_tmp = vqrdmulhq_n_s16(v1016, 13573);
+    int16x8_t v1017 = vaddq_s16(v1017_tmp, v1016);
+    int16x8_t v1018 = vld1q_s16(in + in_stride * 213 + i);
+    int16x8_t v1019 = vld1q_s16(in + in_stride * 211 + i);
+    int16x8_t v1020 = vaddq_s16(v1018, v1019);
+    int16x8_t v1021 = vld1q_s16(in + in_stride * 205 + i);
+    int16x8_t v1022 = vld1q_s16(in + in_stride * 203 + i);
+    int16x8_t v1023 = vaddq_s16(v1021, v1022);
+    int16x8_t v1024 = vaddq_s16(v1020, v1023);
+    int16x8_t v1025 = vld1q_s16(in + in_stride * 181 + i);
+    int16x8_t v1026 = vld1q_s16(in + in_stride * 179 + i);
+    int16x8_t v1027 = vaddq_s16(v1025, v1026);
+    int16x8_t v1028 = vld1q_s16(in + in_stride * 173 + i);
+    int16x8_t v1029 = vld1q_s16(in + in_stride * 171 + i);
+    int16x8_t v1030 = vaddq_s16(v1028, v1029);
+    int16x8_t v1031 = vaddq_s16(v1027, v1030);
+    int16x8_t v1032 = vaddq_s16(v1024, v1031);
+    int16x8_t v1033 = vaddq_s16(v1032, v1016);
+    int16x8_t v1034 = vaddq_s16(v1017, v1033);
+    int16x8_t v1035 = vqrdmulhq_n_s16(v1034, 17734);
+    int16x8_t v1036 = vaddq_s16(v1001, v1035);
+    int16x8_t v1037 = vaddq_s16(v1015, v984);
+    int16x8_t v1038_tmp = vqrdmulhq_n_s16(v1037, 13573);
+    int16x8_t v1038 = vaddq_s16(v1038_tmp, v1037);
+    int16x8_t v1039 = vaddq_s16(v1031, v992);
+    int16x8_t v1040 = vaddq_s16(v999, v1008);
+    int16x8_t v1041 = vaddq_s16(v1039, v1040);
+    int16x8_t v1042 = vaddq_s16(v1038, v1041);
+    int16x8_t v1043 = vaddq_s16(v1040, v1037);
+    int16x8_t v1044_tmp = vqrdmulhq_n_s16(v1043, 13573);
+    int16x8_t v1044 = vaddq_s16(v1044_tmp, v1043);
+    int16x8_t v1045 = vld1q_s16(in + in_stride * 245 + i);
+    int16x8_t v1046 = vld1q_s16(in + in_stride * 243 + i);
+    int16x8_t v1047 = vaddq_s16(v1045, v1046);
+    int16x8_t v1048 = vld1q_s16(in + in_stride * 237 + i);
+    int16x8_t v1049 = vld1q_s16(in + in_stride * 235 + i);
+    int16x8_t v1050 = vaddq_s16(v1048, v1049);
+    int16x8_t v1051 = vaddq_s16(v1047, v1050);
+    int16x8_t v1052 = vaddq_s16(v1051, v1024);
+    int16x8_t v1053 = vaddq_s16(v1052, v1039);
+    int16x8_t v1054 = vaddq_s16(v1053, v1043);
+    int16x8_t v1055 = vaddq_s16(v1044, v1054);
+    int16x8_t v1056 = vqrdmulhq_n_s16(v1055, 17734);
+    int16x8_t v1057 = vaddq_s16(v1042, v1056);
+    int16x8_t v1058 = vqrdmulhq_n_s16(v1057, 16705);
+    int16x8_t v1059 = vaddq_s16(v1036, v1058);
+    int16x8_t v1060 = vqrdmulhq_n_s16(v1059, 16463);
+    int16x8_t v1061 = vaddq_s16(v977, v1060);
+    int16x8_t v1062 = vaddq_s16(v983, v909);
+    int16x8_t v1063_tmp = vqrdmulhq_n_s16(v1062, 13573);
+    int16x8_t v1063 = vaddq_s16(v1063_tmp, v1062);
+    int16x8_t v1064 = vaddq_s16(v991, v913);
+    int16x8_t v1065 = vaddq_s16(v916, v995);
+    int16x8_t v1066 = vaddq_s16(v1064, v1065);
+    int16x8_t v1067 = vaddq_s16(v1063, v1066);
+    int16x8_t v1068 = vaddq_s16(v1007, v921);
+    int16x8_t v1069 = vaddq_s16(v924, v1011);
+    int16x8_t v1070 = vaddq_s16(v1068, v1069);
+    int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 13573);
+    int16x8_t v1071 = vaddq_s16(v1071_tmp, v1070);
+    int16x8_t v1072 = vaddq_s16(v1023, v929);
+    int16x8_t v1073 = vaddq_s16(v932, v1027);
+    int16x8_t v1074 = vaddq_s16(v1072, v1073);
+    int16x8_t v1075 = vaddq_s16(v1074, v1070);
+    int16x8_t v1076 = vaddq_s16(v1071, v1075);
+    int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 17734);
+    int16x8_t v1078 = vaddq_s16(v1067, v1077);
+    int16x8_t v1079 = vaddq_s16(v1014, v940);
+    int16x8_t v1080 = vaddq_s16(v943, v980);
+    int16x8_t v1081 = vaddq_s16(v1079, v1080);
+    int16x8_t v1082_tmp = vqrdmulhq_n_s16(v1081, 13573);
+    int16x8_t v1082 = vaddq_s16(v1082_tmp, v1081);
+    int16x8_t v1083 = vaddq_s16(v1030, v948);
+    int16x8_t v1084 = vaddq_s16(v951, v988);
+    int16x8_t v1085 = vaddq_s16(v1083, v1084);
+    int16x8_t v1086 = vaddq_s16(v998, v955);
+    int16x8_t v1087 = vaddq_s16(v958, v1004);
+    int16x8_t v1088 = vaddq_s16(v1086, v1087);
+    int16x8_t v1089 = vaddq_s16(v1085, v1088);
+    int16x8_t v1090 = vaddq_s16(v1082, v1089);
+    int16x8_t v1091 = vaddq_s16(v1088, v1081);
+    int16x8_t v1092_tmp = vqrdmulhq_n_s16(v1091, 13573);
+    int16x8_t v1092 = vaddq_s16(v1092_tmp, v1091);
+    int16x8_t v1093 = vaddq_s16(v1050, v966);
+    int16x8_t v1094 = vaddq_s16(v969, v1020);
+    int16x8_t v1095 = vaddq_s16(v1093, v1094);
+    int16x8_t v1096 = vaddq_s16(v1095, v1085);
+    int16x8_t v1097 = vaddq_s16(v1096, v1091);
+    int16x8_t v1098 = vaddq_s16(v1092, v1097);
+    int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 17734);
+    int16x8_t v1100 = vaddq_s16(v1090, v1099);
+    int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16705);
+    int16x8_t v1102 = vaddq_s16(v1078, v1101);
+    int16x8_t v1103 = vaddq_s16(v1080, v1062);
+    int16x8_t v1104_tmp = vqrdmulhq_n_s16(v1103, 13573);
+    int16x8_t v1104 = vaddq_s16(v1104_tmp, v1103);
+    int16x8_t v1105 = vaddq_s16(v1084, v1064);
+    int16x8_t v1106 = vaddq_s16(v1065, v1086);
+    int16x8_t v1107 = vaddq_s16(v1105, v1106);
+    int16x8_t v1108 = vaddq_s16(v1104, v1107);
+    int16x8_t v1109 = vaddq_s16(v1087, v1068);
+    int16x8_t v1110 = vaddq_s16(v1069, v1079);
+    int16x8_t v1111 = vaddq_s16(v1109, v1110);
+    int16x8_t v1112_tmp = vqrdmulhq_n_s16(v1111, 13573);
+    int16x8_t v1112 = vaddq_s16(v1112_tmp, v1111);
+    int16x8_t v1113 = vaddq_s16(v1094, v1072);
+    int16x8_t v1114 = vaddq_s16(v1073, v1083);
+    int16x8_t v1115 = vaddq_s16(v1113, v1114);
+    int16x8_t v1116 = vaddq_s16(v1115, v1111);
+    int16x8_t v1117 = vaddq_s16(v1112, v1116);
+    int16x8_t v1118 = vqrdmulhq_n_s16(v1117, 17734);
+    int16x8_t v1119 = vaddq_s16(v1108, v1118);
+    int16x8_t v1120 = vaddq_s16(v1110, v1103);
+    int16x8_t v1121_tmp = vqrdmulhq_n_s16(v1120, 13573);
+    int16x8_t v1121 = vaddq_s16(v1121_tmp, v1120);
+    int16x8_t v1122 = vaddq_s16(v1114, v1105);
+    int16x8_t v1123 = vaddq_s16(v1106, v1109);
+    int16x8_t v1124 = vaddq_s16(v1122, v1123);
+    int16x8_t v1125 = vaddq_s16(v1121, v1124);
+    int16x8_t v1126 = vaddq_s16(v1123, v1120);
+    int16x8_t v1127_tmp = vqrdmulhq_n_s16(v1126, 13573);
+    int16x8_t v1127 = vaddq_s16(v1127_tmp, v1126);
+    int16x8_t v1128 = vld1q_s16(in + in_stride * 253 + i);
+    int16x8_t v1129 = vld1q_s16(in + in_stride * 251 + i);
+    int16x8_t v1130 = vaddq_s16(v1128, v1129);
+    int16x8_t v1131 = vaddq_s16(v1130, v1047);
+    int16x8_t v1132 = vaddq_s16(v1131, v1093);
+    int16x8_t v1133 = vaddq_s16(v1132, v1113);
+    int16x8_t v1134 = vaddq_s16(v1133, v1122);
+    int16x8_t v1135 = vaddq_s16(v1134, v1126);
+    int16x8_t v1136 = vaddq_s16(v1127, v1135);
+    int16x8_t v1137 = vqrdmulhq_n_s16(v1136, 17734);
+    int16x8_t v1138 = vaddq_s16(v1125, v1137);
+    int16x8_t v1139 = vqrdmulhq_n_s16(v1138, 16705);
+    int16x8_t v1140 = vaddq_s16(v1119, v1139);
+    int16x8_t v1141 = vqrdmulhq_n_s16(v1140, 16463);
+    int16x8_t v1142 = vaddq_s16(v1102, v1141);
+    int16x8_t v1143 = vqrdmulhq_n_s16(v1142, 16404);
+    int16x8_t v1144 = vaddq_s16(v1061, v1143);
+    int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 16389);
+    int16x8_t v1146 = vaddq_s16(v906, v1145);
+    int16x8_t v1147 = vaddq_s16(v908, v702);
+    int16x8_t v1148_tmp = vqrdmulhq_n_s16(v1147, 13573);
+    int16x8_t v1148 = vaddq_s16(v1148_tmp, v1147);
+    int16x8_t v1149 = vaddq_s16(v912, v704);
+    int16x8_t v1150 = vaddq_s16(v705, v914);
+    int16x8_t v1151 = vaddq_s16(v1149, v1150);
+    int16x8_t v1152 = vaddq_s16(v1148, v1151);
+    int16x8_t v1153 = vaddq_s16(v920, v708);
+    int16x8_t v1154 = vaddq_s16(v709, v922);
+    int16x8_t v1155 = vaddq_s16(v1153, v1154);
+    int16x8_t v1156_tmp = vqrdmulhq_n_s16(v1155, 13573);
+    int16x8_t v1156 = vaddq_s16(v1156_tmp, v1155);
+    int16x8_t v1157 = vaddq_s16(v928, v712);
+    int16x8_t v1158 = vaddq_s16(v713, v930);
+    int16x8_t v1159 = vaddq_s16(v1157, v1158);
+    int16x8_t v1160 = vaddq_s16(v1159, v1155);
+    int16x8_t v1161 = vaddq_s16(v1156, v1160);
+    int16x8_t v1162 = vqrdmulhq_n_s16(v1161, 17734);
+    int16x8_t v1163 = vaddq_s16(v1152, v1162);
+    int16x8_t v1164 = vaddq_s16(v939, v719);
+    int16x8_t v1165 = vaddq_s16(v720, v941);
+    int16x8_t v1166 = vaddq_s16(v1164, v1165);
+    int16x8_t v1167_tmp = vqrdmulhq_n_s16(v1166, 13573);
+    int16x8_t v1167 = vaddq_s16(v1167_tmp, v1166);
+    int16x8_t v1168 = vaddq_s16(v947, v723);
+    int16x8_t v1169 = vaddq_s16(v724, v949);
+    int16x8_t v1170 = vaddq_s16(v1168, v1169);
+    int16x8_t v1171 = vaddq_s16(v954, v726);
+    int16x8_t v1172 = vaddq_s16(v727, v956);
+    int16x8_t v1173 = vaddq_s16(v1171, v1172);
+    int16x8_t v1174 = vaddq_s16(v1170, v1173);
+    int16x8_t v1175 = vaddq_s16(v1167, v1174);
+    int16x8_t v1176 = vaddq_s16(v1173, v1166);
+    int16x8_t v1177_tmp = vqrdmulhq_n_s16(v1176, 13573);
+    int16x8_t v1177 = vaddq_s16(v1177_tmp, v1176);
+    int16x8_t v1178 = vaddq_s16(v965, v733);
+    int16x8_t v1179 = vaddq_s16(v734, v967);
+    int16x8_t v1180 = vaddq_s16(v1178, v1179);
+    int16x8_t v1181 = vaddq_s16(v1180, v1170);
+    int16x8_t v1182 = vaddq_s16(v1181, v1176);
+    int16x8_t v1183 = vaddq_s16(v1177, v1182);
+    int16x8_t v1184 = vqrdmulhq_n_s16(v1183, 17734);
+    int16x8_t v1185 = vaddq_s16(v1175, v1184);
+    int16x8_t v1186 = vqrdmulhq_n_s16(v1185, 16705);
+    int16x8_t v1187 = vaddq_s16(v1163, v1186);
+    int16x8_t v1188 = vaddq_s16(v979, v743);
+    int16x8_t v1189 = vaddq_s16(v744, v981);
+    int16x8_t v1190 = vaddq_s16(v1188, v1189);
+    int16x8_t v1191_tmp = vqrdmulhq_n_s16(v1190, 13573);
+    int16x8_t v1191 = vaddq_s16(v1191_tmp, v1190);
+    int16x8_t v1192 = vaddq_s16(v987, v747);
+    int16x8_t v1193 = vaddq_s16(v748, v989);
+    int16x8_t v1194 = vaddq_s16(v1192, v1193);
+    int16x8_t v1195 = vaddq_s16(v994, v750);
+    int16x8_t v1196 = vaddq_s16(v751, v996);
+    int16x8_t v1197 = vaddq_s16(v1195, v1196);
+    int16x8_t v1198 = vaddq_s16(v1194, v1197);
+    int16x8_t v1199 = vaddq_s16(v1191, v1198);
+    int16x8_t v1200 = vaddq_s16(v1003, v755);
+    int16x8_t v1201 = vaddq_s16(v756, v1005);
+    int16x8_t v1202 = vaddq_s16(v1200, v1201);
+    int16x8_t v1203 = vaddq_s16(v1010, v758);
+    int16x8_t v1204 = vaddq_s16(v759, v1012);
+    int16x8_t v1205 = vaddq_s16(v1203, v1204);
+    int16x8_t v1206 = vaddq_s16(v1202, v1205);
+    int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 13573);
+    int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206);
+    int16x8_t v1208 = vaddq_s16(v1019, v763);
+    int16x8_t v1209 = vaddq_s16(v764, v1021);
+    int16x8_t v1210 = vaddq_s16(v1208, v1209);
+    int16x8_t v1211 = vaddq_s16(v1026, v766);
+    int16x8_t v1212 = vaddq_s16(v767, v1028);
+    int16x8_t v1213 = vaddq_s16(v1211, v1212);
+    int16x8_t v1214 = vaddq_s16(v1210, v1213);
+    int16x8_t v1215 = vaddq_s16(v1214, v1206);
+    int16x8_t v1216 = vaddq_s16(v1207, v1215);
+    int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 17734);
+    int16x8_t v1218 = vaddq_s16(v1199, v1217);
+    int16x8_t v1219 = vaddq_s16(v1205, v1190);
+    int16x8_t v1220_tmp = vqrdmulhq_n_s16(v1219, 13573);
+    int16x8_t v1220 = vaddq_s16(v1220_tmp, v1219);
+    int16x8_t v1221 = vaddq_s16(v1213, v1194);
+    int16x8_t v1222 = vaddq_s16(v1197, v1202);
+    int16x8_t v1223 = vaddq_s16(v1221, v1222);
+    int16x8_t v1224 = vaddq_s16(v1220, v1223);
+    int16x8_t v1225 = vaddq_s16(v1222, v1219);
+    int16x8_t v1226_tmp = vqrdmulhq_n_s16(v1225, 13573);
+    int16x8_t v1226 = vaddq_s16(v1226_tmp, v1225);
+    int16x8_t v1227 = vaddq_s16(v1046, v782);
+    int16x8_t v1228 = vaddq_s16(v783, v1048);
+    int16x8_t v1229 = vaddq_s16(v1227, v1228);
+    int16x8_t v1230 = vaddq_s16(v1229, v1210);
+    int16x8_t v1231 = vaddq_s16(v1230, v1221);
+    int16x8_t v1232 = vaddq_s16(v1231, v1225);
+    int16x8_t v1233 = vaddq_s16(v1226, v1232);
+    int16x8_t v1234 = vqrdmulhq_n_s16(v1233, 17734);
+    int16x8_t v1235 = vaddq_s16(v1224, v1234);
+    int16x8_t v1236 = vqrdmulhq_n_s16(v1235, 16705);
+    int16x8_t v1237 = vaddq_s16(v1218, v1236);
+    int16x8_t v1238 = vqrdmulhq_n_s16(v1237, 16463);
+    int16x8_t v1239 = vaddq_s16(v1187, v1238);
+    int16x8_t v1240 = vaddq_s16(v982, v795);
+    int16x8_t v1241 = vaddq_s16(v796, v907);
+    int16x8_t v1242 = vaddq_s16(v1240, v1241);
+    int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 13573);
+    int16x8_t v1243 = vaddq_s16(v1243_tmp, v1242);
+    int16x8_t v1244 = vaddq_s16(v990, v799);
+    int16x8_t v1245 = vaddq_s16(v800, v911);
+    int16x8_t v1246 = vaddq_s16(v1244, v1245);
+    int16x8_t v1247 = vaddq_s16(v915, v802);
+    int16x8_t v1248 = vaddq_s16(v803, v993);
+    int16x8_t v1249 = vaddq_s16(v1247, v1248);
+    int16x8_t v1250 = vaddq_s16(v1246, v1249);
+    int16x8_t v1251 = vaddq_s16(v1243, v1250);
+    int16x8_t v1252 = vaddq_s16(v1006, v807);
+    int16x8_t v1253 = vaddq_s16(v808, v919);
+    int16x8_t v1254 = vaddq_s16(v1252, v1253);
+    int16x8_t v1255 = vaddq_s16(v923, v810);
+    int16x8_t v1256 = vaddq_s16(v811, v1009);
+    int16x8_t v1257 = vaddq_s16(v1255, v1256);
+    int16x8_t v1258 = vaddq_s16(v1254, v1257);
+    int16x8_t v1259_tmp = vqrdmulhq_n_s16(v1258, 13573);
+    int16x8_t v1259 = vaddq_s16(v1259_tmp, v1258);
+    int16x8_t v1260 = vaddq_s16(v1022, v815);
+    int16x8_t v1261 = vaddq_s16(v816, v927);
+    int16x8_t v1262 = vaddq_s16(v1260, v1261);
+    int16x8_t v1263 = vaddq_s16(v931, v818);
+    int16x8_t v1264 = vaddq_s16(v819, v1025);
+    int16x8_t v1265 = vaddq_s16(v1263, v1264);
+    int16x8_t v1266 = vaddq_s16(v1262, v1265);
+    int16x8_t v1267 = vaddq_s16(v1266, v1258);
+    int16x8_t v1268 = vaddq_s16(v1259, v1267);
+    int16x8_t v1269 = vqrdmulhq_n_s16(v1268, 17734);
+    int16x8_t v1270 = vaddq_s16(v1251, v1269);
+    int16x8_t v1271 = vaddq_s16(v1013, v826);
+    int16x8_t v1272 = vaddq_s16(v827, v938);
+    int16x8_t v1273 = vaddq_s16(v1271, v1272);
+    int16x8_t v1274 = vaddq_s16(v942, v829);
+    int16x8_t v1275 = vaddq_s16(v830, v978);
+    int16x8_t v1276 = vaddq_s16(v1274, v1275);
+    int16x8_t v1277 = vaddq_s16(v1273, v1276);
+    int16x8_t v1278_tmp = vqrdmulhq_n_s16(v1277, 13573);
+    int16x8_t v1278 = vaddq_s16(v1278_tmp, v1277);
+    int16x8_t v1279 = vaddq_s16(v1029, v834);
+    int16x8_t v1280 = vaddq_s16(v835, v946);
+    int16x8_t v1281 = vaddq_s16(v1279, v1280);
+    int16x8_t v1282 = vaddq_s16(v950, v837);
+    int16x8_t v1283 = vaddq_s16(v838, v986);
+    int16x8_t v1284 = vaddq_s16(v1282, v1283);
+    int16x8_t v1285 = vaddq_s16(v1281, v1284);
+    int16x8_t v1286 = vaddq_s16(v997, v841);
+    int16x8_t v1287 = vaddq_s16(v842, v953);
+    int16x8_t v1288 = vaddq_s16(v1286, v1287);
+    int16x8_t v1289 = vaddq_s16(v957, v844);
+    int16x8_t v1290 = vaddq_s16(v845, v1002);
+    int16x8_t v1291 = vaddq_s16(v1289, v1290);
+    int16x8_t v1292 = vaddq_s16(v1288, v1291);
+    int16x8_t v1293 = vaddq_s16(v1285, v1292);
+    int16x8_t v1294 = vaddq_s16(v1278, v1293);
+    int16x8_t v1295 = vaddq_s16(v1292, v1277);
+    int16x8_t v1296_tmp = vqrdmulhq_n_s16(v1295, 13573);
+    int16x8_t v1296 = vaddq_s16(v1296_tmp, v1295);
+    int16x8_t v1297 = vaddq_s16(v1049, v852);
+    int16x8_t v1298 = vaddq_s16(v853, v964);
+    int16x8_t v1299 = vaddq_s16(v1297, v1298);
+    int16x8_t v1300 = vaddq_s16(v968, v855);
+    int16x8_t v1301 = vaddq_s16(v856, v1018);
+    int16x8_t v1302 = vaddq_s16(v1300, v1301);
+    int16x8_t v1303 = vaddq_s16(v1299, v1302);
+    int16x8_t v1304 = vaddq_s16(v1303, v1285);
+    int16x8_t v1305 = vaddq_s16(v1304, v1295);
+    int16x8_t v1306 = vaddq_s16(v1296, v1305);
+    int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 17734);
+    int16x8_t v1308 = vaddq_s16(v1294, v1307);
+    int16x8_t v1309 = vqrdmulhq_n_s16(v1308, 16705);
+    int16x8_t v1310 = vaddq_s16(v1270, v1309);
+    int16x8_t v1311 = vaddq_s16(v1276, v1242);
+    int16x8_t v1312_tmp = vqrdmulhq_n_s16(v1311, 13573);
+    int16x8_t v1312 = vaddq_s16(v1312_tmp, v1311);
+    int16x8_t v1313 = vaddq_s16(v1284, v1246);
+    int16x8_t v1314 = vaddq_s16(v1249, v1288);
+    int16x8_t v1315 = vaddq_s16(v1313, v1314);
+    int16x8_t v1316 = vaddq_s16(v1312, v1315);
+    int16x8_t v1317 = vaddq_s16(v1291, v1254);
+    int16x8_t v1318 = vaddq_s16(v1257, v1273);
+    int16x8_t v1319 = vaddq_s16(v1317, v1318);
+    int16x8_t v1320_tmp = vqrdmulhq_n_s16(v1319, 13573);
+    int16x8_t v1320 = vaddq_s16(v1320_tmp, v1319);
+    int16x8_t v1321 = vaddq_s16(v1302, v1262);
+    int16x8_t v1322 = vaddq_s16(v1265, v1281);
+    int16x8_t v1323 = vaddq_s16(v1321, v1322);
+    int16x8_t v1324 = vaddq_s16(v1323, v1319);
+    int16x8_t v1325 = vaddq_s16(v1320, v1324);
+    int16x8_t v1326 = vqrdmulhq_n_s16(v1325, 17734);
+    int16x8_t v1327 = vaddq_s16(v1316, v1326);
+    int16x8_t v1328 = vaddq_s16(v1318, v1311);
+    int16x8_t v1329_tmp = vqrdmulhq_n_s16(v1328, 13573);
+    int16x8_t v1329 = vaddq_s16(v1329_tmp, v1328);
+    int16x8_t v1330 = vaddq_s16(v1322, v1313);
+    int16x8_t v1331 = vaddq_s16(v1314, v1317);
+    int16x8_t v1332 = vaddq_s16(v1330, v1331);
+    int16x8_t v1333 = vaddq_s16(v1329, v1332);
+    int16x8_t v1334 = vaddq_s16(v1331, v1328);
+    int16x8_t v1335_tmp = vqrdmulhq_n_s16(v1334, 13573);
+    int16x8_t v1335 = vaddq_s16(v1335_tmp, v1334);
+    int16x8_t v1336 = vaddq_s16(v1129, v891);
+    int16x8_t v1337 = vaddq_s16(v892, v1045);
+    int16x8_t v1338 = vaddq_s16(v1336, v1337);
+    int16x8_t v1339 = vaddq_s16(v1338, v1299);
+    int16x8_t v1340 = vaddq_s16(v1339, v1321);
+    int16x8_t v1341 = vaddq_s16(v1340, v1330);
+    int16x8_t v1342 = vaddq_s16(v1341, v1334);
+    int16x8_t v1343 = vaddq_s16(v1335, v1342);
+    int16x8_t v1344 = vqrdmulhq_n_s16(v1343, 17734);
+    int16x8_t v1345 = vaddq_s16(v1333, v1344);
+    int16x8_t v1346 = vqrdmulhq_n_s16(v1345, 16705);
+    int16x8_t v1347 = vaddq_s16(v1327, v1346);
+    int16x8_t v1348 = vqrdmulhq_n_s16(v1347, 16463);
+    int16x8_t v1349 = vaddq_s16(v1310, v1348);
+    int16x8_t v1350 = vqrdmulhq_n_s16(v1349, 16404);
+    int16x8_t v1351 = vaddq_s16(v1239, v1350);
+    int16x8_t v1352 = vaddq_s16(v1241, v1147);
+    int16x8_t v1353_tmp = vqrdmulhq_n_s16(v1352, 13573);
+    int16x8_t v1353 = vaddq_s16(v1353_tmp, v1352);
+    int16x8_t v1354 = vaddq_s16(v1245, v1149);
+    int16x8_t v1355 = vaddq_s16(v1150, v1247);
+    int16x8_t v1356 = vaddq_s16(v1354, v1355);
+    int16x8_t v1357 = vaddq_s16(v1353, v1356);
+    int16x8_t v1358 = vaddq_s16(v1253, v1153);
+    int16x8_t v1359 = vaddq_s16(v1154, v1255);
+    int16x8_t v1360 = vaddq_s16(v1358, v1359);
+    int16x8_t v1361_tmp = vqrdmulhq_n_s16(v1360, 13573);
+    int16x8_t v1361 = vaddq_s16(v1361_tmp, v1360);
+    int16x8_t v1362 = vaddq_s16(v1261, v1157);
+    int16x8_t v1363 = vaddq_s16(v1158, v1263);
+    int16x8_t v1364 = vaddq_s16(v1362, v1363);
+    int16x8_t v1365 = vaddq_s16(v1364, v1360);
+    int16x8_t v1366 = vaddq_s16(v1361, v1365);
+    int16x8_t v1367 = vqrdmulhq_n_s16(v1366, 17734);
+    int16x8_t v1368 = vaddq_s16(v1357, v1367);
+    int16x8_t v1369 = vaddq_s16(v1272, v1164);
+    int16x8_t v1370 = vaddq_s16(v1165, v1274);
+    int16x8_t v1371 = vaddq_s16(v1369, v1370);
+    int16x8_t v1372_tmp = vqrdmulhq_n_s16(v1371, 13573);
+    int16x8_t v1372 = vaddq_s16(v1372_tmp, v1371);
+    int16x8_t v1373 = vaddq_s16(v1280, v1168);
+    int16x8_t v1374 = vaddq_s16(v1169, v1282);
+    int16x8_t v1375 = vaddq_s16(v1373, v1374);
+    int16x8_t v1376 = vaddq_s16(v1287, v1171);
+    int16x8_t v1377 = vaddq_s16(v1172, v1289);
+    int16x8_t v1378 = vaddq_s16(v1376, v1377);
+    int16x8_t v1379 = vaddq_s16(v1375, v1378);
+    int16x8_t v1380 = vaddq_s16(v1372, v1379);
+    int16x8_t v1381 = vaddq_s16(v1378, v1371);
+    int16x8_t v1382_tmp = vqrdmulhq_n_s16(v1381, 13573);
+    int16x8_t v1382 = vaddq_s16(v1382_tmp, v1381);
+    int16x8_t v1383 = vaddq_s16(v1298, v1178);
+    int16x8_t v1384 = vaddq_s16(v1179, v1300);
+    int16x8_t v1385 = vaddq_s16(v1383, v1384);
+    int16x8_t v1386 = vaddq_s16(v1385, v1375);
+    int16x8_t v1387 = vaddq_s16(v1386, v1381);
+    int16x8_t v1388 = vaddq_s16(v1382, v1387);
+    int16x8_t v1389 = vqrdmulhq_n_s16(v1388, 17734);
+    int16x8_t v1390 = vaddq_s16(v1380, v1389);
+    int16x8_t v1391 = vqrdmulhq_n_s16(v1390, 16705);
+    int16x8_t v1392 = vaddq_s16(v1368, v1391);
+    int16x8_t v1393 = vaddq_s16(v1275, v1188);
+    int16x8_t v1394 = vaddq_s16(v1189, v1240);
+    int16x8_t v1395 = vaddq_s16(v1393, v1394);
+    int16x8_t v1396_tmp = vqrdmulhq_n_s16(v1395, 13573);
+    int16x8_t v1396 = vaddq_s16(v1396_tmp, v1395);
+    int16x8_t v1397 = vaddq_s16(v1283, v1192);
+    int16x8_t v1398 = vaddq_s16(v1193, v1244);
+    int16x8_t v1399 = vaddq_s16(v1397, v1398);
+    int16x8_t v1400 = vaddq_s16(v1248, v1195);
+    int16x8_t v1401 = vaddq_s16(v1196, v1286);
+    int16x8_t v1402 = vaddq_s16(v1400, v1401);
+    int16x8_t v1403 = vaddq_s16(v1399, v1402);
+    int16x8_t v1404 = vaddq_s16(v1396, v1403);
+    int16x8_t v1405 = vaddq_s16(v1290, v1200);
+    int16x8_t v1406 = vaddq_s16(v1201, v1252);
+    int16x8_t v1407 = vaddq_s16(v1405, v1406);
+    int16x8_t v1408 = vaddq_s16(v1256, v1203);
+    int16x8_t v1409 = vaddq_s16(v1204, v1271);
+    int16x8_t v1410 = vaddq_s16(v1408, v1409);
+    int16x8_t v1411 = vaddq_s16(v1407, v1410);
+    int16x8_t v1412_tmp = vqrdmulhq_n_s16(v1411, 13573);
+    int16x8_t v1412 = vaddq_s16(v1412_tmp, v1411);
+    int16x8_t v1413 = vaddq_s16(v1301, v1208);
+    int16x8_t v1414 = vaddq_s16(v1209, v1260);
+    int16x8_t v1415 = vaddq_s16(v1413, v1414);
+    int16x8_t v1416 = vaddq_s16(v1264, v1211);
+    int16x8_t v1417 = vaddq_s16(v1212, v1279);
+    int16x8_t v1418 = vaddq_s16(v1416, v1417);
+    int16x8_t v1419 = vaddq_s16(v1415, v1418);
+    int16x8_t v1420 = vaddq_s16(v1419, v1411);
+    int16x8_t v1421 = vaddq_s16(v1412, v1420);
+    int16x8_t v1422 = vqrdmulhq_n_s16(v1421, 17734);
+    int16x8_t v1423 = vaddq_s16(v1404, v1422);
+    int16x8_t v1424 = vaddq_s16(v1410, v1395);
+    int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 13573);
+    int16x8_t v1425 = vaddq_s16(v1425_tmp, v1424);
+    int16x8_t v1426 = vaddq_s16(v1418, v1399);
+    int16x8_t v1427 = vaddq_s16(v1402, v1407);
+    int16x8_t v1428 = vaddq_s16(v1426, v1427);
+    int16x8_t v1429 = vaddq_s16(v1425, v1428);
+    int16x8_t v1430 = vaddq_s16(v1427, v1424);
+    int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 13573);
+    int16x8_t v1431 = vaddq_s16(v1431_tmp, v1430);
+    int16x8_t v1432 = vaddq_s16(v1337, v1227);
+    int16x8_t v1433 = vaddq_s16(v1228, v1297);
+    int16x8_t v1434 = vaddq_s16(v1432, v1433);
+    int16x8_t v1435 = vaddq_s16(v1434, v1415);
+    int16x8_t v1436 = vaddq_s16(v1435, v1426);
+    int16x8_t v1437 = vaddq_s16(v1436, v1430);
+    int16x8_t v1438 = vaddq_s16(v1431, v1437);
+    int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17734);
+    int16x8_t v1440 = vaddq_s16(v1429, v1439);
+    int16x8_t v1441 = vqrdmulhq_n_s16(v1440, 16705);
+    int16x8_t v1442 = vaddq_s16(v1423, v1441);
+    int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 16463);
+    int16x8_t v1444 = vaddq_s16(v1392, v1443);
+    int16x8_t v1445 = vaddq_s16(v1394, v1352);
+    int16x8_t v1446_tmp = vqrdmulhq_n_s16(v1445, 13573);
+    int16x8_t v1446 = vaddq_s16(v1446_tmp, v1445);
+    int16x8_t v1447 = vaddq_s16(v1398, v1354);
+    int16x8_t v1448 = vaddq_s16(v1355, v1400);
+    int16x8_t v1449 = vaddq_s16(v1447, v1448);
+    int16x8_t v1450 = vaddq_s16(v1446, v1449);
+    int16x8_t v1451 = vaddq_s16(v1406, v1358);
+    int16x8_t v1452 = vaddq_s16(v1359, v1408);
+    int16x8_t v1453 = vaddq_s16(v1451, v1452);
+    int16x8_t v1454_tmp = vqrdmulhq_n_s16(v1453, 13573);
+    int16x8_t v1454 = vaddq_s16(v1454_tmp, v1453);
+    int16x8_t v1455 = vaddq_s16(v1414, v1362);
+    int16x8_t v1456 = vaddq_s16(v1363, v1416);
+    int16x8_t v1457 = vaddq_s16(v1455, v1456);
+    int16x8_t v1458 = vaddq_s16(v1457, v1453);
+    int16x8_t v1459 = vaddq_s16(v1454, v1458);
+    int16x8_t v1460 = vqrdmulhq_n_s16(v1459, 17734);
+    int16x8_t v1461 = vaddq_s16(v1450, v1460);
+    int16x8_t v1462 = vaddq_s16(v1409, v1369);
+    int16x8_t v1463 = vaddq_s16(v1370, v1393);
+    int16x8_t v1464 = vaddq_s16(v1462, v1463);
+    int16x8_t v1465_tmp = vqrdmulhq_n_s16(v1464, 13573);
+    int16x8_t v1465 = vaddq_s16(v1465_tmp, v1464);
+    int16x8_t v1466 = vaddq_s16(v1417, v1373);
+    int16x8_t v1467 = vaddq_s16(v1374, v1397);
+    int16x8_t v1468 = vaddq_s16(v1466, v1467);
+    int16x8_t v1469 = vaddq_s16(v1401, v1376);
+    int16x8_t v1470 = vaddq_s16(v1377, v1405);
+    int16x8_t v1471 = vaddq_s16(v1469, v1470);
+    int16x8_t v1472 = vaddq_s16(v1468, v1471);
+    int16x8_t v1473 = vaddq_s16(v1465, v1472);
+    int16x8_t v1474 = vaddq_s16(v1471, v1464);
+    int16x8_t v1475_tmp = vqrdmulhq_n_s16(v1474, 13573);
+    int16x8_t v1475 = vaddq_s16(v1475_tmp, v1474);
+    int16x8_t v1476 = vaddq_s16(v1433, v1383);
+    int16x8_t v1477 = vaddq_s16(v1384, v1413);
+    int16x8_t v1478 = vaddq_s16(v1476, v1477);
+    int16x8_t v1479 = vaddq_s16(v1478, v1468);
+    int16x8_t v1480 = vaddq_s16(v1479, v1474);
+    int16x8_t v1481 = vaddq_s16(v1475, v1480);
+    int16x8_t v1482 = vqrdmulhq_n_s16(v1481, 17734);
+    int16x8_t v1483 = vaddq_s16(v1473, v1482);
+    int16x8_t v1484 = vqrdmulhq_n_s16(v1483, 16705);
+    int16x8_t v1485 = vaddq_s16(v1461, v1484);
+    int16x8_t v1486 = vaddq_s16(v1463, v1445);
+    int16x8_t v1487_tmp = vqrdmulhq_n_s16(v1486, 13573);
+    int16x8_t v1487 = vaddq_s16(v1487_tmp, v1486);
+    int16x8_t v1488 = vaddq_s16(v1467, v1447);
+    int16x8_t v1489 = vaddq_s16(v1448, v1469);
+    int16x8_t v1490 = vaddq_s16(v1488, v1489);
+    int16x8_t v1491 = vaddq_s16(v1487, v1490);
+    int16x8_t v1492 = vaddq_s16(v1470, v1451);
+    int16x8_t v1493 = vaddq_s16(v1452, v1462);
+    int16x8_t v1494 = vaddq_s16(v1492, v1493);
+    int16x8_t v1495_tmp = vqrdmulhq_n_s16(v1494, 13573);
+    int16x8_t v1495 = vaddq_s16(v1495_tmp, v1494);
+    int16x8_t v1496 = vaddq_s16(v1477, v1455);
+    int16x8_t v1497 = vaddq_s16(v1456, v1466);
+    int16x8_t v1498 = vaddq_s16(v1496, v1497);
+    int16x8_t v1499 = vaddq_s16(v1498, v1494);
+    int16x8_t v1500 = vaddq_s16(v1495, v1499);
+    int16x8_t v1501 = vqrdmulhq_n_s16(v1500, 17734);
+    int16x8_t v1502 = vaddq_s16(v1491, v1501);
+    int16x8_t v1503 = vaddq_s16(v1493, v1486);
+    int16x8_t v1504_tmp = vqrdmulhq_n_s16(v1503, 13573);
+    int16x8_t v1504 = vaddq_s16(v1504_tmp, v1503);
+    int16x8_t v1505 = vaddq_s16(v1497, v1488);
+    int16x8_t v1506 = vaddq_s16(v1489, v1492);
+    int16x8_t v1507 = vaddq_s16(v1505, v1506);
+    int16x8_t v1508 = vaddq_s16(v1504, v1507);
+    int16x8_t v1509 = vaddq_s16(v1506, v1503);
+    int16x8_t v1510_tmp = vqrdmulhq_n_s16(v1509, 13573);
+    int16x8_t v1510 = vaddq_s16(v1510_tmp, v1509);
+    int16x8_t v1511 = vld1q_s16(in + in_stride * 255 + i);
+    int16x8_t v1512 = vaddq_s16(v1511, v1128);
+    int16x8_t v1513 = vaddq_s16(v1512, v1336);
+    int16x8_t v1514 = vaddq_s16(v1513, v1432);
+    int16x8_t v1515 = vaddq_s16(v1514, v1476);
+    int16x8_t v1516 = vaddq_s16(v1515, v1496);
+    int16x8_t v1517 = vaddq_s16(v1516, v1505);
+    int16x8_t v1518 = vaddq_s16(v1517, v1509);
+    int16x8_t v1519 = vaddq_s16(v1510, v1518);
+    int16x8_t v1520 = vqrdmulhq_n_s16(v1519, 17734);
+    int16x8_t v1521 = vaddq_s16(v1508, v1520);
+    int16x8_t v1522 = vqrdmulhq_n_s16(v1521, 16705);
+    int16x8_t v1523 = vaddq_s16(v1502, v1522);
+    int16x8_t v1524 = vqrdmulhq_n_s16(v1523, 16463);
+    int16x8_t v1525 = vaddq_s16(v1485, v1524);
+    int16x8_t v1526 = vqrdmulhq_n_s16(v1525, 16404);
+    int16x8_t v1527 = vaddq_s16(v1444, v1526);
+    int16x8_t v1528 = vqrdmulhq_n_s16(v1527, 16389);
+    int16x8_t v1529 = vaddq_s16(v1351, v1528);
+    int16x8_t v1530 = vqrdmulhq_n_s16(v1529, 16385);
+    int16x8_t v1531 = vaddq_s16(v1146, v1530);
+    int16x8_t v1532 = vqrdmulhq_n_s16(v1531, 16384);
+    int16x8_t v1533 = vaddq_s16(v701, v1532);
+    int16x8_t v1534 = vsubq_s16(v0, v1);
+    int16x8_t v1535 = vsubq_s16(v4, v6);
+    int16x8_t v1536_tmp = vqrdmulhq_n_s16(v1535, 10045);
+    int16x8_t v1536 = vaddq_s16(v1536_tmp, v1535);
+    int16x8_t v1537 = vaddq_s16(v1534, v1536);
+    int16x8_t v1538 = vsubq_s16(v11, v14);
+    int16x8_t v1539 = vsubq_s16(v17, v20);
+    int16x8_t v1540_tmp = vqrdmulhq_n_s16(v1539, 10045);
+    int16x8_t v1540 = vaddq_s16(v1540_tmp, v1539);
+    int16x8_t v1541 = vaddq_s16(v1538, v1540);
+    int16x8_t v1542 = vqrdmulhq_n_s16(v1541, 19705);
+    int16x8_t v1543 = vaddq_s16(v1537, v1542);
+    int16x8_t v1544 = vsubq_s16(v27, v30);
+    int16x8_t v1545 = vsubq_s16(v35, v39);
+    int16x8_t v1546_tmp = vqrdmulhq_n_s16(v1545, 10045);
+    int16x8_t v1546 = vaddq_s16(v1546_tmp, v1545);
+    int16x8_t v1547 = vaddq_s16(v1544, v1546);
+    int16x8_t v1548 = vsubq_s16(v44, v47);
+    int16x8_t v1549 = vsubq_s16(v50, v54);
+    int16x8_t v1550_tmp = vqrdmulhq_n_s16(v1549, 10045);
+    int16x8_t v1550 = vaddq_s16(v1550_tmp, v1549);
+    int16x8_t v1551 = vaddq_s16(v1548, v1550);
+    int16x8_t v1552 = vqrdmulhq_n_s16(v1551, 19705);
+    int16x8_t v1553 = vaddq_s16(v1547, v1552);
+    int16x8_t v1554 = vqrdmulhq_n_s16(v1553, 17121);
+    int16x8_t v1555 = vaddq_s16(v1543, v1554);
+    int16x8_t v1556 = vsubq_s16(v63, v66);
+    int16x8_t v1557 = vsubq_s16(v71, v75);
+    int16x8_t v1558_tmp = vqrdmulhq_n_s16(v1557, 10045);
+    int16x8_t v1558 = vaddq_s16(v1558_tmp, v1557);
+    int16x8_t v1559 = vaddq_s16(v1556, v1558);
+    int16x8_t v1560 = vsubq_s16(v82, v89);
+    int16x8_t v1561 = vsubq_s16(v92, v97);
+    int16x8_t v1562_tmp = vqrdmulhq_n_s16(v1561, 10045);
+    int16x8_t v1562 = vaddq_s16(v1562_tmp, v1561);
+    int16x8_t v1563 = vaddq_s16(v1560, v1562);
+    int16x8_t v1564 = vqrdmulhq_n_s16(v1563, 19705);
+    int16x8_t v1565 = vaddq_s16(v1559, v1564);
+    int16x8_t v1566 = vsubq_s16(v104, v107);
+    int16x8_t v1567 = vsubq_s16(v112, v116);
+    int16x8_t v1568_tmp = vqrdmulhq_n_s16(v1567, 10045);
+    int16x8_t v1568 = vaddq_s16(v1568_tmp, v1567);
+    int16x8_t v1569 = vaddq_s16(v1566, v1568);
+    int16x8_t v1570 = vsubq_s16(v121, v124);
+    int16x8_t v1571 = vsubq_s16(v127, v132);
+    int16x8_t v1572_tmp = vqrdmulhq_n_s16(v1571, 10045);
+    int16x8_t v1572 = vaddq_s16(v1572_tmp, v1571);
+    int16x8_t v1573 = vaddq_s16(v1570, v1572);
+    int16x8_t v1574 = vqrdmulhq_n_s16(v1573, 19705);
+    int16x8_t v1575 = vaddq_s16(v1569, v1574);
+    int16x8_t v1576 = vqrdmulhq_n_s16(v1575, 17121);
+    int16x8_t v1577 = vaddq_s16(v1565, v1576);
+    int16x8_t v1578 = vqrdmulhq_n_s16(v1577, 16563);
+    int16x8_t v1579 = vaddq_s16(v1555, v1578);
+    int16x8_t v1580 = vsubq_s16(v143, v146);
+    int16x8_t v1581 = vsubq_s16(v151, v155);
+    int16x8_t v1582_tmp = vqrdmulhq_n_s16(v1581, 10045);
+    int16x8_t v1582 = vaddq_s16(v1582_tmp, v1581);
+    int16x8_t v1583 = vaddq_s16(v1580, v1582);
+    int16x8_t v1584 = vsubq_s16(v162, v169);
+    int16x8_t v1585 = vsubq_s16(v172, v177);
+    int16x8_t v1586_tmp = vqrdmulhq_n_s16(v1585, 10045);
+    int16x8_t v1586 = vaddq_s16(v1586_tmp, v1585);
+    int16x8_t v1587 = vaddq_s16(v1584, v1586);
+    int16x8_t v1588 = vqrdmulhq_n_s16(v1587, 19705);
+    int16x8_t v1589 = vaddq_s16(v1583, v1588);
+    int16x8_t v1590 = vsubq_s16(v186, v193);
+    int16x8_t v1591 = vsubq_s16(v202, v210);
+    int16x8_t v1592_tmp = vqrdmulhq_n_s16(v1591, 10045);
+    int16x8_t v1592 = vaddq_s16(v1592_tmp, v1591);
+    int16x8_t v1593 = vaddq_s16(v1590, v1592);
+    int16x8_t v1594 = vsubq_s16(v215, v218);
+    int16x8_t v1595 = vsubq_s16(v221, v227);
+    int16x8_t v1596_tmp = vqrdmulhq_n_s16(v1595, 10045);
+    int16x8_t v1596 = vaddq_s16(v1596_tmp, v1595);
+    int16x8_t v1597 = vaddq_s16(v1594, v1596);
+    int16x8_t v1598 = vqrdmulhq_n_s16(v1597, 19705);
+    int16x8_t v1599 = vaddq_s16(v1593, v1598);
+    int16x8_t v1600 = vqrdmulhq_n_s16(v1599, 17121);
+    int16x8_t v1601 = vaddq_s16(v1589, v1600);
+    int16x8_t v1602 = vsubq_s16(v236, v239);
+    int16x8_t v1603 = vsubq_s16(v244, v248);
+    int16x8_t v1604_tmp = vqrdmulhq_n_s16(v1603, 10045);
+    int16x8_t v1604 = vaddq_s16(v1604_tmp, v1603);
+    int16x8_t v1605 = vaddq_s16(v1602, v1604);
+    int16x8_t v1606 = vsubq_s16(v255, v262);
+    int16x8_t v1607 = vsubq_s16(v265, v270);
+    int16x8_t v1608_tmp = vqrdmulhq_n_s16(v1607, 10045);
+    int16x8_t v1608 = vaddq_s16(v1608_tmp, v1607);
+    int16x8_t v1609 = vaddq_s16(v1606, v1608);
+    int16x8_t v1610 = vqrdmulhq_n_s16(v1609, 19705);
+    int16x8_t v1611 = vaddq_s16(v1605, v1610);
+    int16x8_t v1612 = vsubq_s16(v277, v280);
+    int16x8_t v1613 = vsubq_s16(v285, v289);
+    int16x8_t v1614_tmp = vqrdmulhq_n_s16(v1613, 10045);
+    int16x8_t v1614 = vaddq_s16(v1614_tmp, v1613);
+    int16x8_t v1615 = vaddq_s16(v1612, v1614);
+    int16x8_t v1616 = vsubq_s16(v294, v297);
+    int16x8_t v1617 = vsubq_s16(v300, v306);
+    int16x8_t v1618_tmp = vqrdmulhq_n_s16(v1617, 10045);
+    int16x8_t v1618 = vaddq_s16(v1618_tmp, v1617);
+    int16x8_t v1619 = vaddq_s16(v1616, v1618);
+    int16x8_t v1620 = vqrdmulhq_n_s16(v1619, 19705);
+    int16x8_t v1621 = vaddq_s16(v1615, v1620);
+    int16x8_t v1622 = vqrdmulhq_n_s16(v1621, 17121);
+    int16x8_t v1623 = vaddq_s16(v1611, v1622);
+    int16x8_t v1624 = vqrdmulhq_n_s16(v1623, 16563);
+    int16x8_t v1625 = vaddq_s16(v1601, v1624);
+    int16x8_t v1626 = vqrdmulhq_n_s16(v1625, 16429);
+    int16x8_t v1627 = vaddq_s16(v1579, v1626);
+    int16x8_t v1628 = vsubq_s16(v319, v322);
+    int16x8_t v1629 = vsubq_s16(v327, v331);
+    int16x8_t v1630_tmp = vqrdmulhq_n_s16(v1629, 10045);
+    int16x8_t v1630 = vaddq_s16(v1630_tmp, v1629);
+    int16x8_t v1631 = vaddq_s16(v1628, v1630);
+    int16x8_t v1632 = vsubq_s16(v338, v345);
+    int16x8_t v1633 = vsubq_s16(v348, v353);
+    int16x8_t v1634_tmp = vqrdmulhq_n_s16(v1633, 10045);
+    int16x8_t v1634 = vaddq_s16(v1634_tmp, v1633);
+    int16x8_t v1635 = vaddq_s16(v1632, v1634);
+    int16x8_t v1636 = vqrdmulhq_n_s16(v1635, 19705);
+    int16x8_t v1637 = vaddq_s16(v1631, v1636);
+    int16x8_t v1638 = vsubq_s16(v362, v369);
+    int16x8_t v1639 = vsubq_s16(v378, v386);
+    int16x8_t v1640_tmp = vqrdmulhq_n_s16(v1639, 10045);
+    int16x8_t v1640 = vaddq_s16(v1640_tmp, v1639);
+    int16x8_t v1641 = vaddq_s16(v1638, v1640);
+    int16x8_t v1642 = vsubq_s16(v391, v394);
+    int16x8_t v1643 = vsubq_s16(v397, v403);
+    int16x8_t v1644_tmp = vqrdmulhq_n_s16(v1643, 10045);
+    int16x8_t v1644 = vaddq_s16(v1644_tmp, v1643);
+    int16x8_t v1645 = vaddq_s16(v1642, v1644);
+    int16x8_t v1646 = vqrdmulhq_n_s16(v1645, 19705);
+    int16x8_t v1647 = vaddq_s16(v1641, v1646);
+    int16x8_t v1648 = vqrdmulhq_n_s16(v1647, 17121);
+    int16x8_t v1649 = vaddq_s16(v1637, v1648);
+    int16x8_t v1650 = vsubq_s16(v414, v421);
+    int16x8_t v1651 = vsubq_s16(v430, v438);
+    int16x8_t v1652_tmp = vqrdmulhq_n_s16(v1651, 10045);
+    int16x8_t v1652 = vaddq_s16(v1652_tmp, v1651);
+    int16x8_t v1653 = vaddq_s16(v1650, v1652);
+    int16x8_t v1654 = vsubq_s16(v449, v464);
+    int16x8_t v1655 = vsubq_s16(v467, v476);
+    int16x8_t v1656_tmp = vqrdmulhq_n_s16(v1655, 10045);
+    int16x8_t v1656 = vaddq_s16(v1656_tmp, v1655);
+    int16x8_t v1657 = vaddq_s16(v1654, v1656);
+    int16x8_t v1658 = vqrdmulhq_n_s16(v1657, 19705);
+    int16x8_t v1659 = vaddq_s16(v1653, v1658);
+    int16x8_t v1660 = vsubq_s16(v483, v486);
+    int16x8_t v1661 = vsubq_s16(v491, v495);
+    int16x8_t v1662_tmp = vqrdmulhq_n_s16(v1661, 10045);
+    int16x8_t v1662 = vaddq_s16(v1662_tmp, v1661);
+    int16x8_t v1663 = vaddq_s16(v1660, v1662);
+    int16x8_t v1664 = vsubq_s16(v500, v503);
+    int16x8_t v1665 = vsubq_s16(v506, v513);
+    int16x8_t v1666_tmp = vqrdmulhq_n_s16(v1665, 10045);
+    int16x8_t v1666 = vaddq_s16(v1666_tmp, v1665);
+    int16x8_t v1667 = vaddq_s16(v1664, v1666);
+    int16x8_t v1668 = vqrdmulhq_n_s16(v1667, 19705);
+    int16x8_t v1669 = vaddq_s16(v1663, v1668);
+    int16x8_t v1670 = vqrdmulhq_n_s16(v1669, 17121);
+    int16x8_t v1671 = vaddq_s16(v1659, v1670);
+    int16x8_t v1672 = vqrdmulhq_n_s16(v1671, 16563);
+    int16x8_t v1673 = vaddq_s16(v1649, v1672);
+    int16x8_t v1674 = vsubq_s16(v524, v527);
+    int16x8_t v1675 = vsubq_s16(v532, v536);
+    int16x8_t v1676_tmp = vqrdmulhq_n_s16(v1675, 10045);
+    int16x8_t v1676 = vaddq_s16(v1676_tmp, v1675);
+    int16x8_t v1677 = vaddq_s16(v1674, v1676);
+    int16x8_t v1678 = vsubq_s16(v543, v550);
+    int16x8_t v1679 = vsubq_s16(v553, v558);
+    int16x8_t v1680_tmp = vqrdmulhq_n_s16(v1679, 10045);
+    int16x8_t v1680 = vaddq_s16(v1680_tmp, v1679);
+    int16x8_t v1681 = vaddq_s16(v1678, v1680);
+    int16x8_t v1682 = vqrdmulhq_n_s16(v1681, 19705);
+    int16x8_t v1683 = vaddq_s16(v1677, v1682);
+    int16x8_t v1684 = vsubq_s16(v567, v574);
+    int16x8_t v1685 = vsubq_s16(v583, v591);
+    int16x8_t v1686_tmp = vqrdmulhq_n_s16(v1685, 10045);
+    int16x8_t v1686 = vaddq_s16(v1686_tmp, v1685);
+    int16x8_t v1687 = vaddq_s16(v1684, v1686);
+    int16x8_t v1688 = vsubq_s16(v596, v599);
+    int16x8_t v1689 = vsubq_s16(v602, v608);
+    int16x8_t v1690_tmp = vqrdmulhq_n_s16(v1689, 10045);
+    int16x8_t v1690 = vaddq_s16(v1690_tmp, v1689);
+    int16x8_t v1691 = vaddq_s16(v1688, v1690);
+    int16x8_t v1692 = vqrdmulhq_n_s16(v1691, 19705);
+    int16x8_t v1693 = vaddq_s16(v1687, v1692);
+    int16x8_t v1694 = vqrdmulhq_n_s16(v1693, 17121);
+    int16x8_t v1695 = vaddq_s16(v1683, v1694);
+    int16x8_t v1696 = vsubq_s16(v617, v620);
+    int16x8_t v1697 = vsubq_s16(v625, v629);
+    int16x8_t v1698_tmp = vqrdmulhq_n_s16(v1697, 10045);
+    int16x8_t v1698 = vaddq_s16(v1698_tmp, v1697);
+    int16x8_t v1699 = vaddq_s16(v1696, v1698);
+    int16x8_t v1700 = vsubq_s16(v636, v643);
+    int16x8_t v1701 = vsubq_s16(v646, v651);
+    int16x8_t v1702_tmp = vqrdmulhq_n_s16(v1701, 10045);
+    int16x8_t v1702 = vaddq_s16(v1702_tmp, v1701);
+    int16x8_t v1703 = vaddq_s16(v1700, v1702);
+    int16x8_t v1704 = vqrdmulhq_n_s16(v1703, 19705);
+    int16x8_t v1705 = vaddq_s16(v1699, v1704);
+    int16x8_t v1706 = vsubq_s16(v658, v661);
+    int16x8_t v1707 = vsubq_s16(v666, v670);
+    int16x8_t v1708_tmp = vqrdmulhq_n_s16(v1707, 10045);
+    int16x8_t v1708 = vaddq_s16(v1708_tmp, v1707);
+    int16x8_t v1709 = vaddq_s16(v1706, v1708);
+    int16x8_t v1710 = vsubq_s16(v675, v678);
+    int16x8_t v1711 = vsubq_s16(v681, v688);
+    int16x8_t v1712_tmp = vqrdmulhq_n_s16(v1711, 10045);
+    int16x8_t v1712 = vaddq_s16(v1712_tmp, v1711);
+    int16x8_t v1713 = vaddq_s16(v1710, v1712);
+    int16x8_t v1714 = vqrdmulhq_n_s16(v1713, 19705);
+    int16x8_t v1715 = vaddq_s16(v1709, v1714);
+    int16x8_t v1716 = vqrdmulhq_n_s16(v1715, 17121);
+    int16x8_t v1717 = vaddq_s16(v1705, v1716);
+    int16x8_t v1718 = vqrdmulhq_n_s16(v1717, 16563);
+    int16x8_t v1719 = vaddq_s16(v1695, v1718);
+    int16x8_t v1720 = vqrdmulhq_n_s16(v1719, 16429);
+    int16x8_t v1721 = vaddq_s16(v1673, v1720);
+    int16x8_t v1722 = vqrdmulhq_n_s16(v1721, 16395);
+    int16x8_t v1723 = vaddq_s16(v1627, v1722);
+    int16x8_t v1724 = vsubq_s16(v703, v706);
+    int16x8_t v1725 = vsubq_s16(v711, v715);
+    int16x8_t v1726_tmp = vqrdmulhq_n_s16(v1725, 10045);
+    int16x8_t v1726 = vaddq_s16(v1726_tmp, v1725);
+    int16x8_t v1727 = vaddq_s16(v1724, v1726);
+    int16x8_t v1728 = vsubq_s16(v722, v729);
+    int16x8_t v1729 = vsubq_s16(v732, v737);
+    int16x8_t v1730_tmp = vqrdmulhq_n_s16(v1729, 10045);
+    int16x8_t v1730 = vaddq_s16(v1730_tmp, v1729);
+    int16x8_t v1731 = vaddq_s16(v1728, v1730);
+    int16x8_t v1732 = vqrdmulhq_n_s16(v1731, 19705);
+    int16x8_t v1733 = vaddq_s16(v1727, v1732);
+    int16x8_t v1734 = vsubq_s16(v746, v753);
+    int16x8_t v1735 = vsubq_s16(v762, v770);
+    int16x8_t v1736_tmp = vqrdmulhq_n_s16(v1735, 10045);
+    int16x8_t v1736 = vaddq_s16(v1736_tmp, v1735);
+    int16x8_t v1737 = vaddq_s16(v1734, v1736);
+    int16x8_t v1738 = vsubq_s16(v775, v778);
+    int16x8_t v1739 = vsubq_s16(v781, v787);
+    int16x8_t v1740_tmp = vqrdmulhq_n_s16(v1739, 10045);
+    int16x8_t v1740 = vaddq_s16(v1740_tmp, v1739);
+    int16x8_t v1741 = vaddq_s16(v1738, v1740);
+    int16x8_t v1742 = vqrdmulhq_n_s16(v1741, 19705);
+    int16x8_t v1743 = vaddq_s16(v1737, v1742);
+    int16x8_t v1744 = vqrdmulhq_n_s16(v1743, 17121);
+    int16x8_t v1745 = vaddq_s16(v1733, v1744);
+    int16x8_t v1746 = vsubq_s16(v798, v805);
+    int16x8_t v1747 = vsubq_s16(v814, v822);
+    int16x8_t v1748_tmp = vqrdmulhq_n_s16(v1747, 10045);
+    int16x8_t v1748 = vaddq_s16(v1748_tmp, v1747);
+    int16x8_t v1749 = vaddq_s16(v1746, v1748);
+    int16x8_t v1750 = vsubq_s16(v833, v848);
+    int16x8_t v1751 = vsubq_s16(v851, v860);
+    int16x8_t v1752_tmp = vqrdmulhq_n_s16(v1751, 10045);
+    int16x8_t v1752 = vaddq_s16(v1752_tmp, v1751);
+    int16x8_t v1753 = vaddq_s16(v1750, v1752);
+    int16x8_t v1754 = vqrdmulhq_n_s16(v1753, 19705);
+    int16x8_t v1755 = vaddq_s16(v1749, v1754);
+    int16x8_t v1756 = vsubq_s16(v867, v870);
+    int16x8_t v1757 = vsubq_s16(v875, v879);
+    int16x8_t v1758_tmp = vqrdmulhq_n_s16(v1757, 10045);
+    int16x8_t v1758 = vaddq_s16(v1758_tmp, v1757);
+    int16x8_t v1759 = vaddq_s16(v1756, v1758);
+    int16x8_t v1760 = vsubq_s16(v884, v887);
+    int16x8_t v1761 = vsubq_s16(v890, v897);
+    int16x8_t v1762_tmp = vqrdmulhq_n_s16(v1761, 10045);
+    int16x8_t v1762 = vaddq_s16(v1762_tmp, v1761);
+    int16x8_t v1763 = vaddq_s16(v1760, v1762);
+    int16x8_t v1764 = vqrdmulhq_n_s16(v1763, 19705);
+    int16x8_t v1765 = vaddq_s16(v1759, v1764);
+    int16x8_t v1766 = vqrdmulhq_n_s16(v1765, 17121);
+    int16x8_t v1767 = vaddq_s16(v1755, v1766);
+    int16x8_t v1768 = vqrdmulhq_n_s16(v1767, 16563);
+    int16x8_t v1769 = vaddq_s16(v1745, v1768);
+    int16x8_t v1770 = vsubq_s16(v910, v917);
+    int16x8_t v1771 = vsubq_s16(v926, v934);
+    int16x8_t v1772_tmp = vqrdmulhq_n_s16(v1771, 10045);
+    int16x8_t v1772 = vaddq_s16(v1772_tmp, v1771);
+    int16x8_t v1773 = vaddq_s16(v1770, v1772);
+    int16x8_t v1774 = vsubq_s16(v945, v960);
+    int16x8_t v1775 = vsubq_s16(v963, v972);
+    int16x8_t v1776_tmp = vqrdmulhq_n_s16(v1775, 10045);
+    int16x8_t v1776 = vaddq_s16(v1776_tmp, v1775);
+    int16x8_t v1777 = vaddq_s16(v1774, v1776);
+    int16x8_t v1778 = vqrdmulhq_n_s16(v1777, 19705);
+    int16x8_t v1779 = vaddq_s16(v1773, v1778);
+    int16x8_t v1780 = vsubq_s16(v985, v1000);
+    int16x8_t v1781 = vsubq_s16(v1017, v1033);
+    int16x8_t v1782_tmp = vqrdmulhq_n_s16(v1781, 10045);
+    int16x8_t v1782 = vaddq_s16(v1782_tmp, v1781);
+    int16x8_t v1783 = vaddq_s16(v1780, v1782);
+    int16x8_t v1784 = vsubq_s16(v1038, v1041);
+    int16x8_t v1785 = vsubq_s16(v1044, v1054);
+    int16x8_t v1786_tmp = vqrdmulhq_n_s16(v1785, 10045);
+    int16x8_t v1786 = vaddq_s16(v1786_tmp, v1785);
+    int16x8_t v1787 = vaddq_s16(v1784, v1786);
+    int16x8_t v1788 = vqrdmulhq_n_s16(v1787, 19705);
+    int16x8_t v1789 = vaddq_s16(v1783, v1788);
+    int16x8_t v1790 = vqrdmulhq_n_s16(v1789, 17121);
+    int16x8_t v1791 = vaddq_s16(v1779, v1790);
+    int16x8_t v1792 = vsubq_s16(v1063, v1066);
+    int16x8_t v1793 = vsubq_s16(v1071, v1075);
+    int16x8_t v1794_tmp = vqrdmulhq_n_s16(v1793, 10045);
+    int16x8_t v1794 = vaddq_s16(v1794_tmp, v1793);
+    int16x8_t v1795 = vaddq_s16(v1792, v1794);
+    int16x8_t v1796 = vsubq_s16(v1082, v1089);
+    int16x8_t v1797 = vsubq_s16(v1092, v1097);
+    int16x8_t v1798_tmp = vqrdmulhq_n_s16(v1797, 10045);
+    int16x8_t v1798 = vaddq_s16(v1798_tmp, v1797);
+    int16x8_t v1799 = vaddq_s16(v1796, v1798);
+    int16x8_t v1800 = vqrdmulhq_n_s16(v1799, 19705);
+    int16x8_t v1801 = vaddq_s16(v1795, v1800);
+    int16x8_t v1802 = vsubq_s16(v1104, v1107);
+    int16x8_t v1803 = vsubq_s16(v1112, v1116);
+    int16x8_t v1804_tmp = vqrdmulhq_n_s16(v1803, 10045);
+    int16x8_t v1804 = vaddq_s16(v1804_tmp, v1803);
+    int16x8_t v1805 = vaddq_s16(v1802, v1804);
+    int16x8_t v1806 = vsubq_s16(v1121, v1124);
+    int16x8_t v1807 = vsubq_s16(v1127, v1135);
+    int16x8_t v1808_tmp = vqrdmulhq_n_s16(v1807, 10045);
+    int16x8_t v1808 = vaddq_s16(v1808_tmp, v1807);
+    int16x8_t v1809 = vaddq_s16(v1806, v1808);
+    int16x8_t v1810 = vqrdmulhq_n_s16(v1809, 19705);
+    int16x8_t v1811 = vaddq_s16(v1805, v1810);
+    int16x8_t v1812 = vqrdmulhq_n_s16(v1811, 17121);
+    int16x8_t v1813 = vaddq_s16(v1801, v1812);
+    int16x8_t v1814 = vqrdmulhq_n_s16(v1813, 16563);
+    int16x8_t v1815 = vaddq_s16(v1791, v1814);
+    int16x8_t v1816 = vqrdmulhq_n_s16(v1815, 16429);
+    int16x8_t v1817 = vaddq_s16(v1769, v1816);
+    int16x8_t v1818 = vsubq_s16(v1148, v1151);
+    int16x8_t v1819 = vsubq_s16(v1156, v1160);
+    int16x8_t v1820_tmp = vqrdmulhq_n_s16(v1819, 10045);
+    int16x8_t v1820 = vaddq_s16(v1820_tmp, v1819);
+    int16x8_t v1821 = vaddq_s16(v1818, v1820);
+    int16x8_t v1822 = vsubq_s16(v1167, v1174);
+    int16x8_t v1823 = vsubq_s16(v1177, v1182);
+    int16x8_t v1824_tmp = vqrdmulhq_n_s16(v1823, 10045);
+    int16x8_t v1824 = vaddq_s16(v1824_tmp, v1823);
+    int16x8_t v1825 = vaddq_s16(v1822, v1824);
+    int16x8_t v1826 = vqrdmulhq_n_s16(v1825, 19705);
+    int16x8_t v1827 = vaddq_s16(v1821, v1826);
+    int16x8_t v1828 = vsubq_s16(v1191, v1198);
+    int16x8_t v1829 = vsubq_s16(v1207, v1215);
+    int16x8_t v1830_tmp = vqrdmulhq_n_s16(v1829, 10045);
+    int16x8_t v1830 = vaddq_s16(v1830_tmp, v1829);
+    int16x8_t v1831 = vaddq_s16(v1828, v1830);
+    int16x8_t v1832 = vsubq_s16(v1220, v1223);
+    int16x8_t v1833 = vsubq_s16(v1226, v1232);
+    int16x8_t v1834_tmp = vqrdmulhq_n_s16(v1833, 10045);
+    int16x8_t v1834 = vaddq_s16(v1834_tmp, v1833);
+    int16x8_t v1835 = vaddq_s16(v1832, v1834);
+    int16x8_t v1836 = vqrdmulhq_n_s16(v1835, 19705);
+    int16x8_t v1837 = vaddq_s16(v1831, v1836);
+    int16x8_t v1838 = vqrdmulhq_n_s16(v1837, 17121);
+    int16x8_t v1839 = vaddq_s16(v1827, v1838);
+    int16x8_t v1840 = vsubq_s16(v1243, v1250);
+    int16x8_t v1841 = vsubq_s16(v1259, v1267);
+    int16x8_t v1842_tmp = vqrdmulhq_n_s16(v1841, 10045);
+    int16x8_t v1842 = vaddq_s16(v1842_tmp, v1841);
+    int16x8_t v1843 = vaddq_s16(v1840, v1842);
+    int16x8_t v1844 = vsubq_s16(v1278, v1293);
+    int16x8_t v1845 = vsubq_s16(v1296, v1305);
+    int16x8_t v1846_tmp = vqrdmulhq_n_s16(v1845, 10045);
+    int16x8_t v1846 = vaddq_s16(v1846_tmp, v1845);
+    int16x8_t v1847 = vaddq_s16(v1844, v1846);
+    int16x8_t v1848 = vqrdmulhq_n_s16(v1847, 19705);
+    int16x8_t v1849 = vaddq_s16(v1843, v1848);
+    int16x8_t v1850 = vsubq_s16(v1312, v1315);
+    int16x8_t v1851 = vsubq_s16(v1320, v1324);
+    int16x8_t v1852_tmp = vqrdmulhq_n_s16(v1851, 10045);
+    int16x8_t v1852 = vaddq_s16(v1852_tmp, v1851);
+    int16x8_t v1853 = vaddq_s16(v1850, v1852);
+    int16x8_t v1854 = vsubq_s16(v1329, v1332);
+    int16x8_t v1855 = vsubq_s16(v1335, v1342);
+    int16x8_t v1856_tmp = vqrdmulhq_n_s16(v1855, 10045);
+    int16x8_t v1856 = vaddq_s16(v1856_tmp, v1855);
+    int16x8_t v1857 = vaddq_s16(v1854, v1856);
+    int16x8_t v1858 = vqrdmulhq_n_s16(v1857, 19705);
+    int16x8_t v1859 = vaddq_s16(v1853, v1858);
+    int16x8_t v1860 = vqrdmulhq_n_s16(v1859, 17121);
+    int16x8_t v1861 = vaddq_s16(v1849, v1860);
+    int16x8_t v1862 = vqrdmulhq_n_s16(v1861, 16563);
+    int16x8_t v1863 = vaddq_s16(v1839, v1862);
+    int16x8_t v1864 = vsubq_s16(v1353, v1356);
+    int16x8_t v1865 = vsubq_s16(v1361, v1365);
+    int16x8_t v1866_tmp = vqrdmulhq_n_s16(v1865, 10045);
+    int16x8_t v1866 = vaddq_s16(v1866_tmp, v1865);
+    int16x8_t v1867 = vaddq_s16(v1864, v1866);
+    int16x8_t v1868 = vsubq_s16(v1372, v1379);
+    int16x8_t v1869 = vsubq_s16(v1382, v1387);
+    int16x8_t v1870_tmp = vqrdmulhq_n_s16(v1869, 10045);
+    int16x8_t v1870 = vaddq_s16(v1870_tmp, v1869);
+    int16x8_t v1871 = vaddq_s16(v1868, v1870);
+    int16x8_t v1872 = vqrdmulhq_n_s16(v1871, 19705);
+    int16x8_t v1873 = vaddq_s16(v1867, v1872);
+    int16x8_t v1874 = vsubq_s16(v1396, v1403);
+    int16x8_t v1875 = vsubq_s16(v1412, v1420);
+    int16x8_t v1876_tmp = vqrdmulhq_n_s16(v1875, 10045);
+    int16x8_t v1876 = vaddq_s16(v1876_tmp, v1875);
+    int16x8_t v1877 = vaddq_s16(v1874, v1876);
+    int16x8_t v1878 = vsubq_s16(v1425, v1428);
+    int16x8_t v1879 = vsubq_s16(v1431, v1437);
+    int16x8_t v1880_tmp = vqrdmulhq_n_s16(v1879, 10045);
+    int16x8_t v1880 = vaddq_s16(v1880_tmp, v1879);
+    int16x8_t v1881 = vaddq_s16(v1878, v1880);
+    int16x8_t v1882 = vqrdmulhq_n_s16(v1881, 19705);
+    int16x8_t v1883 = vaddq_s16(v1877, v1882);
+    int16x8_t v1884 = vqrdmulhq_n_s16(v1883, 17121);
+    int16x8_t v1885 = vaddq_s16(v1873, v1884);
+    int16x8_t v1886 = vsubq_s16(v1446, v1449);
+    int16x8_t v1887 = vsubq_s16(v1454, v1458);
+    int16x8_t v1888_tmp = vqrdmulhq_n_s16(v1887, 10045);
+    int16x8_t v1888 = vaddq_s16(v1888_tmp, v1887);
+    int16x8_t v1889 = vaddq_s16(v1886, v1888);
+    int16x8_t v1890 = vsubq_s16(v1465, v1472);
+    int16x8_t v1891 = vsubq_s16(v1475, v1480);
+    int16x8_t v1892_tmp = vqrdmulhq_n_s16(v1891, 10045);
+    int16x8_t v1892 = vaddq_s16(v1892_tmp, v1891);
+    int16x8_t v1893 = vaddq_s16(v1890, v1892);
+    int16x8_t v1894 = vqrdmulhq_n_s16(v1893, 19705);
+    int16x8_t v1895 = vaddq_s16(v1889, v1894);
+    int16x8_t v1896 = vsubq_s16(v1487, v1490);
+    int16x8_t v1897 = vsubq_s16(v1495, v1499);
+    int16x8_t v1898_tmp = vqrdmulhq_n_s16(v1897, 10045);
+    int16x8_t v1898 = vaddq_s16(v1898_tmp, v1897);
+    int16x8_t v1899 = vaddq_s16(v1896, v1898);
+    int16x8_t v1900 = vsubq_s16(v1504, v1507);
+    int16x8_t v1901 = vsubq_s16(v1510, v1518);
+    int16x8_t v1902_tmp = vqrdmulhq_n_s16(v1901, 10045);
+    int16x8_t v1902 = vaddq_s16(v1902_tmp, v1901);
+    int16x8_t v1903 = vaddq_s16(v1900, v1902);
+    int16x8_t v1904 = vqrdmulhq_n_s16(v1903, 19705);
+    int16x8_t v1905 = vaddq_s16(v1899, v1904);
+    int16x8_t v1906 = vqrdmulhq_n_s16(v1905, 17121);
+    int16x8_t v1907 = vaddq_s16(v1895, v1906);
+    int16x8_t v1908 = vqrdmulhq_n_s16(v1907, 16563);
+    int16x8_t v1909 = vaddq_s16(v1885, v1908);
+    int16x8_t v1910 = vqrdmulhq_n_s16(v1909, 16429);
+    int16x8_t v1911 = vaddq_s16(v1863, v1910);
+    int16x8_t v1912 = vqrdmulhq_n_s16(v1911, 16395);
+    int16x8_t v1913 = vaddq_s16(v1817, v1912);
+    int16x8_t v1914 = vqrdmulhq_n_s16(v1913, 16387);
+    int16x8_t v1915 = vaddq_s16(v1723, v1914);
+    int16x8_t v1916 = vsubq_s16(v1534, v1536);
+    int16x8_t v1917 = vsubq_s16(v1538, v1540);
+    int16x8_t v1918 = vqrdmulhq_n_s16(v1917, 29490);
+    int16x8_t v1919 = vaddq_s16(v1916, v1918);
+    int16x8_t v1920 = vsubq_s16(v1544, v1546);
+    int16x8_t v1921 = vsubq_s16(v1548, v1550);
+    int16x8_t v1922 = vqrdmulhq_n_s16(v1921, 29490);
+    int16x8_t v1923 = vaddq_s16(v1920, v1922);
+    int16x8_t v1924 = vqrdmulhq_n_s16(v1923, 18578);
+    int16x8_t v1925 = vaddq_s16(v1919, v1924);
+    int16x8_t v1926 = vsubq_s16(v1556, v1558);
+    int16x8_t v1927 = vsubq_s16(v1560, v1562);
+    int16x8_t v1928 = vqrdmulhq_n_s16(v1927, 29490);
+    int16x8_t v1929 = vaddq_s16(v1926, v1928);
+    int16x8_t v1930 = vsubq_s16(v1566, v1568);
+    int16x8_t v1931 = vsubq_s16(v1570, v1572);
+    int16x8_t v1932 = vqrdmulhq_n_s16(v1931, 29490);
+    int16x8_t v1933 = vaddq_s16(v1930, v1932);
+    int16x8_t v1934 = vqrdmulhq_n_s16(v1933, 18578);
+    int16x8_t v1935 = vaddq_s16(v1929, v1934);
+    int16x8_t v1936 = vqrdmulhq_n_s16(v1935, 16890);
+    int16x8_t v1937 = vaddq_s16(v1925, v1936);
+    int16x8_t v1938 = vsubq_s16(v1580, v1582);
+    int16x8_t v1939 = vsubq_s16(v1584, v1586);
+    int16x8_t v1940 = vqrdmulhq_n_s16(v1939, 29490);
+    int16x8_t v1941 = vaddq_s16(v1938, v1940);
+    int16x8_t v1942 = vsubq_s16(v1590, v1592);
+    int16x8_t v1943 = vsubq_s16(v1594, v1596);
+    int16x8_t v1944 = vqrdmulhq_n_s16(v1943, 29490);
+    int16x8_t v1945 = vaddq_s16(v1942, v1944);
+    int16x8_t v1946 = vqrdmulhq_n_s16(v1945, 18578);
+    int16x8_t v1947 = vaddq_s16(v1941, v1946);
+    int16x8_t v1948 = vsubq_s16(v1602, v1604);
+    int16x8_t v1949 = vsubq_s16(v1606, v1608);
+    int16x8_t v1950 = vqrdmulhq_n_s16(v1949, 29490);
+    int16x8_t v1951 = vaddq_s16(v1948, v1950);
+    int16x8_t v1952 = vsubq_s16(v1612, v1614);
+    int16x8_t v1953 = vsubq_s16(v1616, v1618);
+    int16x8_t v1954 = vqrdmulhq_n_s16(v1953, 29490);
+    int16x8_t v1955 = vaddq_s16(v1952, v1954);
+    int16x8_t v1956 = vqrdmulhq_n_s16(v1955, 18578);
+    int16x8_t v1957 = vaddq_s16(v1951, v1956);
+    int16x8_t v1958 = vqrdmulhq_n_s16(v1957, 16890);
+    int16x8_t v1959 = vaddq_s16(v1947, v1958);
+    int16x8_t v1960 = vqrdmulhq_n_s16(v1959, 16508);
+    int16x8_t v1961 = vaddq_s16(v1937, v1960);
+    int16x8_t v1962 = vsubq_s16(v1628, v1630);
+    int16x8_t v1963 = vsubq_s16(v1632, v1634);
+    int16x8_t v1964 = vqrdmulhq_n_s16(v1963, 29490);
+    int16x8_t v1965 = vaddq_s16(v1962, v1964);
+    int16x8_t v1966 = vsubq_s16(v1638, v1640);
+    int16x8_t v1967 = vsubq_s16(v1642, v1644);
+    int16x8_t v1968 = vqrdmulhq_n_s16(v1967, 29490);
+    int16x8_t v1969 = vaddq_s16(v1966, v1968);
+    int16x8_t v1970 = vqrdmulhq_n_s16(v1969, 18578);
+    int16x8_t v1971 = vaddq_s16(v1965, v1970);
+    int16x8_t v1972 = vsubq_s16(v1650, v1652);
+    int16x8_t v1973 = vsubq_s16(v1654, v1656);
+    int16x8_t v1974 = vqrdmulhq_n_s16(v1973, 29490);
+    int16x8_t v1975 = vaddq_s16(v1972, v1974);
+    int16x8_t v1976 = vsubq_s16(v1660, v1662);
+    int16x8_t v1977 = vsubq_s16(v1664, v1666);
+    int16x8_t v1978 = vqrdmulhq_n_s16(v1977, 29490);
+    int16x8_t v1979 = vaddq_s16(v1976, v1978);
+    int16x8_t v1980 = vqrdmulhq_n_s16(v1979, 18578);
+    int16x8_t v1981 = vaddq_s16(v1975, v1980);
+    int16x8_t v1982 = vqrdmulhq_n_s16(v1981, 16890);
+    int16x8_t v1983 = vaddq_s16(v1971, v1982);
+    int16x8_t v1984 = vsubq_s16(v1674, v1676);
+    int16x8_t v1985 = vsubq_s16(v1678, v1680);
+    int16x8_t v1986 = vqrdmulhq_n_s16(v1985, 29490);
+    int16x8_t v1987 = vaddq_s16(v1984, v1986);
+    int16x8_t v1988 = vsubq_s16(v1684, v1686);
+    int16x8_t v1989 = vsubq_s16(v1688, v1690);
+    int16x8_t v1990 = vqrdmulhq_n_s16(v1989, 29490);
+    int16x8_t v1991 = vaddq_s16(v1988, v1990);
+    int16x8_t v1992 = vqrdmulhq_n_s16(v1991, 18578);
+    int16x8_t v1993 = vaddq_s16(v1987, v1992);
+    int16x8_t v1994 = vsubq_s16(v1696, v1698);
+    int16x8_t v1995 = vsubq_s16(v1700, v1702);
+    int16x8_t v1996 = vqrdmulhq_n_s16(v1995, 29490);
+    int16x8_t v1997 = vaddq_s16(v1994, v1996);
+    int16x8_t v1998 = vsubq_s16(v1706, v1708);
+    int16x8_t v1999 = vsubq_s16(v1710, v1712);
+    int16x8_t v2000 = vqrdmulhq_n_s16(v1999, 29490);
+    int16x8_t v2001 = vaddq_s16(v1998, v2000);
+    int16x8_t v2002 = vqrdmulhq_n_s16(v2001, 18578);
+    int16x8_t v2003 = vaddq_s16(v1997, v2002);
+    int16x8_t v2004 = vqrdmulhq_n_s16(v2003, 16890);
+    int16x8_t v2005 = vaddq_s16(v1993, v2004);
+    int16x8_t v2006 = vqrdmulhq_n_s16(v2005, 16508);
+    int16x8_t v2007 = vaddq_s16(v1983, v2006);
+    int16x8_t v2008 = vqrdmulhq_n_s16(v2007, 16415);
+    int16x8_t v2009 = vaddq_s16(v1961, v2008);
+    int16x8_t v2010 = vsubq_s16(v1724, v1726);
+    int16x8_t v2011 = vsubq_s16(v1728, v1730);
+    int16x8_t v2012 = vqrdmulhq_n_s16(v2011, 29490);
+    int16x8_t v2013 = vaddq_s16(v2010, v2012);
+    int16x8_t v2014 = vsubq_s16(v1734, v1736);
+    int16x8_t v2015 = vsubq_s16(v1738, v1740);
+    int16x8_t v2016 = vqrdmulhq_n_s16(v2015, 29490);
+    int16x8_t v2017 = vaddq_s16(v2014, v2016);
+    int16x8_t v2018 = vqrdmulhq_n_s16(v2017, 18578);
+    int16x8_t v2019 = vaddq_s16(v2013, v2018);
+    int16x8_t v2020 = vsubq_s16(v1746, v1748);
+    int16x8_t v2021 = vsubq_s16(v1750, v1752);
+    int16x8_t v2022 = vqrdmulhq_n_s16(v2021, 29490);
+    int16x8_t v2023 = vaddq_s16(v2020, v2022);
+    int16x8_t v2024 = vsubq_s16(v1756, v1758);
+    int16x8_t v2025 = vsubq_s16(v1760, v1762);
+    int16x8_t v2026 = vqrdmulhq_n_s16(v2025, 29490);
+    int16x8_t v2027 = vaddq_s16(v2024, v2026);
+    int16x8_t v2028 = vqrdmulhq_n_s16(v2027, 18578);
+    int16x8_t v2029 = vaddq_s16(v2023, v2028);
+    int16x8_t v2030 = vqrdmulhq_n_s16(v2029, 16890);
+    int16x8_t v2031 = vaddq_s16(v2019, v2030);
+    int16x8_t v2032 = vsubq_s16(v1770, v1772);
+    int16x8_t v2033 = vsubq_s16(v1774, v1776);
+    int16x8_t v2034 = vqrdmulhq_n_s16(v2033, 29490);
+    int16x8_t v2035 = vaddq_s16(v2032, v2034);
+    int16x8_t v2036 = vsubq_s16(v1780, v1782);
+    int16x8_t v2037 = vsubq_s16(v1784, v1786);
+    int16x8_t v2038 = vqrdmulhq_n_s16(v2037, 29490);
+    int16x8_t v2039 = vaddq_s16(v2036, v2038);
+    int16x8_t v2040 = vqrdmulhq_n_s16(v2039, 18578);
+    int16x8_t v2041 = vaddq_s16(v2035, v2040);
+    int16x8_t v2042 = vsubq_s16(v1792, v1794);
+    int16x8_t v2043 = vsubq_s16(v1796, v1798);
+    int16x8_t v2044 = vqrdmulhq_n_s16(v2043, 29490);
+    int16x8_t v2045 = vaddq_s16(v2042, v2044);
+    int16x8_t v2046 = vsubq_s16(v1802, v1804);
+    int16x8_t v2047 = vsubq_s16(v1806, v1808);
+    int16x8_t v2048 = vqrdmulhq_n_s16(v2047, 29490);
+    int16x8_t v2049 = vaddq_s16(v2046, v2048);
+    int16x8_t v2050 = vqrdmulhq_n_s16(v2049, 18578);
+    int16x8_t v2051 = vaddq_s16(v2045, v2050);
+    int16x8_t v2052 = vqrdmulhq_n_s16(v2051, 16890);
+    int16x8_t v2053 = vaddq_s16(v2041, v2052);
+    int16x8_t v2054 = vqrdmulhq_n_s16(v2053, 16508);
+    int16x8_t v2055 = vaddq_s16(v2031, v2054);
+    int16x8_t v2056 = vsubq_s16(v1818, v1820);
+    int16x8_t v2057 = vsubq_s16(v1822, v1824);
+    int16x8_t v2058 = vqrdmulhq_n_s16(v2057, 29490);
+    int16x8_t v2059 = vaddq_s16(v2056, v2058);
+    int16x8_t v2060 = vsubq_s16(v1828, v1830);
+    int16x8_t v2061 = vsubq_s16(v1832, v1834);
+    int16x8_t v2062 = vqrdmulhq_n_s16(v2061, 29490);
+    int16x8_t v2063 = vaddq_s16(v2060, v2062);
+    int16x8_t v2064 = vqrdmulhq_n_s16(v2063, 18578);
+    int16x8_t v2065 = vaddq_s16(v2059, v2064);
+    int16x8_t v2066 = vsubq_s16(v1840, v1842);
+    int16x8_t v2067 = vsubq_s16(v1844, v1846);
+    int16x8_t v2068 = vqrdmulhq_n_s16(v2067, 29490);
+    int16x8_t v2069 = vaddq_s16(v2066, v2068);
+    int16x8_t v2070 = vsubq_s16(v1850, v1852);
+    int16x8_t v2071 = vqrdmulhq_n_s16(v2070, 18578);
+    int16x8_t v2072 = vsubq_s16(v1854, v1856);
+    int16x8_t v2073 = vqrdmulhq_n_s16(v2072, 16719);
+    int16x8_t v2074 = vaddq_s16(v2071, v2073);
+    int16x8_t v2075 = vaddq_s16(v2069, v2074);
+    int16x8_t v2076 = vqrdmulhq_n_s16(v2075, 16890);
+    int16x8_t v2077 = vaddq_s16(v2065, v2076);
+    int16x8_t v2078 = vsubq_s16(v1864, v1866);
+    int16x8_t v2079 = vsubq_s16(v1868, v1870);
+    int16x8_t v2080 = vqrdmulhq_n_s16(v2079, 29490);
+    int16x8_t v2081 = vaddq_s16(v2078, v2080);
+    int16x8_t v2082 = vsubq_s16(v1874, v1876);
+    int16x8_t v2083 = vsubq_s16(v1878, v1880);
+    int16x8_t v2084 = vqrdmulhq_n_s16(v2083, 29490);
+    int16x8_t v2085 = vaddq_s16(v2082, v2084);
+    int16x8_t v2086 = vqrdmulhq_n_s16(v2085, 18578);
+    int16x8_t v2087 = vaddq_s16(v2081, v2086);
+    int16x8_t v2088 = vsubq_s16(v1886, v1888);
+    int16x8_t v2089 = vsubq_s16(v1890, v1892);
+    int16x8_t v2090 = vqrdmulhq_n_s16(v2089, 29490);
+    int16x8_t v2091 = vaddq_s16(v2088, v2090);
+    int16x8_t v2092 = vsubq_s16(v1896, v1898);
+    int16x8_t v2093 = vsubq_s16(v1900, v1902);
+    int16x8_t v2094 = vqrdmulhq_n_s16(v2093, 29490);
+    int16x8_t v2095 = vaddq_s16(v2092, v2094);
+    int16x8_t v2096 = vqrdmulhq_n_s16(v2095, 18578);
+    int16x8_t v2097 = vaddq_s16(v2091, v2096);
+    int16x8_t v2098 = vqrdmulhq_n_s16(v2097, 16890);
+    int16x8_t v2099 = vaddq_s16(v2087, v2098);
+    int16x8_t v2100 = vqrdmulhq_n_s16(v2099, 16508);
+    int16x8_t v2101 = vaddq_s16(v2077, v2100);
+    int16x8_t v2102 = vqrdmulhq_n_s16(v2101, 16415);
+    int16x8_t v2103 = vaddq_s16(v2055, v2102);
+    int16x8_t v2104 = vqrdmulhq_n_s16(v2103, 16392);
+    int16x8_t v2105 = vaddq_s16(v2009, v2104);
+    int16x8_t v2106 = vsubq_s16(v2, v8);
+    int16x8_t v2107 = vsubq_s16(v15, v22);
+    int16x8_t v2108_tmp = vqrdmulhq_n_s16(v2107, 18446);
+    int16x8_t v2108 = vmlaq_n_s16(v2108_tmp, v2107, 2);
+    int16x8_t v2109 = vaddq_s16(v2106, v2108);
+    int16x8_t v2110 = vsubq_s16(v31, v41);
+    int16x8_t v2111 = vsubq_s16(v48, v56);
+    int16x8_t v2112_tmp = vqrdmulhq_n_s16(v2111, 18446);
+    int16x8_t v2112 = vmlaq_n_s16(v2112_tmp, v2111, 2);
+    int16x8_t v2113 = vaddq_s16(v2110, v2112);
+    int16x8_t v2114 = vqrdmulhq_n_s16(v2113, 21195);
+    int16x8_t v2115 = vaddq_s16(v2109, v2114);
+    int16x8_t v2116 = vsubq_s16(v67, v77);
+    int16x8_t v2117 = vsubq_s16(v90, v99);
+    int16x8_t v2118_tmp = vqrdmulhq_n_s16(v2117, 18446);
+    int16x8_t v2118 = vmlaq_n_s16(v2118_tmp, v2117, 2);
+    int16x8_t v2119 = vaddq_s16(v2116, v2118);
+    int16x8_t v2120 = vsubq_s16(v108, v118);
+    int16x8_t v2121 = vsubq_s16(v125, v134);
+    int16x8_t v2122_tmp = vqrdmulhq_n_s16(v2121, 18446);
+    int16x8_t v2122 = vmlaq_n_s16(v2122_tmp, v2121, 2);
+    int16x8_t v2123 = vaddq_s16(v2120, v2122);
+    int16x8_t v2124 = vqrdmulhq_n_s16(v2123, 21195);
+    int16x8_t v2125 = vaddq_s16(v2119, v2124);
+    int16x8_t v2126 = vqrdmulhq_n_s16(v2125, 17401);
+    int16x8_t v2127 = vaddq_s16(v2115, v2126);
+    int16x8_t v2128 = vsubq_s16(v147, v157);
+    int16x8_t v2129 = vsubq_s16(v170, v179);
+    int16x8_t v2130_tmp = vqrdmulhq_n_s16(v2129, 18446);
+    int16x8_t v2130 = vmlaq_n_s16(v2130_tmp, v2129, 2);
+    int16x8_t v2131 = vaddq_s16(v2128, v2130);
+    int16x8_t v2132 = vsubq_s16(v194, v212);
+    int16x8_t v2133 = vsubq_s16(v219, v229);
+    int16x8_t v2134_tmp = vqrdmulhq_n_s16(v2133, 18446);
+    int16x8_t v2134 = vmlaq_n_s16(v2134_tmp, v2133, 2);
+    int16x8_t v2135 = vaddq_s16(v2132, v2134);
+    int16x8_t v2136 = vqrdmulhq_n_s16(v2135, 21195);
+    int16x8_t v2137 = vaddq_s16(v2131, v2136);
+    int16x8_t v2138 = vsubq_s16(v240, v250);
+    int16x8_t v2139 = vsubq_s16(v263, v272);
+    int16x8_t v2140_tmp = vqrdmulhq_n_s16(v2139, 18446);
+    int16x8_t v2140 = vmlaq_n_s16(v2140_tmp, v2139, 2);
+    int16x8_t v2141 = vaddq_s16(v2138, v2140);
+    int16x8_t v2142 = vsubq_s16(v281, v291);
+    int16x8_t v2143 = vsubq_s16(v298, v308);
+    int16x8_t v2144_tmp = vqrdmulhq_n_s16(v2143, 18446);
+    int16x8_t v2144 = vmlaq_n_s16(v2144_tmp, v2143, 2);
+    int16x8_t v2145 = vaddq_s16(v2142, v2144);
+    int16x8_t v2146 = vqrdmulhq_n_s16(v2145, 21195);
+    int16x8_t v2147 = vaddq_s16(v2141, v2146);
+    int16x8_t v2148 = vqrdmulhq_n_s16(v2147, 17401);
+    int16x8_t v2149 = vaddq_s16(v2137, v2148);
+    int16x8_t v2150 = vqrdmulhq_n_s16(v2149, 16629);
+    int16x8_t v2151 = vaddq_s16(v2127, v2150);
+    int16x8_t v2152 = vsubq_s16(v323, v333);
+    int16x8_t v2153 = vsubq_s16(v346, v355);
+    int16x8_t v2154_tmp = vqrdmulhq_n_s16(v2153, 18446);
+    int16x8_t v2154 = vmlaq_n_s16(v2154_tmp, v2153, 2);
+    int16x8_t v2155 = vaddq_s16(v2152, v2154);
+    int16x8_t v2156 = vsubq_s16(v370, v388);
+    int16x8_t v2157 = vsubq_s16(v395, v405);
+    int16x8_t v2158_tmp = vqrdmulhq_n_s16(v2157, 18446);
+    int16x8_t v2158 = vmlaq_n_s16(v2158_tmp, v2157, 2);
+    int16x8_t v2159 = vaddq_s16(v2156, v2158);
+    int16x8_t v2160 = vqrdmulhq_n_s16(v2159, 21195);
+    int16x8_t v2161 = vaddq_s16(v2155, v2160);
+    int16x8_t v2162 = vsubq_s16(v422, v440);
+    int16x8_t v2163 = vsubq_s16(v465, v478);
+    int16x8_t v2164_tmp = vqrdmulhq_n_s16(v2163, 18446);
+    int16x8_t v2164 = vmlaq_n_s16(v2164_tmp, v2163, 2);
+    int16x8_t v2165 = vaddq_s16(v2162, v2164);
+    int16x8_t v2166 = vsubq_s16(v487, v497);
+    int16x8_t v2167 = vsubq_s16(v504, v515);
+    int16x8_t v2168_tmp = vqrdmulhq_n_s16(v2167, 18446);
+    int16x8_t v2168 = vmlaq_n_s16(v2168_tmp, v2167, 2);
+    int16x8_t v2169 = vaddq_s16(v2166, v2168);
+    int16x8_t v2170 = vqrdmulhq_n_s16(v2169, 21195);
+    int16x8_t v2171 = vaddq_s16(v2165, v2170);
+    int16x8_t v2172 = vqrdmulhq_n_s16(v2171, 17401);
+    int16x8_t v2173 = vaddq_s16(v2161, v2172);
+    int16x8_t v2174 = vsubq_s16(v528, v538);
+    int16x8_t v2175 = vsubq_s16(v551, v560);
+    int16x8_t v2176_tmp = vqrdmulhq_n_s16(v2175, 18446);
+    int16x8_t v2176 = vmlaq_n_s16(v2176_tmp, v2175, 2);
+    int16x8_t v2177 = vaddq_s16(v2174, v2176);
+    int16x8_t v2178 = vsubq_s16(v575, v593);
+    int16x8_t v2179 = vsubq_s16(v600, v610);
+    int16x8_t v2180_tmp = vqrdmulhq_n_s16(v2179, 18446);
+    int16x8_t v2180 = vmlaq_n_s16(v2180_tmp, v2179, 2);
+    int16x8_t v2181 = vaddq_s16(v2178, v2180);
+    int16x8_t v2182 = vqrdmulhq_n_s16(v2181, 21195);
+    int16x8_t v2183 = vaddq_s16(v2177, v2182);
+    int16x8_t v2184 = vsubq_s16(v621, v631);
+    int16x8_t v2185 = vsubq_s16(v644, v653);
+    int16x8_t v2186_tmp = vqrdmulhq_n_s16(v2185, 18446);
+    int16x8_t v2186 = vmlaq_n_s16(v2186_tmp, v2185, 2);
+    int16x8_t v2187 = vaddq_s16(v2184, v2186);
+    int16x8_t v2188 = vsubq_s16(v662, v672);
+    int16x8_t v2189 = vsubq_s16(v679, v690);
+    int16x8_t v2190_tmp = vqrdmulhq_n_s16(v2189, 18446);
+    int16x8_t v2190 = vmlaq_n_s16(v2190_tmp, v2189, 2);
+    int16x8_t v2191 = vaddq_s16(v2188, v2190);
+    int16x8_t v2192 = vqrdmulhq_n_s16(v2191, 21195);
+    int16x8_t v2193 = vaddq_s16(v2187, v2192);
+    int16x8_t v2194 = vqrdmulhq_n_s16(v2193, 17401);
+    int16x8_t v2195 = vaddq_s16(v2183, v2194);
+    int16x8_t v2196 = vqrdmulhq_n_s16(v2195, 16629);
+    int16x8_t v2197 = vaddq_s16(v2173, v2196);
+    int16x8_t v2198 = vqrdmulhq_n_s16(v2197, 16445);
+    int16x8_t v2199 = vaddq_s16(v2151, v2198);
+    int16x8_t v2200 = vsubq_s16(v707, v717);
+    int16x8_t v2201 = vsubq_s16(v730, v739);
+    int16x8_t v2202_tmp = vqrdmulhq_n_s16(v2201, 18446);
+    int16x8_t v2202 = vmlaq_n_s16(v2202_tmp, v2201, 2);
+    int16x8_t v2203 = vaddq_s16(v2200, v2202);
+    int16x8_t v2204 = vsubq_s16(v754, v772);
+    int16x8_t v2205 = vsubq_s16(v779, v789);
+    int16x8_t v2206_tmp = vqrdmulhq_n_s16(v2205, 18446);
+    int16x8_t v2206 = vmlaq_n_s16(v2206_tmp, v2205, 2);
+    int16x8_t v2207 = vaddq_s16(v2204, v2206);
+    int16x8_t v2208 = vqrdmulhq_n_s16(v2207, 21195);
+    int16x8_t v2209 = vaddq_s16(v2203, v2208);
+    int16x8_t v2210 = vsubq_s16(v806, v824);
+    int16x8_t v2211 = vsubq_s16(v849, v862);
+    int16x8_t v2212_tmp = vqrdmulhq_n_s16(v2211, 18446);
+    int16x8_t v2212 = vmlaq_n_s16(v2212_tmp, v2211, 2);
+    int16x8_t v2213 = vaddq_s16(v2210, v2212);
+    int16x8_t v2214 = vsubq_s16(v871, v881);
+    int16x8_t v2215 = vsubq_s16(v888, v899);
+    int16x8_t v2216_tmp = vqrdmulhq_n_s16(v2215, 18446);
+    int16x8_t v2216 = vmlaq_n_s16(v2216_tmp, v2215, 2);
+    int16x8_t v2217 = vaddq_s16(v2214, v2216);
+    int16x8_t v2218 = vqrdmulhq_n_s16(v2217, 21195);
+    int16x8_t v2219 = vaddq_s16(v2213, v2218);
+    int16x8_t v2220 = vqrdmulhq_n_s16(v2219, 17401);
+    int16x8_t v2221 = vaddq_s16(v2209, v2220);
+    int16x8_t v2222 = vsubq_s16(v918, v936);
+    int16x8_t v2223 = vsubq_s16(v961, v974);
+    int16x8_t v2224_tmp = vqrdmulhq_n_s16(v2223, 18446);
+    int16x8_t v2224 = vmlaq_n_s16(v2224_tmp, v2223, 2);
+    int16x8_t v2225 = vaddq_s16(v2222, v2224);
+    int16x8_t v2226 = vsubq_s16(v1001, v1035);
+    int16x8_t v2227 = vsubq_s16(v1042, v1056);
+    int16x8_t v2228_tmp = vqrdmulhq_n_s16(v2227, 18446);
+    int16x8_t v2228 = vmlaq_n_s16(v2228_tmp, v2227, 2);
+    int16x8_t v2229 = vaddq_s16(v2226, v2228);
+    int16x8_t v2230 = vqrdmulhq_n_s16(v2229, 21195);
+    int16x8_t v2231 = vaddq_s16(v2225, v2230);
+    int16x8_t v2232 = vsubq_s16(v1067, v1077);
+    int16x8_t v2233 = vsubq_s16(v1090, v1099);
+    int16x8_t v2234_tmp = vqrdmulhq_n_s16(v2233, 18446);
+    int16x8_t v2234 = vmlaq_n_s16(v2234_tmp, v2233, 2);
+    int16x8_t v2235 = vaddq_s16(v2232, v2234);
+    int16x8_t v2236 = vsubq_s16(v1108, v1118);
+    int16x8_t v2237 = vsubq_s16(v1125, v1137);
+    int16x8_t v2238_tmp = vqrdmulhq_n_s16(v2237, 18446);
+    int16x8_t v2238 = vmlaq_n_s16(v2238_tmp, v2237, 2);
+    int16x8_t v2239 = vaddq_s16(v2236, v2238);
+    int16x8_t v2240 = vqrdmulhq_n_s16(v2239, 21195);
+    int16x8_t v2241 = vaddq_s16(v2235, v2240);
+    int16x8_t v2242 = vqrdmulhq_n_s16(v2241, 17401);
+    int16x8_t v2243 = vaddq_s16(v2231, v2242);
+    int16x8_t v2244 = vqrdmulhq_n_s16(v2243, 16629);
+    int16x8_t v2245 = vaddq_s16(v2221, v2244);
+    int16x8_t v2246 = vsubq_s16(v1152, v1162);
+    int16x8_t v2247 = vsubq_s16(v1175, v1184);
+    int16x8_t v2248_tmp = vqrdmulhq_n_s16(v2247, 18446);
+    int16x8_t v2248 = vmlaq_n_s16(v2248_tmp, v2247, 2);
+    int16x8_t v2249 = vaddq_s16(v2246, v2248);
+    int16x8_t v2250 = vsubq_s16(v1199, v1217);
+    int16x8_t v2251 = vsubq_s16(v1224, v1234);
+    int16x8_t v2252_tmp = vqrdmulhq_n_s16(v2251, 18446);
+    int16x8_t v2252 = vmlaq_n_s16(v2252_tmp, v2251, 2);
+    int16x8_t v2253 = vaddq_s16(v2250, v2252);
+    int16x8_t v2254 = vqrdmulhq_n_s16(v2253, 21195);
+    int16x8_t v2255 = vaddq_s16(v2249, v2254);
+    int16x8_t v2256 = vsubq_s16(v1251, v1269);
+    int16x8_t v2257 = vsubq_s16(v1294, v1307);
+    int16x8_t v2258_tmp = vqrdmulhq_n_s16(v2257, 18446);
+    int16x8_t v2258 = vmlaq_n_s16(v2258_tmp, v2257, 2);
+    int16x8_t v2259 = vaddq_s16(v2256, v2258);
+    int16x8_t v2260 = vsubq_s16(v1316, v1326);
+    int16x8_t v2261 = vsubq_s16(v1333, v1344);
+    int16x8_t v2262_tmp = vqrdmulhq_n_s16(v2261, 18446);
+    int16x8_t v2262 = vmlaq_n_s16(v2262_tmp, v2261, 2);
+    int16x8_t v2263 = vaddq_s16(v2260, v2262);
+    int16x8_t v2264 = vqrdmulhq_n_s16(v2263, 21195);
+    int16x8_t v2265 = vaddq_s16(v2259, v2264);
+    int16x8_t v2266 = vqrdmulhq_n_s16(v2265, 17401);
+    int16x8_t v2267 = vaddq_s16(v2255, v2266);
+    int16x8_t v2268 = vsubq_s16(v1357, v1367);
+    int16x8_t v2269 = vsubq_s16(v1380, v1389);
+    int16x8_t v2270_tmp = vqrdmulhq_n_s16(v2269, 18446);
+    int16x8_t v2270 = vmlaq_n_s16(v2270_tmp, v2269, 2);
+    int16x8_t v2271 = vaddq_s16(v2268, v2270);
+    int16x8_t v2272 = vsubq_s16(v1404, v1422);
+    int16x8_t v2273 = vsubq_s16(v1429, v1439);
+    int16x8_t v2274_tmp = vqrdmulhq_n_s16(v2273, 18446);
+    int16x8_t v2274 = vmlaq_n_s16(v2274_tmp, v2273, 2);
+    int16x8_t v2275 = vaddq_s16(v2272, v2274);
+    int16x8_t v2276 = vqrdmulhq_n_s16(v2275, 21195);
+    int16x8_t v2277 = vaddq_s16(v2271, v2276);
+    int16x8_t v2278 = vsubq_s16(v1450, v1460);
+    int16x8_t v2279 = vsubq_s16(v1473, v1482);
+    int16x8_t v2280_tmp = vqrdmulhq_n_s16(v2279, 18446);
+    int16x8_t v2280 = vmlaq_n_s16(v2280_tmp, v2279, 2);
+    int16x8_t v2281 = vaddq_s16(v2278, v2280);
+    int16x8_t v2282 = vsubq_s16(v1491, v1501);
+    int16x8_t v2283 = vsubq_s16(v1508, v1520);
+    int16x8_t v2284_tmp = vqrdmulhq_n_s16(v2283, 18446);
+    int16x8_t v2284 = vmlaq_n_s16(v2284_tmp, v2283, 2);
+    int16x8_t v2285 = vaddq_s16(v2282, v2284);
+    int16x8_t v2286 = vqrdmulhq_n_s16(v2285, 21195);
+    int16x8_t v2287 = vaddq_s16(v2281, v2286);
+    int16x8_t v2288 = vqrdmulhq_n_s16(v2287, 17401);
+    int16x8_t v2289 = vaddq_s16(v2277, v2288);
+    int16x8_t v2290 = vqrdmulhq_n_s16(v2289, 16629);
+    int16x8_t v2291 = vaddq_s16(v2267, v2290);
+    int16x8_t v2292 = vqrdmulhq_n_s16(v2291, 16445);
+    int16x8_t v2293 = vaddq_s16(v2245, v2292);
+    int16x8_t v2294 = vqrdmulhq_n_s16(v2293, 16399);
+    int16x8_t v2295 = vaddq_s16(v2199, v2294);
+    int16x8_t v2296 = vsubq_s16(v2106, v2108);
+    int16x8_t v2297 = vsubq_s16(v2110, v2112);
+    int16x8_t v2298 = vqrdmulhq_n_s16(v2297, 25826);
+    int16x8_t v2299 = vaddq_s16(v2296, v2298);
+    int16x8_t v2300 = vsubq_s16(v2116, v2118);
+    int16x8_t v2301 = vsubq_s16(v2120, v2122);
+    int16x8_t v2302 = vqrdmulhq_n_s16(v2301, 25826);
+    int16x8_t v2303 = vaddq_s16(v2300, v2302);
+    int16x8_t v2304 = vqrdmulhq_n_s16(v2303, 18124);
+    int16x8_t v2305 = vaddq_s16(v2299, v2304);
+    int16x8_t v2306 = vsubq_s16(v2128, v2130);
+    int16x8_t v2307 = vsubq_s16(v2132, v2134);
+    int16x8_t v2308 = vqrdmulhq_n_s16(v2307, 25826);
+    int16x8_t v2309 = vaddq_s16(v2306, v2308);
+    int16x8_t v2310 = vsubq_s16(v2138, v2140);
+    int16x8_t v2311 = vsubq_s16(v2142, v2144);
+    int16x8_t v2312 = vqrdmulhq_n_s16(v2311, 25826);
+    int16x8_t v2313 = vaddq_s16(v2310, v2312);
+    int16x8_t v2314 = vqrdmulhq_n_s16(v2313, 18124);
+    int16x8_t v2315 = vaddq_s16(v2309, v2314);
+    int16x8_t v2316 = vqrdmulhq_n_s16(v2315, 16792);
+    int16x8_t v2317 = vaddq_s16(v2305, v2316);
+    int16x8_t v2318 = vsubq_s16(v2152, v2154);
+    int16x8_t v2319 = vsubq_s16(v2156, v2158);
+    int16x8_t v2320 = vqrdmulhq_n_s16(v2319, 25826);
+    int16x8_t v2321 = vaddq_s16(v2318, v2320);
+    int16x8_t v2322 = vsubq_s16(v2162, v2164);
+    int16x8_t v2323 = vsubq_s16(v2166, v2168);
+    int16x8_t v2324 = vqrdmulhq_n_s16(v2323, 25826);
+    int16x8_t v2325 = vaddq_s16(v2322, v2324);
+    int16x8_t v2326 = vqrdmulhq_n_s16(v2325, 18124);
+    int16x8_t v2327 = vaddq_s16(v2321, v2326);
+    int16x8_t v2328 = vsubq_s16(v2174, v2176);
+    int16x8_t v2329 = vsubq_s16(v2178, v2180);
+    int16x8_t v2330 = vqrdmulhq_n_s16(v2329, 25826);
+    int16x8_t v2331 = vaddq_s16(v2328, v2330);
+    int16x8_t v2332 = vsubq_s16(v2184, v2186);
+    int16x8_t v2333 = vsubq_s16(v2188, v2190);
+    int16x8_t v2334 = vqrdmulhq_n_s16(v2333, 25826);
+    int16x8_t v2335 = vaddq_s16(v2332, v2334);
+    int16x8_t v2336 = vqrdmulhq_n_s16(v2335, 18124);
+    int16x8_t v2337 = vaddq_s16(v2331, v2336);
+    int16x8_t v2338 = vqrdmulhq_n_s16(v2337, 16792);
+    int16x8_t v2339 = vaddq_s16(v2327, v2338);
+    int16x8_t v2340 = vqrdmulhq_n_s16(v2339, 16484);
+    int16x8_t v2341 = vaddq_s16(v2317, v2340);
+    int16x8_t v2342 = vsubq_s16(v2200, v2202);
+    int16x8_t v2343 = vsubq_s16(v2204, v2206);
+    int16x8_t v2344 = vqrdmulhq_n_s16(v2343, 25826);
+    int16x8_t v2345 = vaddq_s16(v2342, v2344);
+    int16x8_t v2346 = vsubq_s16(v2210, v2212);
+    int16x8_t v2347 = vsubq_s16(v2214, v2216);
+    int16x8_t v2348 = vqrdmulhq_n_s16(v2347, 25826);
+    int16x8_t v2349 = vaddq_s16(v2346, v2348);
+    int16x8_t v2350 = vqrdmulhq_n_s16(v2349, 18124);
+    int16x8_t v2351 = vaddq_s16(v2345, v2350);
+    int16x8_t v2352 = vsubq_s16(v2222, v2224);
+    int16x8_t v2353 = vsubq_s16(v2226, v2228);
+    int16x8_t v2354 = vqrdmulhq_n_s16(v2353, 25826);
+    int16x8_t v2355 = vaddq_s16(v2352, v2354);
+    int16x8_t v2356 = vsubq_s16(v2232, v2234);
+    int16x8_t v2357 = vsubq_s16(v2236, v2238);
+    int16x8_t v2358 = vqrdmulhq_n_s16(v2357, 25826);
+    int16x8_t v2359 = vaddq_s16(v2356, v2358);
+    int16x8_t v2360 = vqrdmulhq_n_s16(v2359, 18124);
+    int16x8_t v2361 = vaddq_s16(v2355, v2360);
+    int16x8_t v2362 = vqrdmulhq_n_s16(v2361, 16792);
+    int16x8_t v2363 = vaddq_s16(v2351, v2362);
+    int16x8_t v2364 = vsubq_s16(v2246, v2248);
+    int16x8_t v2365 = vsubq_s16(v2250, v2252);
+    int16x8_t v2366 = vqrdmulhq_n_s16(v2365, 25826);
+    int16x8_t v2367 = vaddq_s16(v2364, v2366);
+    int16x8_t v2368 = vsubq_s16(v2256, v2258);
+    int16x8_t v2369 = vsubq_s16(v2260, v2262);
+    int16x8_t v2370 = vqrdmulhq_n_s16(v2369, 25826);
+    int16x8_t v2371 = vaddq_s16(v2368, v2370);
+    int16x8_t v2372 = vqrdmulhq_n_s16(v2371, 18124);
+    int16x8_t v2373 = vaddq_s16(v2367, v2372);
+    int16x8_t v2374 = vsubq_s16(v2268, v2270);
+    int16x8_t v2375 = vsubq_s16(v2272, v2274);
+    int16x8_t v2376 = vqrdmulhq_n_s16(v2375, 25826);
+    int16x8_t v2377 = vaddq_s16(v2374, v2376);
+    int16x8_t v2378 = vsubq_s16(v2278, v2280);
+    int16x8_t v2379 = vsubq_s16(v2282, v2284);
+    int16x8_t v2380 = vqrdmulhq_n_s16(v2379, 25826);
+    int16x8_t v2381 = vaddq_s16(v2378, v2380);
+    int16x8_t v2382 = vqrdmulhq_n_s16(v2381, 18124);
+    int16x8_t v2383 = vaddq_s16(v2377, v2382);
+    int16x8_t v2384 = vqrdmulhq_n_s16(v2383, 16792);
+    int16x8_t v2385 = vaddq_s16(v2373, v2384);
+    int16x8_t v2386 = vqrdmulhq_n_s16(v2385, 16484);
+    int16x8_t v2387 = vaddq_s16(v2363, v2386);
+    int16x8_t v2388 = vqrdmulhq_n_s16(v2387, 16409);
+    int16x8_t v2389 = vaddq_s16(v2341, v2388);
+    int16x8_t v2390 = vsubq_s16(v1916, v1918);
+    int16x8_t v2391 = vsubq_s16(v1920, v1922);
+    int16x8_t v2392_tmp = vqrdmulhq_n_s16(v2391, 1988);
+    int16x8_t v2392 = vaddq_s16(v2392_tmp, v2391);
+    int16x8_t v2393 = vaddq_s16(v2390, v2392);
+    int16x8_t v2394 = vsubq_s16(v1926, v1928);
+    int16x8_t v2395 = vsubq_s16(v1930, v1932);
+    int16x8_t v2396_tmp = vqrdmulhq_n_s16(v2395, 1988);
+    int16x8_t v2396 = vaddq_s16(v2396_tmp, v2395);
+    int16x8_t v2397 = vaddq_s16(v2394, v2396);
+    int16x8_t v2398 = vqrdmulhq_n_s16(v2397, 19102);
+    int16x8_t v2399 = vaddq_s16(v2393, v2398);
+    int16x8_t v2400 = vsubq_s16(v1938, v1940);
+    int16x8_t v2401 = vsubq_s16(v1942, v1944);
+    int16x8_t v2402_tmp = vqrdmulhq_n_s16(v2401, 1988);
+    int16x8_t v2402 = vaddq_s16(v2402_tmp, v2401);
+    int16x8_t v2403 = vaddq_s16(v2400, v2402);
+    int16x8_t v2404 = vsubq_s16(v1948, v1950);
+    int16x8_t v2405 = vsubq_s16(v1952, v1954);
+    int16x8_t v2406_tmp = vqrdmulhq_n_s16(v2405, 1988);
+    int16x8_t v2406 = vaddq_s16(v2406_tmp, v2405);
+    int16x8_t v2407 = vaddq_s16(v2404, v2406);
+    int16x8_t v2408 = vqrdmulhq_n_s16(v2407, 19102);
+    int16x8_t v2409 = vaddq_s16(v2403, v2408);
+    int16x8_t v2410 = vqrdmulhq_n_s16(v2409, 17000);
+    int16x8_t v2411 = vaddq_s16(v2399, v2410);
+    int16x8_t v2412 = vsubq_s16(v1962, v1964);
+    int16x8_t v2413 = vsubq_s16(v1966, v1968);
+    int16x8_t v2414_tmp = vqrdmulhq_n_s16(v2413, 1988);
+    int16x8_t v2414 = vaddq_s16(v2414_tmp, v2413);
+    int16x8_t v2415 = vaddq_s16(v2412, v2414);
+    int16x8_t v2416 = vsubq_s16(v1972, v1974);
+    int16x8_t v2417 = vsubq_s16(v1976, v1978);
+    int16x8_t v2418_tmp = vqrdmulhq_n_s16(v2417, 1988);
+    int16x8_t v2418 = vaddq_s16(v2418_tmp, v2417);
+    int16x8_t v2419 = vaddq_s16(v2416, v2418);
+    int16x8_t v2420 = vqrdmulhq_n_s16(v2419, 19102);
+    int16x8_t v2421 = vaddq_s16(v2415, v2420);
+    int16x8_t v2422 = vsubq_s16(v1984, v1986);
+    int16x8_t v2423 = vsubq_s16(v1988, v1990);
+    int16x8_t v2424_tmp = vqrdmulhq_n_s16(v2423, 1988);
+    int16x8_t v2424 = vaddq_s16(v2424_tmp, v2423);
+    int16x8_t v2425 = vaddq_s16(v2422, v2424);
+    int16x8_t v2426 = vsubq_s16(v1994, v1996);
+    int16x8_t v2427 = vsubq_s16(v1998, v2000);
+    int16x8_t v2428_tmp = vqrdmulhq_n_s16(v2427, 1988);
+    int16x8_t v2428 = vaddq_s16(v2428_tmp, v2427);
+    int16x8_t v2429 = vaddq_s16(v2426, v2428);
+    int16x8_t v2430 = vqrdmulhq_n_s16(v2429, 19102);
+    int16x8_t v2431 = vaddq_s16(v2425, v2430);
+    int16x8_t v2432 = vqrdmulhq_n_s16(v2431, 17000);
+    int16x8_t v2433 = vaddq_s16(v2421, v2432);
+    int16x8_t v2434 = vqrdmulhq_n_s16(v2433, 16534);
+    int16x8_t v2435 = vaddq_s16(v2411, v2434);
+    int16x8_t v2436 = vsubq_s16(v2010, v2012);
+    int16x8_t v2437 = vsubq_s16(v2014, v2016);
+    int16x8_t v2438_tmp = vqrdmulhq_n_s16(v2437, 1988);
+    int16x8_t v2438 = vaddq_s16(v2438_tmp, v2437);
+    int16x8_t v2439 = vaddq_s16(v2436, v2438);
+    int16x8_t v2440 = vsubq_s16(v2020, v2022);
+    int16x8_t v2441 = vsubq_s16(v2024, v2026);
+    int16x8_t v2442_tmp = vqrdmulhq_n_s16(v2441, 1988);
+    int16x8_t v2442 = vaddq_s16(v2442_tmp, v2441);
+    int16x8_t v2443 = vaddq_s16(v2440, v2442);
+    int16x8_t v2444 = vqrdmulhq_n_s16(v2443, 19102);
+    int16x8_t v2445 = vaddq_s16(v2439, v2444);
+    int16x8_t v2446 = vsubq_s16(v2032, v2034);
+    int16x8_t v2447 = vsubq_s16(v2036, v2038);
+    int16x8_t v2448_tmp = vqrdmulhq_n_s16(v2447, 1988);
+    int16x8_t v2448 = vaddq_s16(v2448_tmp, v2447);
+    int16x8_t v2449 = vaddq_s16(v2446, v2448);
+    int16x8_t v2450 = vsubq_s16(v2042, v2044);
+    int16x8_t v2451 = vsubq_s16(v2046, v2048);
+    int16x8_t v2452_tmp = vqrdmulhq_n_s16(v2451, 1988);
+    int16x8_t v2452 = vaddq_s16(v2452_tmp, v2451);
+    int16x8_t v2453 = vaddq_s16(v2450, v2452);
+    int16x8_t v2454 = vqrdmulhq_n_s16(v2453, 19102);
+    int16x8_t v2455 = vaddq_s16(v2449, v2454);
+    int16x8_t v2456 = vqrdmulhq_n_s16(v2455, 17000);
+    int16x8_t v2457 = vaddq_s16(v2445, v2456);
+    int16x8_t v2458 = vsubq_s16(v2056, v2058);
+    int16x8_t v2459 = vsubq_s16(v2060, v2062);
+    int16x8_t v2460_tmp = vqrdmulhq_n_s16(v2459, 1988);
+    int16x8_t v2460 = vaddq_s16(v2460_tmp, v2459);
+    int16x8_t v2461 = vaddq_s16(v2458, v2460);
+    int16x8_t v2462 = vsubq_s16(v2066, v2068);
+    int16x8_t v2463 = vqrdmulhq_n_s16(v2072, 29490);
+    int16x8_t v2464 = vsubq_s16(v2070, v2463);
+    int16x8_t v2465_tmp = vqrdmulhq_n_s16(v2464, 1988);
+    int16x8_t v2465 = vaddq_s16(v2465_tmp, v2464);
+    int16x8_t v2466 = vaddq_s16(v2462, v2465);
+    int16x8_t v2467 = vqrdmulhq_n_s16(v2466, 19102);
+    int16x8_t v2468 = vaddq_s16(v2461, v2467);
+    int16x8_t v2469 = vsubq_s16(v2078, v2080);
+    int16x8_t v2470 = vsubq_s16(v2082, v2084);
+    int16x8_t v2471_tmp = vqrdmulhq_n_s16(v2470, 1988);
+    int16x8_t v2471 = vaddq_s16(v2471_tmp, v2470);
+    int16x8_t v2472 = vaddq_s16(v2469, v2471);
+    int16x8_t v2473 = vsubq_s16(v2088, v2090);
+    int16x8_t v2474 = vsubq_s16(v2092, v2094);
+    int16x8_t v2475_tmp = vqrdmulhq_n_s16(v2474, 1988);
+    int16x8_t v2475 = vaddq_s16(v2475_tmp, v2474);
+    int16x8_t v2476 = vaddq_s16(v2473, v2475);
+    int16x8_t v2477 = vqrdmulhq_n_s16(v2476, 19102);
+    int16x8_t v2478 = vaddq_s16(v2472, v2477);
+    int16x8_t v2479 = vqrdmulhq_n_s16(v2478, 17000);
+    int16x8_t v2480 = vaddq_s16(v2468, v2479);
+    int16x8_t v2481 = vqrdmulhq_n_s16(v2480, 16534);
+    int16x8_t v2482 = vaddq_s16(v2457, v2481);
+    int16x8_t v2483 = vqrdmulhq_n_s16(v2482, 16421);
+    int16x8_t v2484 = vaddq_s16(v2435, v2483);
+    int16x8_t v2485 = vsubq_s16(v1537, v1542);
+    int16x8_t v2486 = vsubq_s16(v1547, v1552);
+    int16x8_t v2487_tmp = vqrdmulhq_n_s16(v2486, 23673);
+    int16x8_t v2487 = vaddq_s16(v2487_tmp, v2486);
+    int16x8_t v2488 = vaddq_s16(v2485, v2487);
+    int16x8_t v2489 = vsubq_s16(v1559, v1564);
+    int16x8_t v2490 = vsubq_s16(v1569, v1574);
+    int16x8_t v2491_tmp = vqrdmulhq_n_s16(v2490, 23673);
+    int16x8_t v2491 = vaddq_s16(v2491_tmp, v2490);
+    int16x8_t v2492 = vaddq_s16(v2489, v2491);
+    int16x8_t v2493 = vqrdmulhq_n_s16(v2492, 20398);
+    int16x8_t v2494 = vaddq_s16(v2488, v2493);
+    int16x8_t v2495 = vsubq_s16(v1583, v1588);
+    int16x8_t v2496 = vsubq_s16(v1593, v1598);
+    int16x8_t v2497_tmp = vqrdmulhq_n_s16(v2496, 23673);
+    int16x8_t v2497 = vaddq_s16(v2497_tmp, v2496);
+    int16x8_t v2498 = vaddq_s16(v2495, v2497);
+    int16x8_t v2499 = vsubq_s16(v1605, v1610);
+    int16x8_t v2500 = vsubq_s16(v1615, v1620);
+    int16x8_t v2501_tmp = vqrdmulhq_n_s16(v2500, 23673);
+    int16x8_t v2501 = vaddq_s16(v2501_tmp, v2500);
+    int16x8_t v2502 = vaddq_s16(v2499, v2501);
+    int16x8_t v2503 = vqrdmulhq_n_s16(v2502, 20398);
+    int16x8_t v2504 = vaddq_s16(v2498, v2503);
+    int16x8_t v2505 = vqrdmulhq_n_s16(v2504, 17255);
+    int16x8_t v2506 = vaddq_s16(v2494, v2505);
+    int16x8_t v2507 = vsubq_s16(v1631, v1636);
+    int16x8_t v2508 = vsubq_s16(v1641, v1646);
+    int16x8_t v2509_tmp = vqrdmulhq_n_s16(v2508, 23673);
+    int16x8_t v2509 = vaddq_s16(v2509_tmp, v2508);
+    int16x8_t v2510 = vaddq_s16(v2507, v2509);
+    int16x8_t v2511 = vsubq_s16(v1653, v1658);
+    int16x8_t v2512 = vsubq_s16(v1663, v1668);
+    int16x8_t v2513_tmp = vqrdmulhq_n_s16(v2512, 23673);
+    int16x8_t v2513 = vaddq_s16(v2513_tmp, v2512);
+    int16x8_t v2514 = vaddq_s16(v2511, v2513);
+    int16x8_t v2515 = vqrdmulhq_n_s16(v2514, 20398);
+    int16x8_t v2516 = vaddq_s16(v2510, v2515);
+    int16x8_t v2517 = vsubq_s16(v1677, v1682);
+    int16x8_t v2518 = vsubq_s16(v1687, v1692);
+    int16x8_t v2519_tmp = vqrdmulhq_n_s16(v2518, 23673);
+    int16x8_t v2519 = vaddq_s16(v2519_tmp, v2518);
+    int16x8_t v2520 = vaddq_s16(v2517, v2519);
+    int16x8_t v2521 = vsubq_s16(v1699, v1704);
+    int16x8_t v2522 = vsubq_s16(v1709, v1714);
+    int16x8_t v2523_tmp = vqrdmulhq_n_s16(v2522, 23673);
+    int16x8_t v2523 = vaddq_s16(v2523_tmp, v2522);
+    int16x8_t v2524 = vaddq_s16(v2521, v2523);
+    int16x8_t v2525 = vqrdmulhq_n_s16(v2524, 20398);
+    int16x8_t v2526 = vaddq_s16(v2520, v2525);
+    int16x8_t v2527 = vqrdmulhq_n_s16(v2526, 17255);
+    int16x8_t v2528 = vaddq_s16(v2516, v2527);
+    int16x8_t v2529 = vqrdmulhq_n_s16(v2528, 16595);
+    int16x8_t v2530 = vaddq_s16(v2506, v2529);
+    int16x8_t v2531 = vsubq_s16(v1727, v1732);
+    int16x8_t v2532 = vsubq_s16(v1737, v1742);
+    int16x8_t v2533_tmp = vqrdmulhq_n_s16(v2532, 23673);
+    int16x8_t v2533 = vaddq_s16(v2533_tmp, v2532);
+    int16x8_t v2534 = vaddq_s16(v2531, v2533);
+    int16x8_t v2535 = vsubq_s16(v1749, v1754);
+    int16x8_t v2536 = vsubq_s16(v1759, v1764);
+    int16x8_t v2537_tmp = vqrdmulhq_n_s16(v2536, 23673);
+    int16x8_t v2537 = vaddq_s16(v2537_tmp, v2536);
+    int16x8_t v2538 = vaddq_s16(v2535, v2537);
+    int16x8_t v2539 = vqrdmulhq_n_s16(v2538, 20398);
+    int16x8_t v2540 = vaddq_s16(v2534, v2539);
+    int16x8_t v2541 = vsubq_s16(v1773, v1778);
+    int16x8_t v2542 = vsubq_s16(v1783, v1788);
+    int16x8_t v2543_tmp = vqrdmulhq_n_s16(v2542, 23673);
+    int16x8_t v2543 = vaddq_s16(v2543_tmp, v2542);
+    int16x8_t v2544 = vaddq_s16(v2541, v2543);
+    int16x8_t v2545 = vsubq_s16(v1795, v1800);
+    int16x8_t v2546 = vsubq_s16(v1805, v1810);
+    int16x8_t v2547_tmp = vqrdmulhq_n_s16(v2546, 23673);
+    int16x8_t v2547 = vaddq_s16(v2547_tmp, v2546);
+    int16x8_t v2548 = vaddq_s16(v2545, v2547);
+    int16x8_t v2549 = vqrdmulhq_n_s16(v2548, 20398);
+    int16x8_t v2550 = vaddq_s16(v2544, v2549);
+    int16x8_t v2551 = vqrdmulhq_n_s16(v2550, 17255);
+    int16x8_t v2552 = vaddq_s16(v2540, v2551);
+    int16x8_t v2553 = vsubq_s16(v1821, v1826);
+    int16x8_t v2554 = vsubq_s16(v1831, v1836);
+    int16x8_t v2555_tmp = vqrdmulhq_n_s16(v2554, 23673);
+    int16x8_t v2555 = vaddq_s16(v2555_tmp, v2554);
+    int16x8_t v2556 = vaddq_s16(v2553, v2555);
+    int16x8_t v2557 = vsubq_s16(v1843, v1848);
+    int16x8_t v2558 = vsubq_s16(v1853, v1858);
+    int16x8_t v2559_tmp = vqrdmulhq_n_s16(v2558, 23673);
+    int16x8_t v2559 = vaddq_s16(v2559_tmp, v2558);
+    int16x8_t v2560 = vaddq_s16(v2557, v2559);
+    int16x8_t v2561 = vqrdmulhq_n_s16(v2560, 20398);
+    int16x8_t v2562 = vaddq_s16(v2556, v2561);
+    int16x8_t v2563 = vsubq_s16(v1867, v1872);
+    int16x8_t v2564 = vsubq_s16(v1877, v1882);
+    int16x8_t v2565_tmp = vqrdmulhq_n_s16(v2564, 23673);
+    int16x8_t v2565 = vaddq_s16(v2565_tmp, v2564);
+    int16x8_t v2566 = vaddq_s16(v2563, v2565);
+    int16x8_t v2567 = vsubq_s16(v1889, v1894);
+    int16x8_t v2568 = vsubq_s16(v1899, v1904);
+    int16x8_t v2569_tmp = vqrdmulhq_n_s16(v2568, 23673);
+    int16x8_t v2569 = vaddq_s16(v2569_tmp, v2568);
+    int16x8_t v2570 = vaddq_s16(v2567, v2569);
+    int16x8_t v2571 = vqrdmulhq_n_s16(v2570, 20398);
+    int16x8_t v2572 = vaddq_s16(v2566, v2571);
+    int16x8_t v2573 = vqrdmulhq_n_s16(v2572, 17255);
+    int16x8_t v2574 = vaddq_s16(v2562, v2573);
+    int16x8_t v2575 = vqrdmulhq_n_s16(v2574, 16595);
+    int16x8_t v2576 = vaddq_s16(v2552, v2575);
+    int16x8_t v2577 = vqrdmulhq_n_s16(v2576, 16436);
+    int16x8_t v2578 = vaddq_s16(v2530, v2577);
+    int16x8_t v2579 = vsubq_s16(v9, v24);
+    int16x8_t v2580 = vsubq_s16(v42, v58);
+    int16x8_t v2581_tmp = vqrdmulhq_n_s16(v2580, 3314);
+    int16x8_t v2581 = vmlaq_n_s16(v2581_tmp, v2580, 5);
+    int16x8_t v2582 = vaddq_s16(v2579, v2581);
+    int16x8_t v2583 = vsubq_s16(v78, v101);
+    int16x8_t v2584 = vsubq_s16(v119, v136);
+    int16x8_t v2585_tmp = vqrdmulhq_n_s16(v2584, 3314);
+    int16x8_t v2585 = vmlaq_n_s16(v2585_tmp, v2584, 5);
+    int16x8_t v2586 = vaddq_s16(v2583, v2585);
+    int16x8_t v2587 = vqrdmulhq_n_s16(v2586, 22112);
+    int16x8_t v2588 = vaddq_s16(v2582, v2587);
+    int16x8_t v2589 = vsubq_s16(v158, v181);
+    int16x8_t v2590 = vsubq_s16(v213, v231);
+    int16x8_t v2591_tmp = vqrdmulhq_n_s16(v2590, 3314);
+    int16x8_t v2591 = vmlaq_n_s16(v2591_tmp, v2590, 5);
+    int16x8_t v2592 = vaddq_s16(v2589, v2591);
+    int16x8_t v2593 = vsubq_s16(v251, v274);
+    int16x8_t v2594 = vsubq_s16(v292, v310);
+    int16x8_t v2595_tmp = vqrdmulhq_n_s16(v2594, 3314);
+    int16x8_t v2595 = vmlaq_n_s16(v2595_tmp, v2594, 5);
+    int16x8_t v2596 = vaddq_s16(v2593, v2595);
+    int16x8_t v2597 = vqrdmulhq_n_s16(v2596, 22112);
+    int16x8_t v2598 = vaddq_s16(v2592, v2597);
+    int16x8_t v2599 = vqrdmulhq_n_s16(v2598, 17561);
+    int16x8_t v2600 = vaddq_s16(v2588, v2599);
+    int16x8_t v2601 = vsubq_s16(v334, v357);
+    int16x8_t v2602 = vsubq_s16(v389, v407);
+    int16x8_t v2603_tmp = vqrdmulhq_n_s16(v2602, 3314);
+    int16x8_t v2603 = vmlaq_n_s16(v2603_tmp, v2602, 5);
+    int16x8_t v2604 = vaddq_s16(v2601, v2603);
+    int16x8_t v2605 = vsubq_s16(v441, v480);
+    int16x8_t v2606 = vsubq_s16(v498, v517);
+    int16x8_t v2607_tmp = vqrdmulhq_n_s16(v2606, 3314);
+    int16x8_t v2607 = vmlaq_n_s16(v2607_tmp, v2606, 5);
+    int16x8_t v2608 = vaddq_s16(v2605, v2607);
+    int16x8_t v2609 = vqrdmulhq_n_s16(v2608, 22112);
+    int16x8_t v2610 = vaddq_s16(v2604, v2609);
+    int16x8_t v2611 = vsubq_s16(v539, v562);
+    int16x8_t v2612 = vsubq_s16(v594, v612);
+    int16x8_t v2613_tmp = vqrdmulhq_n_s16(v2612, 3314);
+    int16x8_t v2613 = vmlaq_n_s16(v2613_tmp, v2612, 5);
+    int16x8_t v2614 = vaddq_s16(v2611, v2613);
+    int16x8_t v2615 = vsubq_s16(v632, v655);
+    int16x8_t v2616 = vsubq_s16(v673, v692);
+    int16x8_t v2617_tmp = vqrdmulhq_n_s16(v2616, 3314);
+    int16x8_t v2617 = vmlaq_n_s16(v2617_tmp, v2616, 5);
+    int16x8_t v2618 = vaddq_s16(v2615, v2617);
+    int16x8_t v2619 = vqrdmulhq_n_s16(v2618, 22112);
+    int16x8_t v2620 = vaddq_s16(v2614, v2619);
+    int16x8_t v2621 = vqrdmulhq_n_s16(v2620, 17561);
+    int16x8_t v2622 = vaddq_s16(v2610, v2621);
+    int16x8_t v2623 = vqrdmulhq_n_s16(v2622, 16666);
+    int16x8_t v2624 = vaddq_s16(v2600, v2623);
+    int16x8_t v2625 = vsubq_s16(v718, v741);
+    int16x8_t v2626 = vsubq_s16(v773, v791);
+    int16x8_t v2627_tmp = vqrdmulhq_n_s16(v2626, 3314);
+    int16x8_t v2627 = vmlaq_n_s16(v2627_tmp, v2626, 5);
+    int16x8_t v2628 = vaddq_s16(v2625, v2627);
+    int16x8_t v2629 = vsubq_s16(v825, v864);
+    int16x8_t v2630 = vsubq_s16(v882, v901);
+    int16x8_t v2631_tmp = vqrdmulhq_n_s16(v2630, 3314);
+    int16x8_t v2631 = vmlaq_n_s16(v2631_tmp, v2630, 5);
+    int16x8_t v2632 = vaddq_s16(v2629, v2631);
+    int16x8_t v2633 = vqrdmulhq_n_s16(v2632, 22112);
+    int16x8_t v2634 = vaddq_s16(v2628, v2633);
+    int16x8_t v2635 = vsubq_s16(v937, v976);
+    int16x8_t v2636 = vsubq_s16(v1036, v1058);
+    int16x8_t v2637_tmp = vqrdmulhq_n_s16(v2636, 3314);
+    int16x8_t v2637 = vmlaq_n_s16(v2637_tmp, v2636, 5);
+    int16x8_t v2638 = vaddq_s16(v2635, v2637);
+    int16x8_t v2639 = vsubq_s16(v1078, v1101);
+    int16x8_t v2640 = vsubq_s16(v1119, v1139);
+    int16x8_t v2641_tmp = vqrdmulhq_n_s16(v2640, 3314);
+    int16x8_t v2641 = vmlaq_n_s16(v2641_tmp, v2640, 5);
+    int16x8_t v2642 = vaddq_s16(v2639, v2641);
+    int16x8_t v2643 = vqrdmulhq_n_s16(v2642, 22112);
+    int16x8_t v2644 = vaddq_s16(v2638, v2643);
+    int16x8_t v2645 = vqrdmulhq_n_s16(v2644, 17561);
+    int16x8_t v2646 = vaddq_s16(v2634, v2645);
+    int16x8_t v2647 = vsubq_s16(v1163, v1186);
+    int16x8_t v2648 = vsubq_s16(v1218, v1236);
+    int16x8_t v2649_tmp = vqrdmulhq_n_s16(v2648, 3314);
+    int16x8_t v2649 = vmlaq_n_s16(v2649_tmp, v2648, 5);
+    int16x8_t v2650 = vaddq_s16(v2647, v2649);
+    int16x8_t v2651 = vsubq_s16(v1270, v1309);
+    int16x8_t v2652 = vsubq_s16(v1327, v1346);
+    int16x8_t v2653_tmp = vqrdmulhq_n_s16(v2652, 3314);
+    int16x8_t v2653 = vmlaq_n_s16(v2653_tmp, v2652, 5);
+    int16x8_t v2654 = vaddq_s16(v2651, v2653);
+    int16x8_t v2655 = vqrdmulhq_n_s16(v2654, 22112);
+    int16x8_t v2656 = vaddq_s16(v2650, v2655);
+    int16x8_t v2657 = vsubq_s16(v1368, v1391);
+    int16x8_t v2658 = vsubq_s16(v1423, v1441);
+    int16x8_t v2659_tmp = vqrdmulhq_n_s16(v2658, 3314);
+    int16x8_t v2659 = vmlaq_n_s16(v2659_tmp, v2658, 5);
+    int16x8_t v2660 = vaddq_s16(v2657, v2659);
+    int16x8_t v2661 = vsubq_s16(v1461, v1484);
+    int16x8_t v2662 = vsubq_s16(v1502, v1522);
+    int16x8_t v2663_tmp = vqrdmulhq_n_s16(v2662, 3314);
+    int16x8_t v2663 = vmlaq_n_s16(v2663_tmp, v2662, 5);
+    int16x8_t v2664 = vaddq_s16(v2661, v2663);
+    int16x8_t v2665 = vqrdmulhq_n_s16(v2664, 22112);
+    int16x8_t v2666 = vaddq_s16(v2660, v2665);
+    int16x8_t v2667 = vqrdmulhq_n_s16(v2666, 17561);
+    int16x8_t v2668 = vaddq_s16(v2656, v2667);
+    int16x8_t v2669 = vqrdmulhq_n_s16(v2668, 16666);
+    int16x8_t v2670 = vaddq_s16(v2646, v2669);
+    int16x8_t v2671 = vqrdmulhq_n_s16(v2670, 16454);
+    int16x8_t v2672 = vaddq_s16(v2624, v2671);
+    int16x8_t v2673 = vsubq_s16(v2579, v2581);
+    int16x8_t v2674 = vsubq_s16(v2583, v2585);
+    int16x8_t v2675 = vqrdmulhq_n_s16(v2674, 24397);
+    int16x8_t v2676 = vaddq_s16(v2673, v2675);
+    int16x8_t v2677 = vsubq_s16(v2589, v2591);
+    int16x8_t v2678 = vsubq_s16(v2593, v2595);
+    int16x8_t v2679 = vqrdmulhq_n_s16(v2678, 24397);
+    int16x8_t v2680 = vaddq_s16(v2677, v2679);
+    int16x8_t v2681 = vqrdmulhq_n_s16(v2680, 17921);
+    int16x8_t v2682 = vaddq_s16(v2676, v2681);
+    int16x8_t v2683 = vsubq_s16(v2601, v2603);
+    int16x8_t v2684 = vsubq_s16(v2605, v2607);
+    int16x8_t v2685 = vqrdmulhq_n_s16(v2684, 24397);
+    int16x8_t v2686 = vaddq_s16(v2683, v2685);
+    int16x8_t v2687 = vsubq_s16(v2611, v2613);
+    int16x8_t v2688 = vsubq_s16(v2615, v2617);
+    int16x8_t v2689 = vqrdmulhq_n_s16(v2688, 24397);
+    int16x8_t v2690 = vaddq_s16(v2687, v2689);
+    int16x8_t v2691 = vqrdmulhq_n_s16(v2690, 17921);
+    int16x8_t v2692 = vaddq_s16(v2686, v2691);
+    int16x8_t v2693 = vqrdmulhq_n_s16(v2692, 16747);
+    int16x8_t v2694 = vaddq_s16(v2682, v2693);
+    int16x8_t v2695 = vsubq_s16(v2625, v2627);
+    int16x8_t v2696 = vsubq_s16(v2629, v2631);
+    int16x8_t v2697 = vqrdmulhq_n_s16(v2696, 24397);
+    int16x8_t v2698 = vaddq_s16(v2695, v2697);
+    int16x8_t v2699 = vsubq_s16(v2635, v2637);
+    int16x8_t v2700 = vsubq_s16(v2639, v2641);
+    int16x8_t v2701 = vqrdmulhq_n_s16(v2700, 24397);
+    int16x8_t v2702 = vaddq_s16(v2699, v2701);
+    int16x8_t v2703 = vqrdmulhq_n_s16(v2702, 17921);
+    int16x8_t v2704 = vaddq_s16(v2698, v2703);
+    int16x8_t v2705 = vsubq_s16(v2647, v2649);
+    int16x8_t v2706 = vsubq_s16(v2651, v2653);
+    int16x8_t v2707 = vqrdmulhq_n_s16(v2706, 24397);
+    int16x8_t v2708 = vaddq_s16(v2705, v2707);
+    int16x8_t v2709 = vsubq_s16(v2657, v2659);
+    int16x8_t v2710 = vsubq_s16(v2661, v2663);
+    int16x8_t v2711 = vqrdmulhq_n_s16(v2710, 24397);
+    int16x8_t v2712 = vaddq_s16(v2709, v2711);
+    int16x8_t v2713 = vqrdmulhq_n_s16(v2712, 17921);
+    int16x8_t v2714 = vaddq_s16(v2708, v2713);
+    int16x8_t v2715 = vqrdmulhq_n_s16(v2714, 16747);
+    int16x8_t v2716 = vaddq_s16(v2704, v2715);
+    int16x8_t v2717 = vqrdmulhq_n_s16(v2716, 16474);
+    int16x8_t v2718 = vaddq_s16(v2694, v2717);
+    int16x8_t v2719 = vsubq_s16(v2485, v2487);
+    int16x8_t v2720 = vsubq_s16(v2489, v2491);
+    int16x8_t v2721 = vqrdmulhq_n_s16(v2720, 27504);
+    int16x8_t v2722 = vaddq_s16(v2719, v2721);
+    int16x8_t v2723 = vsubq_s16(v2495, v2497);
+    int16x8_t v2724 = vsubq_s16(v2499, v2501);
+    int16x8_t v2725 = vqrdmulhq_n_s16(v2724, 27504);
+    int16x8_t v2726 = vaddq_s16(v2723, v2725);
+    int16x8_t v2727 = vqrdmulhq_n_s16(v2726, 18343);
+    int16x8_t v2728 = vaddq_s16(v2722, v2727);
+    int16x8_t v2729 = vsubq_s16(v2507, v2509);
+    int16x8_t v2730 = vsubq_s16(v2511, v2513);
+    int16x8_t v2731 = vqrdmulhq_n_s16(v2730, 27504);
+    int16x8_t v2732 = vaddq_s16(v2729, v2731);
+    int16x8_t v2733 = vsubq_s16(v2517, v2519);
+    int16x8_t v2734 = vsubq_s16(v2521, v2523);
+    int16x8_t v2735 = vqrdmulhq_n_s16(v2734, 27504);
+    int16x8_t v2736 = vaddq_s16(v2733, v2735);
+    int16x8_t v2737 = vqrdmulhq_n_s16(v2736, 18343);
+    int16x8_t v2738 = vaddq_s16(v2732, v2737);
+    int16x8_t v2739 = vqrdmulhq_n_s16(v2738, 16840);
+    int16x8_t v2740 = vaddq_s16(v2728, v2739);
+    int16x8_t v2741 = vsubq_s16(v2531, v2533);
+    int16x8_t v2742 = vsubq_s16(v2535, v2537);
+    int16x8_t v2743 = vqrdmulhq_n_s16(v2742, 27504);
+    int16x8_t v2744 = vaddq_s16(v2741, v2743);
+    int16x8_t v2745 = vsubq_s16(v2541, v2543);
+    int16x8_t v2746 = vsubq_s16(v2545, v2547);
+    int16x8_t v2747 = vqrdmulhq_n_s16(v2746, 27504);
+    int16x8_t v2748 = vaddq_s16(v2745, v2747);
+    int16x8_t v2749 = vqrdmulhq_n_s16(v2748, 18343);
+    int16x8_t v2750 = vaddq_s16(v2744, v2749);
+    int16x8_t v2751 = vsubq_s16(v2553, v2555);
+    int16x8_t v2752 = vsubq_s16(v2557, v2559);
+    int16x8_t v2753 = vqrdmulhq_n_s16(v2752, 27504);
+    int16x8_t v2754 = vaddq_s16(v2751, v2753);
+    int16x8_t v2755 = vsubq_s16(v2563, v2565);
+    int16x8_t v2756 = vsubq_s16(v2567, v2569);
+    int16x8_t v2757 = vqrdmulhq_n_s16(v2756, 27504);
+    int16x8_t v2758 = vaddq_s16(v2755, v2757);
+    int16x8_t v2759 = vqrdmulhq_n_s16(v2758, 18343);
+    int16x8_t v2760 = vaddq_s16(v2754, v2759);
+    int16x8_t v2761 = vqrdmulhq_n_s16(v2760, 16840);
+    int16x8_t v2762 = vaddq_s16(v2750, v2761);
+    int16x8_t v2763 = vqrdmulhq_n_s16(v2762, 16496);
+    int16x8_t v2764 = vaddq_s16(v2740, v2763);
+    int16x8_t v2765 = vsubq_s16(v2390, v2392);
+    int16x8_t v2766 = vsubq_s16(v2394, v2396);
+    int16x8_t v2767 = vqrdmulhq_n_s16(v2766, 31869);
+    int16x8_t v2768 = vaddq_s16(v2765, v2767);
+    int16x8_t v2769 = vsubq_s16(v2400, v2402);
+    int16x8_t v2770 = vsubq_s16(v2404, v2406);
+    int16x8_t v2771 = vqrdmulhq_n_s16(v2770, 31869);
+    int16x8_t v2772 = vaddq_s16(v2769, v2771);
+    int16x8_t v2773 = vqrdmulhq_n_s16(v2772, 18830);
+    int16x8_t v2774 = vaddq_s16(v2768, v2773);
+    int16x8_t v2775 = vsubq_s16(v2412, v2414);
+    int16x8_t v2776 = vsubq_s16(v2416, v2418);
+    int16x8_t v2777 = vqrdmulhq_n_s16(v2776, 31869);
+    int16x8_t v2778 = vaddq_s16(v2775, v2777);
+    int16x8_t v2779 = vsubq_s16(v2422, v2424);
+    int16x8_t v2780 = vsubq_s16(v2426, v2428);
+    int16x8_t v2781 = vqrdmulhq_n_s16(v2780, 31869);
+    int16x8_t v2782 = vaddq_s16(v2779, v2781);
+    int16x8_t v2783 = vqrdmulhq_n_s16(v2782, 18830);
+    int16x8_t v2784 = vaddq_s16(v2778, v2783);
+    int16x8_t v2785 = vqrdmulhq_n_s16(v2784, 16944);
+    int16x8_t v2786 = vaddq_s16(v2774, v2785);
+    int16x8_t v2787 = vsubq_s16(v2436, v2438);
+    int16x8_t v2788 = vsubq_s16(v2440, v2442);
+    int16x8_t v2789 = vqrdmulhq_n_s16(v2788, 31869);
+    int16x8_t v2790 = vaddq_s16(v2787, v2789);
+    int16x8_t v2791 = vsubq_s16(v2446, v2448);
+    int16x8_t v2792 = vsubq_s16(v2450, v2452);
+    int16x8_t v2793 = vqrdmulhq_n_s16(v2792, 31869);
+    int16x8_t v2794 = vaddq_s16(v2791, v2793);
+    int16x8_t v2795 = vqrdmulhq_n_s16(v2794, 18830);
+    int16x8_t v2796 = vaddq_s16(v2790, v2795);
+    int16x8_t v2797 = vsubq_s16(v2458, v2460);
+    int16x8_t v2798 = vsubq_s16(v2462, v2465);
+    int16x8_t v2799 = vqrdmulhq_n_s16(v2798, 31869);
+    int16x8_t v2800 = vaddq_s16(v2797, v2799);
+    int16x8_t v2801 = vsubq_s16(v2469, v2471);
+    int16x8_t v2802 = vsubq_s16(v2473, v2475);
+    int16x8_t v2803 = vqrdmulhq_n_s16(v2802, 31869);
+    int16x8_t v2804 = vaddq_s16(v2801, v2803);
+    int16x8_t v2805 = vqrdmulhq_n_s16(v2804, 18830);
+    int16x8_t v2806 = vaddq_s16(v2800, v2805);
+    int16x8_t v2807 = vqrdmulhq_n_s16(v2806, 16944);
+    int16x8_t v2808 = vaddq_s16(v2796, v2807);
+    int16x8_t v2809 = vqrdmulhq_n_s16(v2808, 16521);
+    int16x8_t v2810 = vaddq_s16(v2786, v2809);
+    int16x8_t v2811 = vsubq_s16(v2296, v2298);
+    int16x8_t v2812 = vsubq_s16(v2300, v2302);
+    int16x8_t v2813_tmp = vqrdmulhq_n_s16(v2812, 5552);
+    int16x8_t v2813 = vaddq_s16(v2813_tmp, v2812);
+    int16x8_t v2814 = vaddq_s16(v2811, v2813);
+    int16x8_t v2815 = vsubq_s16(v2306, v2308);
+    int16x8_t v2816 = vsubq_s16(v2310, v2312);
+    int16x8_t v2817_tmp = vqrdmulhq_n_s16(v2816, 5552);
+    int16x8_t v2817 = vaddq_s16(v2817_tmp, v2816);
+    int16x8_t v2818 = vaddq_s16(v2815, v2817);
+    int16x8_t v2819 = vqrdmulhq_n_s16(v2818, 19393);
+    int16x8_t v2820 = vaddq_s16(v2814, v2819);
+    int16x8_t v2821 = vsubq_s16(v2318, v2320);
+    int16x8_t v2822 = vsubq_s16(v2322, v2324);
+    int16x8_t v2823_tmp = vqrdmulhq_n_s16(v2822, 5552);
+    int16x8_t v2823 = vaddq_s16(v2823_tmp, v2822);
+    int16x8_t v2824 = vaddq_s16(v2821, v2823);
+    int16x8_t v2825 = vsubq_s16(v2328, v2330);
+    int16x8_t v2826 = vsubq_s16(v2332, v2334);
+    int16x8_t v2827_tmp = vqrdmulhq_n_s16(v2826, 5552);
+    int16x8_t v2827 = vaddq_s16(v2827_tmp, v2826);
+    int16x8_t v2828 = vaddq_s16(v2825, v2827);
+    int16x8_t v2829 = vqrdmulhq_n_s16(v2828, 19393);
+    int16x8_t v2830 = vaddq_s16(v2824, v2829);
+    int16x8_t v2831 = vqrdmulhq_n_s16(v2830, 17059);
+    int16x8_t v2832 = vaddq_s16(v2820, v2831);
+    int16x8_t v2833 = vsubq_s16(v2342, v2344);
+    int16x8_t v2834 = vsubq_s16(v2346, v2348);
+    int16x8_t v2835_tmp = vqrdmulhq_n_s16(v2834, 5552);
+    int16x8_t v2835 = vaddq_s16(v2835_tmp, v2834);
+    int16x8_t v2836 = vaddq_s16(v2833, v2835);
+    int16x8_t v2837 = vsubq_s16(v2352, v2354);
+    int16x8_t v2838 = vsubq_s16(v2356, v2358);
+    int16x8_t v2839_tmp = vqrdmulhq_n_s16(v2838, 5552);
+    int16x8_t v2839 = vaddq_s16(v2839_tmp, v2838);
+    int16x8_t v2840 = vaddq_s16(v2837, v2839);
+    int16x8_t v2841 = vqrdmulhq_n_s16(v2840, 19393);
+    int16x8_t v2842 = vaddq_s16(v2836, v2841);
+    int16x8_t v2843 = vsubq_s16(v2364, v2366);
+    int16x8_t v2844 = vsubq_s16(v2368, v2370);
+    int16x8_t v2845_tmp = vqrdmulhq_n_s16(v2844, 5552);
+    int16x8_t v2845 = vaddq_s16(v2845_tmp, v2844);
+    int16x8_t v2846 = vaddq_s16(v2843, v2845);
+    int16x8_t v2847 = vsubq_s16(v2374, v2376);
+    int16x8_t v2848 = vsubq_s16(v2378, v2380);
+    int16x8_t v2849_tmp = vqrdmulhq_n_s16(v2848, 5552);
+    int16x8_t v2849 = vaddq_s16(v2849_tmp, v2848);
+    int16x8_t v2850 = vaddq_s16(v2847, v2849);
+    int16x8_t v2851 = vqrdmulhq_n_s16(v2850, 19393);
+    int16x8_t v2852 = vaddq_s16(v2846, v2851);
+    int16x8_t v2853 = vqrdmulhq_n_s16(v2852, 17059);
+    int16x8_t v2854 = vaddq_s16(v2842, v2853);
+    int16x8_t v2855 = vqrdmulhq_n_s16(v2854, 16549);
+    int16x8_t v2856 = vaddq_s16(v2832, v2855);
+    int16x8_t v2857 = vsubq_s16(v2109, v2114);
+    int16x8_t v2858 = vsubq_s16(v2119, v2124);
+    int16x8_t v2859_tmp = vqrdmulhq_n_s16(v2858, 15865);
+    int16x8_t v2859 = vaddq_s16(v2859_tmp, v2858);
+    int16x8_t v2860 = vaddq_s16(v2857, v2859);
+    int16x8_t v2861 = vsubq_s16(v2131, v2136);
+    int16x8_t v2862 = vsubq_s16(v2141, v2146);
+    int16x8_t v2863_tmp = vqrdmulhq_n_s16(v2862, 15865);
+    int16x8_t v2863 = vaddq_s16(v2863_tmp, v2862);
+    int16x8_t v2864 = vaddq_s16(v2861, v2863);
+    int16x8_t v2865 = vqrdmulhq_n_s16(v2864, 20040);
+    int16x8_t v2866 = vaddq_s16(v2860, v2865);
+    int16x8_t v2867 = vsubq_s16(v2155, v2160);
+    int16x8_t v2868 = vsubq_s16(v2165, v2170);
+    int16x8_t v2869_tmp = vqrdmulhq_n_s16(v2868, 15865);
+    int16x8_t v2869 = vaddq_s16(v2869_tmp, v2868);
+    int16x8_t v2870 = vaddq_s16(v2867, v2869);
+    int16x8_t v2871 = vsubq_s16(v2177, v2182);
+    int16x8_t v2872 = vsubq_s16(v2187, v2192);
+    int16x8_t v2873_tmp = vqrdmulhq_n_s16(v2872, 15865);
+    int16x8_t v2873 = vaddq_s16(v2873_tmp, v2872);
+    int16x8_t v2874 = vaddq_s16(v2871, v2873);
+    int16x8_t v2875 = vqrdmulhq_n_s16(v2874, 20040);
+    int16x8_t v2876 = vaddq_s16(v2870, v2875);
+    int16x8_t v2877 = vqrdmulhq_n_s16(v2876, 17187);
+    int16x8_t v2878 = vaddq_s16(v2866, v2877);
+    int16x8_t v2879 = vsubq_s16(v2203, v2208);
+    int16x8_t v2880 = vsubq_s16(v2213, v2218);
+    int16x8_t v2881_tmp = vqrdmulhq_n_s16(v2880, 15865);
+    int16x8_t v2881 = vaddq_s16(v2881_tmp, v2880);
+    int16x8_t v2882 = vaddq_s16(v2879, v2881);
+    int16x8_t v2883 = vsubq_s16(v2225, v2230);
+    int16x8_t v2884 = vsubq_s16(v2235, v2240);
+    int16x8_t v2885_tmp = vqrdmulhq_n_s16(v2884, 15865);
+    int16x8_t v2885 = vaddq_s16(v2885_tmp, v2884);
+    int16x8_t v2886 = vaddq_s16(v2883, v2885);
+    int16x8_t v2887 = vqrdmulhq_n_s16(v2886, 20040);
+    int16x8_t v2888 = vaddq_s16(v2882, v2887);
+    int16x8_t v2889 = vsubq_s16(v2249, v2254);
+    int16x8_t v2890 = vsubq_s16(v2259, v2264);
+    int16x8_t v2891_tmp = vqrdmulhq_n_s16(v2890, 15865);
+    int16x8_t v2891 = vaddq_s16(v2891_tmp, v2890);
+    int16x8_t v2892 = vaddq_s16(v2889, v2891);
+    int16x8_t v2893 = vsubq_s16(v2271, v2276);
+    int16x8_t v2894 = vsubq_s16(v2281, v2286);
+    int16x8_t v2895_tmp = vqrdmulhq_n_s16(v2894, 15865);
+    int16x8_t v2895 = vaddq_s16(v2895_tmp, v2894);
+    int16x8_t v2896 = vaddq_s16(v2893, v2895);
+    int16x8_t v2897 = vqrdmulhq_n_s16(v2896, 20040);
+    int16x8_t v2898 = vaddq_s16(v2892, v2897);
+    int16x8_t v2899 = vqrdmulhq_n_s16(v2898, 17187);
+    int16x8_t v2900 = vaddq_s16(v2888, v2899);
+    int16x8_t v2901 = vqrdmulhq_n_s16(v2900, 16579);
+    int16x8_t v2902 = vaddq_s16(v2878, v2901);
+    int16x8_t v2903 = vsubq_s16(v1919, v1924);
+    int16x8_t v2904 = vsubq_s16(v1929, v1934);
+    int16x8_t v2905_tmp = vqrdmulhq_n_s16(v2904, 1893);
+    int16x8_t v2905 = vmlaq_n_s16(v2905_tmp, v2904, 2);
+    int16x8_t v2906 = vaddq_s16(v2903, v2905);
+    int16x8_t v2907 = vsubq_s16(v1941, v1946);
+    int16x8_t v2908 = vsubq_s16(v1951, v1956);
+    int16x8_t v2909_tmp = vqrdmulhq_n_s16(v2908, 1893);
+    int16x8_t v2909 = vmlaq_n_s16(v2909_tmp, v2908, 2);
+    int16x8_t v2910 = vaddq_s16(v2907, v2909);
+    int16x8_t v2911 = vqrdmulhq_n_s16(v2910, 20783);
+    int16x8_t v2912 = vaddq_s16(v2906, v2911);
+    int16x8_t v2913 = vsubq_s16(v1965, v1970);
+    int16x8_t v2914 = vsubq_s16(v1975, v1980);
+    int16x8_t v2915_tmp = vqrdmulhq_n_s16(v2914, 1893);
+    int16x8_t v2915 = vmlaq_n_s16(v2915_tmp, v2914, 2);
+    int16x8_t v2916 = vaddq_s16(v2913, v2915);
+    int16x8_t v2917 = vsubq_s16(v1987, v1992);
+    int16x8_t v2918 = vsubq_s16(v1997, v2002);
+    int16x8_t v2919_tmp = vqrdmulhq_n_s16(v2918, 1893);
+    int16x8_t v2919 = vmlaq_n_s16(v2919_tmp, v2918, 2);
+    int16x8_t v2920 = vaddq_s16(v2917, v2919);
+    int16x8_t v2921 = vqrdmulhq_n_s16(v2920, 20783);
+    int16x8_t v2922 = vaddq_s16(v2916, v2921);
+    int16x8_t v2923 = vqrdmulhq_n_s16(v2922, 17326);
+    int16x8_t v2924 = vaddq_s16(v2912, v2923);
+    int16x8_t v2925 = vsubq_s16(v2013, v2018);
+    int16x8_t v2926 = vsubq_s16(v2023, v2028);
+    int16x8_t v2927_tmp = vqrdmulhq_n_s16(v2926, 1893);
+    int16x8_t v2927 = vmlaq_n_s16(v2927_tmp, v2926, 2);
+    int16x8_t v2928 = vaddq_s16(v2925, v2927);
+    int16x8_t v2929 = vsubq_s16(v2035, v2040);
+    int16x8_t v2930 = vsubq_s16(v2045, v2050);
+    int16x8_t v2931_tmp = vqrdmulhq_n_s16(v2930, 1893);
+    int16x8_t v2931 = vmlaq_n_s16(v2931_tmp, v2930, 2);
+    int16x8_t v2932 = vaddq_s16(v2929, v2931);
+    int16x8_t v2933 = vqrdmulhq_n_s16(v2932, 20783);
+    int16x8_t v2934 = vaddq_s16(v2928, v2933);
+    int16x8_t v2935 = vsubq_s16(v2059, v2064);
+    int16x8_t v2936 = vsubq_s16(v2069, v2074);
+    int16x8_t v2937_tmp = vqrdmulhq_n_s16(v2936, 1893);
+    int16x8_t v2937 = vmlaq_n_s16(v2937_tmp, v2936, 2);
+    int16x8_t v2938 = vaddq_s16(v2935, v2937);
+    int16x8_t v2939 = vsubq_s16(v2081, v2086);
+    int16x8_t v2940 = vsubq_s16(v2091, v2096);
+    int16x8_t v2941_tmp = vqrdmulhq_n_s16(v2940, 1893);
+    int16x8_t v2941 = vmlaq_n_s16(v2941_tmp, v2940, 2);
+    int16x8_t v2942 = vaddq_s16(v2939, v2941);
+    int16x8_t v2943 = vqrdmulhq_n_s16(v2942, 20783);
+    int16x8_t v2944 = vaddq_s16(v2938, v2943);
+    int16x8_t v2945 = vqrdmulhq_n_s16(v2944, 17326);
+    int16x8_t v2946 = vaddq_s16(v2934, v2945);
+    int16x8_t v2947 = vqrdmulhq_n_s16(v2946, 16611);
+    int16x8_t v2948 = vaddq_s16(v2924, v2947);
+    int16x8_t v2949 = vsubq_s16(v1543, v1554);
+    int16x8_t v2950 = vsubq_s16(v1565, v1576);
+    int16x8_t v2951_tmp = vqrdmulhq_n_s16(v2950, 13357);
+    int16x8_t v2951 = vmlaq_n_s16(v2951_tmp, v2950, 3);
+    int16x8_t v2952 = vaddq_s16(v2949, v2951);
+    int16x8_t v2953 = vsubq_s16(v1589, v1600);
+    int16x8_t v2954 = vsubq_s16(v1611, v1622);
+    int16x8_t v2955_tmp = vqrdmulhq_n_s16(v2954, 13357);
+    int16x8_t v2955 = vmlaq_n_s16(v2955_tmp, v2954, 3);
+    int16x8_t v2956 = vaddq_s16(v2953, v2955);
+    int16x8_t v2957 = vqrdmulhq_n_s16(v2956, 21637);
+    int16x8_t v2958 = vaddq_s16(v2952, v2957);
+    int16x8_t v2959 = vsubq_s16(v1637, v1648);
+    int16x8_t v2960 = vsubq_s16(v1659, v1670);
+    int16x8_t v2961_tmp = vqrdmulhq_n_s16(v2960, 13357);
+    int16x8_t v2961 = vmlaq_n_s16(v2961_tmp, v2960, 3);
+    int16x8_t v2962 = vaddq_s16(v2959, v2961);
+    int16x8_t v2963 = vsubq_s16(v1683, v1694);
+    int16x8_t v2964 = vsubq_s16(v1705, v1716);
+    int16x8_t v2965_tmp = vqrdmulhq_n_s16(v2964, 13357);
+    int16x8_t v2965 = vmlaq_n_s16(v2965_tmp, v2964, 3);
+    int16x8_t v2966 = vaddq_s16(v2963, v2965);
+    int16x8_t v2967 = vqrdmulhq_n_s16(v2966, 21637);
+    int16x8_t v2968 = vaddq_s16(v2962, v2967);
+    int16x8_t v2969 = vqrdmulhq_n_s16(v2968, 17479);
+    int16x8_t v2970 = vaddq_s16(v2958, v2969);
+    int16x8_t v2971 = vsubq_s16(v1733, v1744);
+    int16x8_t v2972 = vsubq_s16(v1755, v1766);
+    int16x8_t v2973_tmp = vqrdmulhq_n_s16(v2972, 13357);
+    int16x8_t v2973 = vmlaq_n_s16(v2973_tmp, v2972, 3);
+    int16x8_t v2974 = vaddq_s16(v2971, v2973);
+    int16x8_t v2975 = vsubq_s16(v1779, v1790);
+    int16x8_t v2976 = vsubq_s16(v1801, v1812);
+    int16x8_t v2977_tmp = vqrdmulhq_n_s16(v2976, 13357);
+    int16x8_t v2977 = vmlaq_n_s16(v2977_tmp, v2976, 3);
+    int16x8_t v2978 = vaddq_s16(v2975, v2977);
+    int16x8_t v2979 = vqrdmulhq_n_s16(v2978, 21637);
+    int16x8_t v2980 = vaddq_s16(v2974, v2979);
+    int16x8_t v2981 = vsubq_s16(v1827, v1838);
+    int16x8_t v2982 = vsubq_s16(v1849, v1860);
+    int16x8_t v2983_tmp = vqrdmulhq_n_s16(v2982, 13357);
+    int16x8_t v2983 = vmlaq_n_s16(v2983_tmp, v2982, 3);
+    int16x8_t v2984 = vaddq_s16(v2981, v2983);
+    int16x8_t v2985 = vsubq_s16(v1873, v1884);
+    int16x8_t v2986 = vsubq_s16(v1895, v1906);
+    int16x8_t v2987_tmp = vqrdmulhq_n_s16(v2986, 13357);
+    int16x8_t v2987 = vmlaq_n_s16(v2987_tmp, v2986, 3);
+    int16x8_t v2988 = vaddq_s16(v2985, v2987);
+    int16x8_t v2989 = vqrdmulhq_n_s16(v2988, 21637);
+    int16x8_t v2990 = vaddq_s16(v2984, v2989);
+    int16x8_t v2991 = vqrdmulhq_n_s16(v2990, 17479);
+    int16x8_t v2992 = vaddq_s16(v2980, v2991);
+    int16x8_t v2993 = vqrdmulhq_n_s16(v2992, 16647);
+    int16x8_t v2994 = vaddq_s16(v2970, v2993);
+    int16x8_t v2995 = vsubq_s16(v25, v60);
+    int16x8_t v2996 = vsubq_s16(v102, v138);
+    int16x8_t v2997_tmp = vqrdmulhq_n_s16(v2996, 6226);
+    int16x8_t v2997 = vmlaq_n_s16(v2997_tmp, v2996, 10);
+    int16x8_t v2998 = vaddq_s16(v2995, v2997);
+    int16x8_t v2999 = vsubq_s16(v182, v233);
+    int16x8_t v3000 = vsubq_s16(v275, v312);
+    int16x8_t v3001_tmp = vqrdmulhq_n_s16(v3000, 6226);
+    int16x8_t v3001 = vmlaq_n_s16(v3001_tmp, v3000, 10);
+    int16x8_t v3002 = vaddq_s16(v2999, v3001);
+    int16x8_t v3003 = vqrdmulhq_n_s16(v3002, 22622);
+    int16x8_t v3004 = vaddq_s16(v2998, v3003);
+    int16x8_t v3005 = vsubq_s16(v358, v409);
+    int16x8_t v3006 = vsubq_s16(v481, v519);
+    int16x8_t v3007_tmp = vqrdmulhq_n_s16(v3006, 6226);
+    int16x8_t v3007 = vmlaq_n_s16(v3007_tmp, v3006, 10);
+    int16x8_t v3008 = vaddq_s16(v3005, v3007);
+    int16x8_t v3009 = vsubq_s16(v563, v614);
+    int16x8_t v3010 = vsubq_s16(v656, v694);
+    int16x8_t v3011_tmp = vqrdmulhq_n_s16(v3010, 6226);
+    int16x8_t v3011 = vmlaq_n_s16(v3011_tmp, v3010, 10);
+    int16x8_t v3012 = vaddq_s16(v3009, v3011);
+    int16x8_t v3013 = vqrdmulhq_n_s16(v3012, 22622);
+    int16x8_t v3014 = vaddq_s16(v3008, v3013);
+    int16x8_t v3015 = vqrdmulhq_n_s16(v3014, 17646);
+    int16x8_t v3016 = vaddq_s16(v3004, v3015);
+    int16x8_t v3017 = vsubq_s16(v742, v793);
+    int16x8_t v3018 = vsubq_s16(v865, v903);
+    int16x8_t v3019_tmp = vqrdmulhq_n_s16(v3018, 6226);
+    int16x8_t v3019 = vmlaq_n_s16(v3019_tmp, v3018, 10);
+    int16x8_t v3020 = vaddq_s16(v3017, v3019);
+    int16x8_t v3021 = vsubq_s16(v977, v1060);
+    int16x8_t v3022 = vsubq_s16(v1102, v1141);
+    int16x8_t v3023_tmp = vqrdmulhq_n_s16(v3022, 6226);
+    int16x8_t v3023 = vmlaq_n_s16(v3023_tmp, v3022, 10);
+    int16x8_t v3024 = vaddq_s16(v3021, v3023);
+    int16x8_t v3025 = vqrdmulhq_n_s16(v3024, 22622);
+    int16x8_t v3026 = vaddq_s16(v3020, v3025);
+    int16x8_t v3027 = vsubq_s16(v1187, v1238);
+    int16x8_t v3028 = vsubq_s16(v1310, v1348);
+    int16x8_t v3029_tmp = vqrdmulhq_n_s16(v3028, 6226);
+    int16x8_t v3029 = vmlaq_n_s16(v3029_tmp, v3028, 10);
+    int16x8_t v3030 = vaddq_s16(v3027, v3029);
+    int16x8_t v3031 = vsubq_s16(v1392, v1443);
+    int16x8_t v3032 = vsubq_s16(v1485, v1524);
+    int16x8_t v3033_tmp = vqrdmulhq_n_s16(v3032, 6226);
+    int16x8_t v3033 = vmlaq_n_s16(v3033_tmp, v3032, 10);
+    int16x8_t v3034 = vaddq_s16(v3031, v3033);
+    int16x8_t v3035 = vqrdmulhq_n_s16(v3034, 22622);
+    int16x8_t v3036 = vaddq_s16(v3030, v3035);
+    int16x8_t v3037 = vqrdmulhq_n_s16(v3036, 17646);
+    int16x8_t v3038 = vaddq_s16(v3026, v3037);
+    int16x8_t v3039 = vqrdmulhq_n_s16(v3038, 16685);
+    int16x8_t v3040 = vaddq_s16(v3016, v3039);
+    int16x8_t v3041 = vsubq_s16(v2995, v2997);
+    int16x8_t v3042 = vsubq_s16(v2999, v3001);
+    int16x8_t v3043 = vqrdmulhq_n_s16(v3042, 23761);
+    int16x8_t v3044 = vaddq_s16(v3041, v3043);
+    int16x8_t v3045 = vsubq_s16(v3005, v3007);
+    int16x8_t v3046 = vsubq_s16(v3009, v3011);
+    int16x8_t v3047 = vqrdmulhq_n_s16(v3046, 23761);
+    int16x8_t v3048 = vaddq_s16(v3045, v3047);
+    int16x8_t v3049 = vqrdmulhq_n_s16(v3048, 17826);
+    int16x8_t v3050 = vaddq_s16(v3044, v3049);
+    int16x8_t v3051 = vsubq_s16(v3017, v3019);
+    int16x8_t v3052 = vsubq_s16(v3021, v3023);
+    int16x8_t v3053 = vqrdmulhq_n_s16(v3052, 23761);
+    int16x8_t v3054 = vaddq_s16(v3051, v3053);
+    int16x8_t v3055 = vsubq_s16(v3027, v3029);
+    int16x8_t v3056 = vsubq_s16(v3031, v3033);
+    int16x8_t v3057 = vqrdmulhq_n_s16(v3056, 23761);
+    int16x8_t v3058 = vaddq_s16(v3055, v3057);
+    int16x8_t v3059 = vqrdmulhq_n_s16(v3058, 17826);
+    int16x8_t v3060 = vaddq_s16(v3054, v3059);
+    int16x8_t v3061 = vqrdmulhq_n_s16(v3060, 16726);
+    int16x8_t v3062 = vaddq_s16(v3050, v3061);
+    int16x8_t v3063 = vsubq_s16(v2949, v2951);
+    int16x8_t v3064 = vsubq_s16(v2953, v2955);
+    int16x8_t v3065 = vqrdmulhq_n_s16(v3064, 25084);
+    int16x8_t v3066 = vaddq_s16(v3063, v3065);
+    int16x8_t v3067 = vsubq_s16(v2959, v2961);
+    int16x8_t v3068 = vsubq_s16(v2963, v2965);
+    int16x8_t v3069 = vqrdmulhq_n_s16(v3068, 25084);
+    int16x8_t v3070 = vaddq_s16(v3067, v3069);
+    int16x8_t v3071 = vqrdmulhq_n_s16(v3070, 18021);
+    int16x8_t v3072 = vaddq_s16(v3066, v3071);
+    int16x8_t v3073 = vsubq_s16(v2971, v2973);
+    int16x8_t v3074 = vsubq_s16(v2975, v2977);
+    int16x8_t v3075 = vqrdmulhq_n_s16(v3074, 25084);
+    int16x8_t v3076 = vaddq_s16(v3073, v3075);
+    int16x8_t v3077 = vsubq_s16(v2981, v2983);
+    int16x8_t v3078 = vsubq_s16(v2985, v2987);
+    int16x8_t v3079 = vqrdmulhq_n_s16(v3078, 25084);
+    int16x8_t v3080 = vaddq_s16(v3077, v3079);
+    int16x8_t v3081 = vqrdmulhq_n_s16(v3080, 18021);
+    int16x8_t v3082 = vaddq_s16(v3076, v3081);
+    int16x8_t v3083 = vqrdmulhq_n_s16(v3082, 16769);
+    int16x8_t v3084 = vaddq_s16(v3072, v3083);
+    int16x8_t v3085 = vsubq_s16(v2903, v2905);
+    int16x8_t v3086 = vsubq_s16(v2907, v2909);
+    int16x8_t v3087 = vqrdmulhq_n_s16(v3086, 26631);
+    int16x8_t v3088 = vaddq_s16(v3085, v3087);
+    int16x8_t v3089 = vsubq_s16(v2913, v2915);
+    int16x8_t v3090 = vsubq_s16(v2917, v2919);
+    int16x8_t v3091 = vqrdmulhq_n_s16(v3090, 26631);
+    int16x8_t v3092 = vaddq_s16(v3089, v3091);
+    int16x8_t v3093 = vqrdmulhq_n_s16(v3092, 18231);
+    int16x8_t v3094 = vaddq_s16(v3088, v3093);
+    int16x8_t v3095 = vsubq_s16(v2925, v2927);
+    int16x8_t v3096 = vsubq_s16(v2929, v2931);
+    int16x8_t v3097 = vqrdmulhq_n_s16(v3096, 26631);
+    int16x8_t v3098 = vaddq_s16(v3095, v3097);
+    int16x8_t v3099 = vsubq_s16(v2935, v2937);
+    int16x8_t v3100 = vsubq_s16(v2939, v2941);
+    int16x8_t v3101 = vqrdmulhq_n_s16(v3100, 26631);
+    int16x8_t v3102 = vaddq_s16(v3099, v3101);
+    int16x8_t v3103 = vqrdmulhq_n_s16(v3102, 18231);
+    int16x8_t v3104 = vaddq_s16(v3098, v3103);
+    int16x8_t v3105 = vqrdmulhq_n_s16(v3104, 16815);
+    int16x8_t v3106 = vaddq_s16(v3094, v3105);
+    int16x8_t v3107 = vsubq_s16(v2857, v2859);
+    int16x8_t v3108 = vsubq_s16(v2861, v2863);
+    int16x8_t v3109 = vqrdmulhq_n_s16(v3108, 28454);
+    int16x8_t v3110 = vaddq_s16(v3107, v3109);
+    int16x8_t v3111 = vsubq_s16(v2867, v2869);
+    int16x8_t v3112 = vsubq_s16(v2871, v2873);
+    int16x8_t v3113 = vqrdmulhq_n_s16(v3112, 28454);
+    int16x8_t v3114 = vaddq_s16(v3111, v3113);
+    int16x8_t v3115 = vqrdmulhq_n_s16(v3114, 18458);
+    int16x8_t v3116 = vaddq_s16(v3110, v3115);
+    int16x8_t v3117 = vsubq_s16(v2879, v2881);
+    int16x8_t v3118 = vsubq_s16(v2883, v2885);
+    int16x8_t v3119 = vqrdmulhq_n_s16(v3118, 28454);
+    int16x8_t v3120 = vaddq_s16(v3117, v3119);
+    int16x8_t v3121 = vsubq_s16(v2889, v2891);
+    int16x8_t v3122 = vsubq_s16(v2893, v2895);
+    int16x8_t v3123 = vqrdmulhq_n_s16(v3122, 28454);
+    int16x8_t v3124 = vaddq_s16(v3121, v3123);
+    int16x8_t v3125 = vqrdmulhq_n_s16(v3124, 18458);
+    int16x8_t v3126 = vaddq_s16(v3120, v3125);
+    int16x8_t v3127 = vqrdmulhq_n_s16(v3126, 16865);
+    int16x8_t v3128 = vaddq_s16(v3116, v3127);
+    int16x8_t v3129 = vsubq_s16(v2811, v2813);
+    int16x8_t v3130 = vsubq_s16(v2815, v2817);
+    int16x8_t v3131 = vqrdmulhq_n_s16(v3130, 30624);
+    int16x8_t v3132 = vaddq_s16(v3129, v3131);
+    int16x8_t v3133 = vsubq_s16(v2821, v2823);
+    int16x8_t v3134 = vsubq_s16(v2825, v2827);
+    int16x8_t v3135 = vqrdmulhq_n_s16(v3134, 30624);
+    int16x8_t v3136 = vaddq_s16(v3133, v3135);
+    int16x8_t v3137 = vqrdmulhq_n_s16(v3136, 18702);
+    int16x8_t v3138 = vaddq_s16(v3132, v3137);
+    int16x8_t v3139 = vsubq_s16(v2833, v2835);
+    int16x8_t v3140 = vsubq_s16(v2837, v2839);
+    int16x8_t v3141 = vqrdmulhq_n_s16(v3140, 30624);
+    int16x8_t v3142 = vaddq_s16(v3139, v3141);
+    int16x8_t v3143 = vsubq_s16(v2843, v2845);
+    int16x8_t v3144 = vsubq_s16(v2847, v2849);
+    int16x8_t v3145 = vqrdmulhq_n_s16(v3144, 30624);
+    int16x8_t v3146 = vaddq_s16(v3143, v3145);
+    int16x8_t v3147 = vqrdmulhq_n_s16(v3146, 18702);
+    int16x8_t v3148 = vaddq_s16(v3142, v3147);
+    int16x8_t v3149 = vqrdmulhq_n_s16(v3148, 16916);
+    int16x8_t v3150 = vaddq_s16(v3138, v3149);
+    int16x8_t v3151 = vsubq_s16(v2765, v2767);
+    int16x8_t v3152 = vsubq_s16(v2769, v2771);
+    int16x8_t v3153_tmp = vqrdmulhq_n_s16(v3152, 472);
+    int16x8_t v3153 = vaddq_s16(v3153_tmp, v3152);
+    int16x8_t v3154 = vaddq_s16(v3151, v3153);
+    int16x8_t v3155 = vsubq_s16(v2775, v2777);
+    int16x8_t v3156 = vsubq_s16(v2779, v2781);
+    int16x8_t v3157_tmp = vqrdmulhq_n_s16(v3156, 472);
+    int16x8_t v3157 = vaddq_s16(v3157_tmp, v3156);
+    int16x8_t v3158 = vaddq_s16(v3155, v3157);
+    int16x8_t v3159 = vqrdmulhq_n_s16(v3158, 18964);
+    int16x8_t v3160 = vaddq_s16(v3154, v3159);
+    int16x8_t v3161 = vsubq_s16(v2787, v2789);
+    int16x8_t v3162 = vsubq_s16(v2791, v2793);
+    int16x8_t v3163_tmp = vqrdmulhq_n_s16(v3162, 472);
+    int16x8_t v3163 = vaddq_s16(v3163_tmp, v3162);
+    int16x8_t v3164 = vaddq_s16(v3161, v3163);
+    int16x8_t v3165 = vsubq_s16(v2797, v2799);
+    int16x8_t v3166 = vsubq_s16(v2801, v2803);
+    int16x8_t v3167_tmp = vqrdmulhq_n_s16(v3166, 472);
+    int16x8_t v3167 = vaddq_s16(v3167_tmp, v3166);
+    int16x8_t v3168 = vaddq_s16(v3165, v3167);
+    int16x8_t v3169 = vqrdmulhq_n_s16(v3168, 18964);
+    int16x8_t v3170 = vaddq_s16(v3164, v3169);
+    int16x8_t v3171 = vqrdmulhq_n_s16(v3170, 16971);
+    int16x8_t v3172 = vaddq_s16(v3160, v3171);
+    int16x8_t v3173 = vsubq_s16(v2719, v2721);
+    int16x8_t v3174 = vsubq_s16(v2723, v2725);
+    int16x8_t v3175_tmp = vqrdmulhq_n_s16(v3174, 3672);
+    int16x8_t v3175 = vaddq_s16(v3175_tmp, v3174);
+    int16x8_t v3176 = vaddq_s16(v3173, v3175);
+    int16x8_t v3177 = vsubq_s16(v2729, v2731);
+    int16x8_t v3178 = vsubq_s16(v2733, v2735);
+    int16x8_t v3179_tmp = vqrdmulhq_n_s16(v3178, 3672);
+    int16x8_t v3179 = vaddq_s16(v3179_tmp, v3178);
+    int16x8_t v3180 = vaddq_s16(v3177, v3179);
+    int16x8_t v3181 = vqrdmulhq_n_s16(v3180, 19245);
+    int16x8_t v3182 = vaddq_s16(v3176, v3181);
+    int16x8_t v3183 = vsubq_s16(v2741, v2743);
+    int16x8_t v3184 = vsubq_s16(v2745, v2747);
+    int16x8_t v3185_tmp = vqrdmulhq_n_s16(v3184, 3672);
+    int16x8_t v3185 = vaddq_s16(v3185_tmp, v3184);
+    int16x8_t v3186 = vaddq_s16(v3183, v3185);
+    int16x8_t v3187 = vsubq_s16(v2751, v2753);
+    int16x8_t v3188 = vsubq_s16(v2755, v2757);
+    int16x8_t v3189_tmp = vqrdmulhq_n_s16(v3188, 3672);
+    int16x8_t v3189 = vaddq_s16(v3189_tmp, v3188);
+    int16x8_t v3190 = vaddq_s16(v3187, v3189);
+    int16x8_t v3191 = vqrdmulhq_n_s16(v3190, 19245);
+    int16x8_t v3192 = vaddq_s16(v3186, v3191);
+    int16x8_t v3193 = vqrdmulhq_n_s16(v3192, 17029);
+    int16x8_t v3194 = vaddq_s16(v3182, v3193);
+    int16x8_t v3195 = vsubq_s16(v2673, v2675);
+    int16x8_t v3196 = vsubq_s16(v2677, v2679);
+    int16x8_t v3197_tmp = vqrdmulhq_n_s16(v3196, 7662);
+    int16x8_t v3197 = vaddq_s16(v3197_tmp, v3196);
+    int16x8_t v3198 = vaddq_s16(v3195, v3197);
+    int16x8_t v3199 = vsubq_s16(v2683, v2685);
+    int16x8_t v3200 = vsubq_s16(v2687, v2689);
+    int16x8_t v3201_tmp = vqrdmulhq_n_s16(v3200, 7662);
+    int16x8_t v3201 = vaddq_s16(v3201_tmp, v3200);
+    int16x8_t v3202 = vaddq_s16(v3199, v3201);
+    int16x8_t v3203 = vqrdmulhq_n_s16(v3202, 19546);
+    int16x8_t v3204 = vaddq_s16(v3198, v3203);
+    int16x8_t v3205 = vsubq_s16(v2695, v2697);
+    int16x8_t v3206 = vsubq_s16(v2699, v2701);
+    int16x8_t v3207_tmp = vqrdmulhq_n_s16(v3206, 7662);
+    int16x8_t v3207 = vaddq_s16(v3207_tmp, v3206);
+    int16x8_t v3208 = vaddq_s16(v3205, v3207);
+    int16x8_t v3209 = vsubq_s16(v2705, v2707);
+    int16x8_t v3210 = vsubq_s16(v2709, v2711);
+    int16x8_t v3211_tmp = vqrdmulhq_n_s16(v3210, 7662);
+    int16x8_t v3211 = vaddq_s16(v3211_tmp, v3210);
+    int16x8_t v3212 = vaddq_s16(v3209, v3211);
+    int16x8_t v3213 = vqrdmulhq_n_s16(v3212, 19546);
+    int16x8_t v3214 = vaddq_s16(v3208, v3213);
+    int16x8_t v3215 = vqrdmulhq_n_s16(v3214, 17090);
+    int16x8_t v3216 = vaddq_s16(v3204, v3215);
+    int16x8_t v3217 = vsubq_s16(v2582, v2587);
+    int16x8_t v3218 = vsubq_s16(v2592, v2597);
+    int16x8_t v3219_tmp = vqrdmulhq_n_s16(v3218, 12756);
+    int16x8_t v3219 = vaddq_s16(v3219_tmp, v3218);
+    int16x8_t v3220 = vaddq_s16(v3217, v3219);
+    int16x8_t v3221 = vsubq_s16(v2604, v2609);
+    int16x8_t v3222 = vsubq_s16(v2614, v2619);
+    int16x8_t v3223_tmp = vqrdmulhq_n_s16(v3222, 12756);
+    int16x8_t v3223 = vaddq_s16(v3223_tmp, v3222);
+    int16x8_t v3224 = vaddq_s16(v3221, v3223);
+    int16x8_t v3225 = vqrdmulhq_n_s16(v3224, 19869);
+    int16x8_t v3226 = vaddq_s16(v3220, v3225);
+    int16x8_t v3227 = vsubq_s16(v2628, v2633);
+    int16x8_t v3228 = vsubq_s16(v2638, v2643);
+    int16x8_t v3229_tmp = vqrdmulhq_n_s16(v3228, 12756);
+    int16x8_t v3229 = vaddq_s16(v3229_tmp, v3228);
+    int16x8_t v3230 = vaddq_s16(v3227, v3229);
+    int16x8_t v3231 = vsubq_s16(v2650, v2655);
+    int16x8_t v3232 = vsubq_s16(v2660, v2665);
+    int16x8_t v3233_tmp = vqrdmulhq_n_s16(v3232, 12756);
+    int16x8_t v3233 = vaddq_s16(v3233_tmp, v3232);
+    int16x8_t v3234 = vaddq_s16(v3231, v3233);
+    int16x8_t v3235 = vqrdmulhq_n_s16(v3234, 19869);
+    int16x8_t v3236 = vaddq_s16(v3230, v3235);
+    int16x8_t v3237 = vqrdmulhq_n_s16(v3236, 17153);
+    int16x8_t v3238 = vaddq_s16(v3226, v3237);
+    int16x8_t v3239 = vsubq_s16(v2488, v2493);
+    int16x8_t v3240 = vsubq_s16(v2498, v2503);
+    int16x8_t v3241_tmp = vqrdmulhq_n_s16(v3240, 19463);
+    int16x8_t v3241 = vaddq_s16(v3241_tmp, v3240);
+    int16x8_t v3242 = vaddq_s16(v3239, v3241);
+    int16x8_t v3243 = vsubq_s16(v2510, v2515);
+    int16x8_t v3244 = vsubq_s16(v2520, v2525);
+    int16x8_t v3245_tmp = vqrdmulhq_n_s16(v3244, 19463);
+    int16x8_t v3245 = vaddq_s16(v3245_tmp, v3244);
+    int16x8_t v3246 = vaddq_s16(v3243, v3245);
+    int16x8_t v3247 = vqrdmulhq_n_s16(v3246, 20216);
+    int16x8_t v3248 = vaddq_s16(v3242, v3247);
+    int16x8_t v3249 = vsubq_s16(v2534, v2539);
+    int16x8_t v3250 = vsubq_s16(v2544, v2549);
+    int16x8_t v3251_tmp = vqrdmulhq_n_s16(v3250, 19463);
+    int16x8_t v3251 = vaddq_s16(v3251_tmp, v3250);
+    int16x8_t v3252 = vaddq_s16(v3249, v3251);
+    int16x8_t v3253 = vsubq_s16(v2556, v2561);
+    int16x8_t v3254 = vsubq_s16(v2566, v2571);
+    int16x8_t v3255_tmp = vqrdmulhq_n_s16(v3254, 19463);
+    int16x8_t v3255 = vaddq_s16(v3255_tmp, v3254);
+    int16x8_t v3256 = vaddq_s16(v3253, v3255);
+    int16x8_t v3257 = vqrdmulhq_n_s16(v3256, 20216);
+    int16x8_t v3258 = vaddq_s16(v3252, v3257);
+    int16x8_t v3259 = vqrdmulhq_n_s16(v3258, 17220);
+    int16x8_t v3260 = vaddq_s16(v3248, v3259);
+    int16x8_t v3261 = vsubq_s16(v2393, v2398);
+    int16x8_t v3262 = vsubq_s16(v2403, v2408);
+    int16x8_t v3263_tmp = vqrdmulhq_n_s16(v3262, 28661);
+    int16x8_t v3263 = vaddq_s16(v3263_tmp, v3262);
+    int16x8_t v3264 = vaddq_s16(v3261, v3263);
+    int16x8_t v3265 = vsubq_s16(v2415, v2420);
+    int16x8_t v3266 = vsubq_s16(v2425, v2430);
+    int16x8_t v3267_tmp = vqrdmulhq_n_s16(v3266, 28661);
+    int16x8_t v3267 = vaddq_s16(v3267_tmp, v3266);
+    int16x8_t v3268 = vaddq_s16(v3265, v3267);
+    int16x8_t v3269 = vqrdmulhq_n_s16(v3268, 20587);
+    int16x8_t v3270 = vaddq_s16(v3264, v3269);
+    int16x8_t v3271 = vsubq_s16(v2439, v2444);
+    int16x8_t v3272 = vsubq_s16(v2449, v2454);
+    int16x8_t v3273_tmp = vqrdmulhq_n_s16(v3272, 28661);
+    int16x8_t v3273 = vaddq_s16(v3273_tmp, v3272);
+    int16x8_t v3274 = vaddq_s16(v3271, v3273);
+    int16x8_t v3275 = vsubq_s16(v2461, v2467);
+    int16x8_t v3276 = vsubq_s16(v2472, v2477);
+    int16x8_t v3277_tmp = vqrdmulhq_n_s16(v3276, 28661);
+    int16x8_t v3277 = vaddq_s16(v3277_tmp, v3276);
+    int16x8_t v3278 = vaddq_s16(v3275, v3277);
+    int16x8_t v3279 = vqrdmulhq_n_s16(v3278, 20587);
+    int16x8_t v3280 = vaddq_s16(v3274, v3279);
+    int16x8_t v3281 = vqrdmulhq_n_s16(v3280, 17290);
+    int16x8_t v3282 = vaddq_s16(v3270, v3281);
+    int16x8_t v3283 = vsubq_s16(v2299, v2304);
+    int16x8_t v3284 = vsubq_s16(v2309, v2314);
+    int16x8_t v3285_tmp = vqrdmulhq_n_s16(v3284, 9242);
+    int16x8_t v3285 = vmlaq_n_s16(v3285_tmp, v3284, 2);
+    int16x8_t v3286 = vaddq_s16(v3283, v3285);
+    int16x8_t v3287 = vsubq_s16(v2321, v2326);
+    int16x8_t v3288 = vsubq_s16(v2331, v2336);
+    int16x8_t v3289_tmp = vqrdmulhq_n_s16(v3288, 9242);
+    int16x8_t v3289 = vmlaq_n_s16(v3289_tmp, v3288, 2);
+    int16x8_t v3290 = vaddq_s16(v3287, v3289);
+    int16x8_t v3291 = vqrdmulhq_n_s16(v3290, 20985);
+    int16x8_t v3292 = vaddq_s16(v3286, v3291);
+    int16x8_t v3293 = vsubq_s16(v2345, v2350);
+    int16x8_t v3294 = vsubq_s16(v2355, v2360);
+    int16x8_t v3295_tmp = vqrdmulhq_n_s16(v3294, 9242);
+    int16x8_t v3295 = vmlaq_n_s16(v3295_tmp, v3294, 2);
+    int16x8_t v3296 = vaddq_s16(v3293, v3295);
+    int16x8_t v3297 = vsubq_s16(v2367, v2372);
+    int16x8_t v3298 = vsubq_s16(v2377, v2382);
+    int16x8_t v3299_tmp = vqrdmulhq_n_s16(v3298, 9242);
+    int16x8_t v3299 = vmlaq_n_s16(v3299_tmp, v3298, 2);
+    int16x8_t v3300 = vaddq_s16(v3297, v3299);
+    int16x8_t v3301 = vqrdmulhq_n_s16(v3300, 20985);
+    int16x8_t v3302 = vaddq_s16(v3296, v3301);
+    int16x8_t v3303 = vqrdmulhq_n_s16(v3302, 17363);
+    int16x8_t v3304 = vaddq_s16(v3292, v3303);
+    int16x8_t v3305 = vsubq_s16(v2115, v2126);
+    int16x8_t v3306 = vsubq_s16(v2137, v2148);
+    int16x8_t v3307_tmp = vqrdmulhq_n_s16(v3306, 30298);
+    int16x8_t v3307 = vmlaq_n_s16(v3307_tmp, v3306, 2);
+    int16x8_t v3308 = vaddq_s16(v3305, v3307);
+    int16x8_t v3309 = vsubq_s16(v2161, v2172);
+    int16x8_t v3310 = vsubq_s16(v2183, v2194);
+    int16x8_t v3311_tmp = vqrdmulhq_n_s16(v3310, 30298);
+    int16x8_t v3311 = vmlaq_n_s16(v3311_tmp, v3310, 2);
+    int16x8_t v3312 = vaddq_s16(v3309, v3311);
+    int16x8_t v3313 = vqrdmulhq_n_s16(v3312, 21412);
+    int16x8_t v3314 = vaddq_s16(v3308, v3313);
+    int16x8_t v3315 = vsubq_s16(v2209, v2220);
+    int16x8_t v3316 = vsubq_s16(v2231, v2242);
+    int16x8_t v3317_tmp = vqrdmulhq_n_s16(v3316, 30298);
+    int16x8_t v3317 = vmlaq_n_s16(v3317_tmp, v3316, 2);
+    int16x8_t v3318 = vaddq_s16(v3315, v3317);
+    int16x8_t v3319 = vsubq_s16(v2255, v2266);
+    int16x8_t v3320 = vsubq_s16(v2277, v2288);
+    int16x8_t v3321_tmp = vqrdmulhq_n_s16(v3320, 30298);
+    int16x8_t v3321 = vmlaq_n_s16(v3321_tmp, v3320, 2);
+    int16x8_t v3322 = vaddq_s16(v3319, v3321);
+    int16x8_t v3323 = vqrdmulhq_n_s16(v3322, 21412);
+    int16x8_t v3324 = vaddq_s16(v3318, v3323);
+    int16x8_t v3325 = vqrdmulhq_n_s16(v3324, 17440);
+    int16x8_t v3326 = vaddq_s16(v3314, v3325);
+    int16x8_t v3327 = vsubq_s16(v1925, v1936);
+    int16x8_t v3328 = vsubq_s16(v1947, v1958);
+    int16x8_t v3329_tmp = vqrdmulhq_n_s16(v3328, 2773);
+    int16x8_t v3329 = vmlaq_n_s16(v3329_tmp, v3328, 4);
+    int16x8_t v3330 = vaddq_s16(v3327, v3329);
+    int16x8_t v3331 = vsubq_s16(v1971, v1982);
+    int16x8_t v3332 = vsubq_s16(v1993, v2004);
+    int16x8_t v3333_tmp = vqrdmulhq_n_s16(v3332, 2773);
+    int16x8_t v3333 = vmlaq_n_s16(v3333_tmp, v3332, 4);
+    int16x8_t v3334 = vaddq_s16(v3331, v3333);
+    int16x8_t v3335 = vqrdmulhq_n_s16(v3334, 21871);
+    int16x8_t v3336 = vaddq_s16(v3330, v3335);
+    int16x8_t v3337 = vsubq_s16(v2019, v2030);
+    int16x8_t v3338 = vsubq_s16(v2041, v2052);
+    int16x8_t v3339_tmp = vqrdmulhq_n_s16(v3338, 2773);
+    int16x8_t v3339 = vmlaq_n_s16(v3339_tmp, v3338, 4);
+    int16x8_t v3340 = vaddq_s16(v3337, v3339);
+    int16x8_t v3341 = vsubq_s16(v2065, v2076);
+    int16x8_t v3342 = vsubq_s16(v2087, v2098);
+    int16x8_t v3343_tmp = vqrdmulhq_n_s16(v3342, 2773);
+    int16x8_t v3343 = vmlaq_n_s16(v3343_tmp, v3342, 4);
+    int16x8_t v3344 = vaddq_s16(v3341, v3343);
+    int16x8_t v3345 = vqrdmulhq_n_s16(v3344, 21871);
+    int16x8_t v3346 = vaddq_s16(v3340, v3345);
+    int16x8_t v3347 = vqrdmulhq_n_s16(v3346, 17520);
+    int16x8_t v3348 = vaddq_s16(v3336, v3347);
+    int16x8_t v3349 = vsubq_s16(v1555, v1578);
+    int16x8_t v3350 = vsubq_s16(v1601, v1624);
+    int16x8_t v3351_tmp = vqrdmulhq_n_s16(v3350, 26108);
+    int16x8_t v3351 = vmlaq_n_s16(v3351_tmp, v3350, 6);
+    int16x8_t v3352 = vaddq_s16(v3349, v3351);
+    int16x8_t v3353 = vsubq_s16(v1649, v1672);
+    int16x8_t v3354 = vsubq_s16(v1695, v1718);
+    int16x8_t v3355_tmp = vqrdmulhq_n_s16(v3354, 26108);
+    int16x8_t v3355 = vmlaq_n_s16(v3355_tmp, v3354, 6);
+    int16x8_t v3356 = vaddq_s16(v3353, v3355);
+    int16x8_t v3357 = vqrdmulhq_n_s16(v3356, 22363);
+    int16x8_t v3358 = vaddq_s16(v3352, v3357);
+    int16x8_t v3359 = vsubq_s16(v1745, v1768);
+    int16x8_t v3360 = vsubq_s16(v1791, v1814);
+    int16x8_t v3361_tmp = vqrdmulhq_n_s16(v3360, 26108);
+    int16x8_t v3361 = vmlaq_n_s16(v3361_tmp, v3360, 6);
+    int16x8_t v3362 = vaddq_s16(v3359, v3361);
+    int16x8_t v3363 = vsubq_s16(v1839, v1862);
+    int16x8_t v3364 = vsubq_s16(v1885, v1908);
+    int16x8_t v3365_tmp = vqrdmulhq_n_s16(v3364, 26108);
+    int16x8_t v3365 = vmlaq_n_s16(v3365_tmp, v3364, 6);
+    int16x8_t v3366 = vaddq_s16(v3363, v3365);
+    int16x8_t v3367 = vqrdmulhq_n_s16(v3366, 22363);
+    int16x8_t v3368 = vaddq_s16(v3362, v3367);
+    int16x8_t v3369 = vqrdmulhq_n_s16(v3368, 17603);
+    int16x8_t v3370 = vaddq_s16(v3358, v3369);
+    int16x8_t v3371 = vsubq_s16(v61, v140);
+    int16x8_t v3372 = vsubq_s16(v234, v314);
+    int16x8_t v3373_tmp = vqrdmulhq_n_s16(v3372, 12251);
+    int16x8_t v3373 = vmlaq_n_s16(v3373_tmp, v3372, 20);
+    int16x8_t v3374 = vaddq_s16(v3371, v3373);
+    int16x8_t v3375 = vsubq_s16(v410, v521);
+    int16x8_t v3376 = vsubq_s16(v615, v696);
+    int16x8_t v3377_tmp = vqrdmulhq_n_s16(v3376, 12251);
+    int16x8_t v3377 = vmlaq_n_s16(v3377_tmp, v3376, 20);
+    int16x8_t v3378 = vaddq_s16(v3375, v3377);
+    int16x8_t v3379 = vqrdmulhq_n_s16(v3378, 22891);
+    int16x8_t v3380 = vaddq_s16(v3374, v3379);
+    int16x8_t v3381 = vsubq_s16(v794, v905);
+    int16x8_t v3382 = vsubq_s16(v1061, v1143);
+    int16x8_t v3383_tmp = vqrdmulhq_n_s16(v3382, 12251);
+    int16x8_t v3383 = vmlaq_n_s16(v3383_tmp, v3382, 20);
+    int16x8_t v3384 = vaddq_s16(v3381, v3383);
+    int16x8_t v3385 = vsubq_s16(v1239, v1350);
+    int16x8_t v3386 = vsubq_s16(v1444, v1526);
+    int16x8_t v3387_tmp = vqrdmulhq_n_s16(v3386, 12251);
+    int16x8_t v3387 = vmlaq_n_s16(v3387_tmp, v3386, 20);
+    int16x8_t v3388 = vaddq_s16(v3385, v3387);
+    int16x8_t v3389 = vqrdmulhq_n_s16(v3388, 22891);
+    int16x8_t v3390 = vaddq_s16(v3384, v3389);
+    int16x8_t v3391 = vqrdmulhq_n_s16(v3390, 17689);
+    int16x8_t v3392 = vaddq_s16(v3380, v3391);
+    int16x8_t v3393 = vsubq_s16(v3371, v3373);
+    int16x8_t v3394 = vsubq_s16(v3375, v3377);
+    int16x8_t v3395 = vqrdmulhq_n_s16(v3394, 23460);
+    int16x8_t v3396 = vaddq_s16(v3393, v3395);
+    int16x8_t v3397 = vsubq_s16(v3381, v3383);
+    int16x8_t v3398 = vsubq_s16(v3385, v3387);
+    int16x8_t v3399 = vqrdmulhq_n_s16(v3398, 23460);
+    int16x8_t v3400 = vaddq_s16(v3397, v3399);
+    int16x8_t v3401 = vqrdmulhq_n_s16(v3400, 17779);
+    int16x8_t v3402 = vaddq_s16(v3396, v3401);
+    int16x8_t v3403 = vsubq_s16(v3349, v3351);
+    int16x8_t v3404 = vsubq_s16(v3353, v3355);
+    int16x8_t v3405 = vqrdmulhq_n_s16(v3404, 24073);
+    int16x8_t v3406 = vaddq_s16(v3403, v3405);
+    int16x8_t v3407 = vsubq_s16(v3359, v3361);
+    int16x8_t v3408 = vsubq_s16(v3363, v3365);
+    int16x8_t v3409 = vqrdmulhq_n_s16(v3408, 24073);
+    int16x8_t v3410 = vaddq_s16(v3407, v3409);
+    int16x8_t v3411 = vqrdmulhq_n_s16(v3410, 17873);
+    int16x8_t v3412 = vaddq_s16(v3406, v3411);
+    int16x8_t v3413 = vsubq_s16(v3327, v3329);
+    int16x8_t v3414 = vsubq_s16(v3331, v3333);
+    int16x8_t v3415 = vqrdmulhq_n_s16(v3414, 24734);
+    int16x8_t v3416 = vaddq_s16(v3413, v3415);
+    int16x8_t v3417 = vsubq_s16(v3337, v3339);
+    int16x8_t v3418 = vsubq_s16(v3341, v3343);
+    int16x8_t v3419 = vqrdmulhq_n_s16(v3418, 24734);
+    int16x8_t v3420 = vaddq_s16(v3417, v3419);
+    int16x8_t v3421 = vqrdmulhq_n_s16(v3420, 17971);
+    int16x8_t v3422 = vaddq_s16(v3416, v3421);
+    int16x8_t v3423 = vsubq_s16(v3305, v3307);
+    int16x8_t v3424 = vsubq_s16(v3309, v3311);
+    int16x8_t v3425 = vqrdmulhq_n_s16(v3424, 25448);
+    int16x8_t v3426 = vaddq_s16(v3423, v3425);
+    int16x8_t v3427 = vsubq_s16(v3315, v3317);
+    int16x8_t v3428 = vsubq_s16(v3319, v3321);
+    int16x8_t v3429 = vqrdmulhq_n_s16(v3428, 25448);
+    int16x8_t v3430 = vaddq_s16(v3427, v3429);
+    int16x8_t v3431 = vqrdmulhq_n_s16(v3430, 18072);
+    int16x8_t v3432 = vaddq_s16(v3426, v3431);
+    int16x8_t v3433 = vsubq_s16(v3283, v3285);
+    int16x8_t v3434 = vsubq_s16(v3287, v3289);
+    int16x8_t v3435 = vqrdmulhq_n_s16(v3434, 26220);
+    int16x8_t v3436 = vaddq_s16(v3433, v3435);
+    int16x8_t v3437 = vsubq_s16(v3293, v3295);
+    int16x8_t v3438 = vsubq_s16(v3297, v3299);
+    int16x8_t v3439 = vqrdmulhq_n_s16(v3438, 26220);
+    int16x8_t v3440 = vaddq_s16(v3437, v3439);
+    int16x8_t v3441 = vqrdmulhq_n_s16(v3440, 18177);
+    int16x8_t v3442 = vaddq_s16(v3436, v3441);
+    int16x8_t v3443 = vsubq_s16(v3261, v3263);
+    int16x8_t v3444 = vsubq_s16(v3265, v3267);
+    int16x8_t v3445 = vqrdmulhq_n_s16(v3444, 27058);
+    int16x8_t v3446 = vaddq_s16(v3443, v3445);
+    int16x8_t v3447 = vsubq_s16(v3271, v3273);
+    int16x8_t v3448 = vsubq_s16(v3275, v3277);
+    int16x8_t v3449 = vqrdmulhq_n_s16(v3448, 27058);
+    int16x8_t v3450 = vaddq_s16(v3447, v3449);
+    int16x8_t v3451 = vqrdmulhq_n_s16(v3450, 18286);
+    int16x8_t v3452 = vaddq_s16(v3446, v3451);
+    int16x8_t v3453 = vsubq_s16(v3239, v3241);
+    int16x8_t v3454 = vsubq_s16(v3243, v3245);
+    int16x8_t v3455 = vqrdmulhq_n_s16(v3454, 27969);
+    int16x8_t v3456 = vaddq_s16(v3453, v3455);
+    int16x8_t v3457 = vsubq_s16(v3249, v3251);
+    int16x8_t v3458 = vsubq_s16(v3253, v3255);
+    int16x8_t v3459 = vqrdmulhq_n_s16(v3458, 27969);
+    int16x8_t v3460 = vaddq_s16(v3457, v3459);
+    int16x8_t v3461 = vqrdmulhq_n_s16(v3460, 18400);
+    int16x8_t v3462 = vaddq_s16(v3456, v3461);
+    int16x8_t v3463 = vsubq_s16(v3217, v3219);
+    int16x8_t v3464 = vsubq_s16(v3221, v3223);
+    int16x8_t v3465 = vqrdmulhq_n_s16(v3464, 28961);
+    int16x8_t v3466 = vaddq_s16(v3463, v3465);
+    int16x8_t v3467 = vsubq_s16(v3227, v3229);
+    int16x8_t v3468 = vsubq_s16(v3231, v3233);
+    int16x8_t v3469 = vqrdmulhq_n_s16(v3468, 28961);
+    int16x8_t v3470 = vaddq_s16(v3467, v3469);
+    int16x8_t v3471 = vqrdmulhq_n_s16(v3470, 18517);
+    int16x8_t v3472 = vaddq_s16(v3466, v3471);
+    int16x8_t v3473 = vsubq_s16(v3195, v3197);
+    int16x8_t v3474 = vsubq_s16(v3199, v3201);
+    int16x8_t v3475 = vqrdmulhq_n_s16(v3474, 30044);
+    int16x8_t v3476 = vaddq_s16(v3473, v3475);
+    int16x8_t v3477 = vsubq_s16(v3205, v3207);
+    int16x8_t v3478 = vsubq_s16(v3209, v3211);
+    int16x8_t v3479 = vqrdmulhq_n_s16(v3478, 30044);
+    int16x8_t v3480 = vaddq_s16(v3477, v3479);
+    int16x8_t v3481 = vqrdmulhq_n_s16(v3480, 18639);
+    int16x8_t v3482 = vaddq_s16(v3476, v3481);
+    int16x8_t v3483 = vsubq_s16(v3173, v3175);
+    int16x8_t v3484 = vsubq_s16(v3177, v3179);
+    int16x8_t v3485 = vqrdmulhq_n_s16(v3484, 31232);
+    int16x8_t v3486 = vaddq_s16(v3483, v3485);
+    int16x8_t v3487 = vsubq_s16(v3183, v3185);
+    int16x8_t v3488 = vsubq_s16(v3187, v3189);
+    int16x8_t v3489 = vqrdmulhq_n_s16(v3488, 31232);
+    int16x8_t v3490 = vaddq_s16(v3487, v3489);
+    int16x8_t v3491 = vqrdmulhq_n_s16(v3490, 18765);
+    int16x8_t v3492 = vaddq_s16(v3486, v3491);
+    int16x8_t v3493 = vsubq_s16(v3151, v3153);
+    int16x8_t v3494 = vsubq_s16(v3155, v3157);
+    int16x8_t v3495 = vqrdmulhq_n_s16(v3494, 32538);
+    int16x8_t v3496 = vaddq_s16(v3493, v3495);
+    int16x8_t v3497 = vsubq_s16(v3161, v3163);
+    int16x8_t v3498 = vsubq_s16(v3165, v3167);
+    int16x8_t v3499 = vqrdmulhq_n_s16(v3498, 32538);
+    int16x8_t v3500 = vaddq_s16(v3497, v3499);
+    int16x8_t v3501 = vqrdmulhq_n_s16(v3500, 18896);
+    int16x8_t v3502 = vaddq_s16(v3496, v3501);
+    int16x8_t v3503 = vsubq_s16(v3129, v3131);
+    int16x8_t v3504 = vsubq_s16(v3133, v3135);
+    int16x8_t v3505_tmp = vqrdmulhq_n_s16(v3504, 1211);
+    int16x8_t v3505 = vaddq_s16(v3505_tmp, v3504);
+    int16x8_t v3506 = vaddq_s16(v3503, v3505);
+    int16x8_t v3507 = vsubq_s16(v3139, v3141);
+    int16x8_t v3508 = vsubq_s16(v3143, v3145);
+    int16x8_t v3509_tmp = vqrdmulhq_n_s16(v3508, 1211);
+    int16x8_t v3509 = vaddq_s16(v3509_tmp, v3508);
+    int16x8_t v3510 = vaddq_s16(v3507, v3509);
+    int16x8_t v3511 = vqrdmulhq_n_s16(v3510, 19032);
+    int16x8_t v3512 = vaddq_s16(v3506, v3511);
+    int16x8_t v3513 = vsubq_s16(v3107, v3109);
+    int16x8_t v3514 = vsubq_s16(v3111, v3113);
+    int16x8_t v3515_tmp = vqrdmulhq_n_s16(v3514, 2808);
+    int16x8_t v3515 = vaddq_s16(v3515_tmp, v3514);
+    int16x8_t v3516 = vaddq_s16(v3513, v3515);
+    int16x8_t v3517 = vsubq_s16(v3117, v3119);
+    int16x8_t v3518 = vsubq_s16(v3121, v3123);
+    int16x8_t v3519_tmp = vqrdmulhq_n_s16(v3518, 2808);
+    int16x8_t v3519 = vaddq_s16(v3519_tmp, v3518);
+    int16x8_t v3520 = vaddq_s16(v3517, v3519);
+    int16x8_t v3521 = vqrdmulhq_n_s16(v3520, 19172);
+    int16x8_t v3522 = vaddq_s16(v3516, v3521);
+    int16x8_t v3523 = vsubq_s16(v3085, v3087);
+    int16x8_t v3524 = vsubq_s16(v3089, v3091);
+    int16x8_t v3525_tmp = vqrdmulhq_n_s16(v3524, 4586);
+    int16x8_t v3525 = vaddq_s16(v3525_tmp, v3524);
+    int16x8_t v3526 = vaddq_s16(v3523, v3525);
+    int16x8_t v3527 = vsubq_s16(v3095, v3097);
+    int16x8_t v3528 = vsubq_s16(v3099, v3101);
+    int16x8_t v3529_tmp = vqrdmulhq_n_s16(v3528, 4586);
+    int16x8_t v3529 = vaddq_s16(v3529_tmp, v3528);
+    int16x8_t v3530 = vaddq_s16(v3527, v3529);
+    int16x8_t v3531 = vqrdmulhq_n_s16(v3530, 19318);
+    int16x8_t v3532 = vaddq_s16(v3526, v3531);
+    int16x8_t v3533 = vsubq_s16(v3063, v3065);
+    int16x8_t v3534 = vsubq_s16(v3067, v3069);
+    int16x8_t v3535_tmp = vqrdmulhq_n_s16(v3534, 6576);
+    int16x8_t v3535 = vaddq_s16(v3535_tmp, v3534);
+    int16x8_t v3536 = vaddq_s16(v3533, v3535);
+    int16x8_t v3537 = vsubq_s16(v3073, v3075);
+    int16x8_t v3538 = vsubq_s16(v3077, v3079);
+    int16x8_t v3539_tmp = vqrdmulhq_n_s16(v3538, 6576);
+    int16x8_t v3539 = vaddq_s16(v3539_tmp, v3538);
+    int16x8_t v3540 = vaddq_s16(v3537, v3539);
+    int16x8_t v3541 = vqrdmulhq_n_s16(v3540, 19469);
+    int16x8_t v3542 = vaddq_s16(v3536, v3541);
+    int16x8_t v3543 = vsubq_s16(v3041, v3043);
+    int16x8_t v3544 = vsubq_s16(v3045, v3047);
+    int16x8_t v3545_tmp = vqrdmulhq_n_s16(v3544, 8817);
+    int16x8_t v3545 = vaddq_s16(v3545_tmp, v3544);
+    int16x8_t v3546 = vaddq_s16(v3543, v3545);
+    int16x8_t v3547 = vsubq_s16(v3051, v3053);
+    int16x8_t v3548 = vsubq_s16(v3055, v3057);
+    int16x8_t v3549_tmp = vqrdmulhq_n_s16(v3548, 8817);
+    int16x8_t v3549 = vaddq_s16(v3549_tmp, v3548);
+    int16x8_t v3550 = vaddq_s16(v3547, v3549);
+    int16x8_t v3551 = vqrdmulhq_n_s16(v3550, 19625);
+    int16x8_t v3552 = vaddq_s16(v3546, v3551);
+    int16x8_t v3553 = vsubq_s16(v2998, v3003);
+    int16x8_t v3554 = vsubq_s16(v3008, v3013);
+    int16x8_t v3555_tmp = vqrdmulhq_n_s16(v3554, 11356);
+    int16x8_t v3555 = vaddq_s16(v3555_tmp, v3554);
+    int16x8_t v3556 = vaddq_s16(v3553, v3555);
+    int16x8_t v3557 = vsubq_s16(v3020, v3025);
+    int16x8_t v3558 = vsubq_s16(v3030, v3035);
+    int16x8_t v3559_tmp = vqrdmulhq_n_s16(v3558, 11356);
+    int16x8_t v3559 = vaddq_s16(v3559_tmp, v3558);
+    int16x8_t v3560 = vaddq_s16(v3557, v3559);
+    int16x8_t v3561 = vqrdmulhq_n_s16(v3560, 19786);
+    int16x8_t v3562 = vaddq_s16(v3556, v3561);
+    int16x8_t v3563 = vsubq_s16(v2952, v2957);
+    int16x8_t v3564 = vsubq_s16(v2962, v2967);
+    int16x8_t v3565_tmp = vqrdmulhq_n_s16(v3564, 14256);
+    int16x8_t v3565 = vaddq_s16(v3565_tmp, v3564);
+    int16x8_t v3566 = vaddq_s16(v3563, v3565);
+    int16x8_t v3567 = vsubq_s16(v2974, v2979);
+    int16x8_t v3568 = vsubq_s16(v2984, v2989);
+    int16x8_t v3569_tmp = vqrdmulhq_n_s16(v3568, 14256);
+    int16x8_t v3569 = vaddq_s16(v3569_tmp, v3568);
+    int16x8_t v3570 = vaddq_s16(v3567, v3569);
+    int16x8_t v3571 = vqrdmulhq_n_s16(v3570, 19954);
+    int16x8_t v3572 = vaddq_s16(v3566, v3571);
+    int16x8_t v3573 = vsubq_s16(v2906, v2911);
+    int16x8_t v3574 = vsubq_s16(v2916, v2921);
+    int16x8_t v3575_tmp = vqrdmulhq_n_s16(v3574, 17596);
+    int16x8_t v3575 = vaddq_s16(v3575_tmp, v3574);
+    int16x8_t v3576 = vaddq_s16(v3573, v3575);
+    int16x8_t v3577 = vsubq_s16(v2928, v2933);
+    int16x8_t v3578 = vsubq_s16(v2938, v2943);
+    int16x8_t v3579_tmp = vqrdmulhq_n_s16(v3578, 17596);
+    int16x8_t v3579 = vaddq_s16(v3579_tmp, v3578);
+    int16x8_t v3580 = vaddq_s16(v3577, v3579);
+    int16x8_t v3581 = vqrdmulhq_n_s16(v3580, 20127);
+    int16x8_t v3582 = vaddq_s16(v3576, v3581);
+    int16x8_t v3583 = vsubq_s16(v2860, v2865);
+    int16x8_t v3584 = vsubq_s16(v2870, v2875);
+    int16x8_t v3585_tmp = vqrdmulhq_n_s16(v3584, 21483);
+    int16x8_t v3585 = vaddq_s16(v3585_tmp, v3584);
+    int16x8_t v3586 = vaddq_s16(v3583, v3585);
+    int16x8_t v3587 = vsubq_s16(v2882, v2887);
+    int16x8_t v3588 = vsubq_s16(v2892, v2897);
+    int16x8_t v3589_tmp = vqrdmulhq_n_s16(v3588, 21483);
+    int16x8_t v3589 = vaddq_s16(v3589_tmp, v3588);
+    int16x8_t v3590 = vaddq_s16(v3587, v3589);
+    int16x8_t v3591 = vqrdmulhq_n_s16(v3590, 20306);
+    int16x8_t v3592 = vaddq_s16(v3586, v3591);
+    int16x8_t v3593 = vsubq_s16(v2814, v2819);
+    int16x8_t v3594 = vsubq_s16(v2824, v2829);
+    int16x8_t v3595_tmp = vqrdmulhq_n_s16(v3594, 26057);
+    int16x8_t v3595 = vaddq_s16(v3595_tmp, v3594);
+    int16x8_t v3596 = vaddq_s16(v3593, v3595);
+    int16x8_t v3597 = vsubq_s16(v2836, v2841);
+    int16x8_t v3598 = vsubq_s16(v2846, v2851);
+    int16x8_t v3599_tmp = vqrdmulhq_n_s16(v3598, 26057);
+    int16x8_t v3599 = vaddq_s16(v3599_tmp, v3598);
+    int16x8_t v3600 = vaddq_s16(v3597, v3599);
+    int16x8_t v3601 = vqrdmulhq_n_s16(v3600, 20492);
+    int16x8_t v3602 = vaddq_s16(v3596, v3601);
+    int16x8_t v3603 = vsubq_s16(v2768, v2773);
+    int16x8_t v3604 = vsubq_s16(v2778, v2783);
+    int16x8_t v3605_tmp = vqrdmulhq_n_s16(v3604, 31517);
+    int16x8_t v3605 = vaddq_s16(v3605_tmp, v3604);
+    int16x8_t v3606 = vaddq_s16(v3603, v3605);
+    int16x8_t v3607 = vsubq_s16(v2790, v2795);
+    int16x8_t v3608 = vsubq_s16(v2800, v2805);
+    int16x8_t v3609_tmp = vqrdmulhq_n_s16(v3608, 31517);
+    int16x8_t v3609 = vaddq_s16(v3609_tmp, v3608);
+    int16x8_t v3610 = vaddq_s16(v3607, v3609);
+    int16x8_t v3611 = vqrdmulhq_n_s16(v3610, 20684);
+    int16x8_t v3612 = vaddq_s16(v3606, v3611);
+    int16x8_t v3613 = vsubq_s16(v2722, v2727);
+    int16x8_t v3614 = vsubq_s16(v2732, v2737);
+    int16x8_t v3615_tmp = vqrdmulhq_n_s16(v3614, 5373);
+    int16x8_t v3615 = vmlaq_n_s16(v3615_tmp, v3614, 2);
+    int16x8_t v3616 = vaddq_s16(v3613, v3615);
+    int16x8_t v3617 = vsubq_s16(v2744, v2749);
+    int16x8_t v3618 = vsubq_s16(v2754, v2759);
+    int16x8_t v3619_tmp = vqrdmulhq_n_s16(v3618, 5373);
+    int16x8_t v3619 = vmlaq_n_s16(v3619_tmp, v3618, 2);
+    int16x8_t v3620 = vaddq_s16(v3617, v3619);
+    int16x8_t v3621 = vqrdmulhq_n_s16(v3620, 20883);
+    int16x8_t v3622 = vaddq_s16(v3616, v3621);
+    int16x8_t v3623 = vsubq_s16(v2676, v2681);
+    int16x8_t v3624 = vsubq_s16(v2686, v2691);
+    int16x8_t v3625_tmp = vqrdmulhq_n_s16(v3624, 13571);
+    int16x8_t v3625 = vmlaq_n_s16(v3625_tmp, v3624, 2);
+    int16x8_t v3626 = vaddq_s16(v3623, v3625);
+    int16x8_t v3627 = vsubq_s16(v2698, v2703);
+    int16x8_t v3628 = vsubq_s16(v2708, v2713);
+    int16x8_t v3629_tmp = vqrdmulhq_n_s16(v3628, 13571);
+    int16x8_t v3629 = vmlaq_n_s16(v3629_tmp, v3628, 2);
+    int16x8_t v3630 = vaddq_s16(v3627, v3629);
+    int16x8_t v3631 = vqrdmulhq_n_s16(v3630, 21089);
+    int16x8_t v3632 = vaddq_s16(v3626, v3631);
+    int16x8_t v3633 = vsubq_s16(v2588, v2599);
+    int16x8_t v3634 = vsubq_s16(v2610, v2621);
+    int16x8_t v3635_tmp = vqrdmulhq_n_s16(v3634, 23975);
+    int16x8_t v3635 = vmlaq_n_s16(v3635_tmp, v3634, 2);
+    int16x8_t v3636 = vaddq_s16(v3633, v3635);
+    int16x8_t v3637 = vsubq_s16(v2634, v2645);
+    int16x8_t v3638 = vsubq_s16(v2656, v2667);
+    int16x8_t v3639_tmp = vqrdmulhq_n_s16(v3638, 23975);
+    int16x8_t v3639 = vmlaq_n_s16(v3639_tmp, v3638, 2);
+    int16x8_t v3640 = vaddq_s16(v3637, v3639);
+    int16x8_t v3641 = vqrdmulhq_n_s16(v3640, 21303);
+    int16x8_t v3642 = vaddq_s16(v3636, v3641);
+    int16x8_t v3643 = vsubq_s16(v2494, v2505);
+    int16x8_t v3644 = vsubq_s16(v2516, v2527);
+    int16x8_t v3645_tmp = vqrdmulhq_n_s16(v3644, 4832);
+    int16x8_t v3645 = vmlaq_n_s16(v3645_tmp, v3644, 3);
+    int16x8_t v3646 = vaddq_s16(v3643, v3645);
+    int16x8_t v3647 = vsubq_s16(v2540, v2551);
+    int16x8_t v3648 = vsubq_s16(v2562, v2573);
+    int16x8_t v3649_tmp = vqrdmulhq_n_s16(v3648, 4832);
+    int16x8_t v3649 = vmlaq_n_s16(v3649_tmp, v3648, 3);
+    int16x8_t v3650 = vaddq_s16(v3647, v3649);
+    int16x8_t v3651 = vqrdmulhq_n_s16(v3650, 21524);
+    int16x8_t v3652 = vaddq_s16(v3646, v3651);
+    int16x8_t v3653 = vsubq_s16(v2399, v2410);
+    int16x8_t v3654 = vsubq_s16(v2421, v2432);
+    int16x8_t v3655_tmp = vqrdmulhq_n_s16(v3654, 23437);
+    int16x8_t v3655 = vmlaq_n_s16(v3655_tmp, v3654, 3);
+    int16x8_t v3656 = vaddq_s16(v3653, v3655);
+    int16x8_t v3657 = vsubq_s16(v2445, v2456);
+    int16x8_t v3658 = vsubq_s16(v2468, v2479);
+    int16x8_t v3659_tmp = vqrdmulhq_n_s16(v3658, 23437);
+    int16x8_t v3659 = vmlaq_n_s16(v3659_tmp, v3658, 3);
+    int16x8_t v3660 = vaddq_s16(v3657, v3659);
+    int16x8_t v3661 = vqrdmulhq_n_s16(v3660, 21753);
+    int16x8_t v3662 = vaddq_s16(v3656, v3661);
+    int16x8_t v3663 = vsubq_s16(v2305, v2316);
+    int16x8_t v3664 = vsubq_s16(v2327, v2338);
+    int16x8_t v3665_tmp = vqrdmulhq_n_s16(v3664, 17573);
+    int16x8_t v3665 = vmlaq_n_s16(v3665_tmp, v3664, 4);
+    int16x8_t v3666 = vaddq_s16(v3663, v3665);
+    int16x8_t v3667 = vsubq_s16(v2351, v2362);
+    int16x8_t v3668 = vsubq_s16(v2373, v2384);
+    int16x8_t v3669_tmp = vqrdmulhq_n_s16(v3668, 17573);
+    int16x8_t v3669 = vmlaq_n_s16(v3669_tmp, v3668, 4);
+    int16x8_t v3670 = vaddq_s16(v3667, v3669);
+    int16x8_t v3671 = vqrdmulhq_n_s16(v3670, 21990);
+    int16x8_t v3672 = vaddq_s16(v3666, v3671);
+    int16x8_t v3673 = vsubq_s16(v2127, v2150);
+    int16x8_t v3674 = vsubq_s16(v2173, v2196);
+    int16x8_t v3675_tmp = vqrdmulhq_n_s16(v3674, 27122);
+    int16x8_t v3675 = vmlaq_n_s16(v3675_tmp, v3674, 5);
+    int16x8_t v3676 = vaddq_s16(v3673, v3675);
+    int16x8_t v3677 = vsubq_s16(v2221, v2244);
+    int16x8_t v3678 = vsubq_s16(v2267, v2290);
+    int16x8_t v3679_tmp = vqrdmulhq_n_s16(v3678, 27122);
+    int16x8_t v3679 = vmlaq_n_s16(v3679_tmp, v3678, 5);
+    int16x8_t v3680 = vaddq_s16(v3677, v3679);
+    int16x8_t v3681 = vqrdmulhq_n_s16(v3680, 22236);
+    int16x8_t v3682 = vaddq_s16(v3676, v3681);
+    int16x8_t v3683 = vsubq_s16(v1937, v1960);
+    int16x8_t v3684 = vsubq_s16(v1983, v2006);
+    int16x8_t v3685_tmp = vqrdmulhq_n_s16(v3684, 5041);
+    int16x8_t v3685 = vmlaq_n_s16(v3685_tmp, v3684, 8);
+    int16x8_t v3686 = vaddq_s16(v3683, v3685);
+    int16x8_t v3687 = vsubq_s16(v2031, v2054);
+    int16x8_t v3688 = vsubq_s16(v2077, v2100);
+    int16x8_t v3689_tmp = vqrdmulhq_n_s16(v3688, 5041);
+    int16x8_t v3689 = vmlaq_n_s16(v3689_tmp, v3688, 8);
+    int16x8_t v3690 = vaddq_s16(v3687, v3689);
+    int16x8_t v3691 = vqrdmulhq_n_s16(v3690, 22491);
+    int16x8_t v3692 = vaddq_s16(v3686, v3691);
+    int16x8_t v3693 = vsubq_s16(v1579, v1626);
+    int16x8_t v3694 = vsubq_s16(v1673, v1720);
+    int16x8_t v3695_tmp = vqrdmulhq_n_s16(v3694, 19146);
+    int16x8_t v3695 = vmlaq_n_s16(v3695_tmp, v3694, 13);
+    int16x8_t v3696 = vaddq_s16(v3693, v3695);
+    int16x8_t v3697 = vsubq_s16(v1769, v1816);
+    int16x8_t v3698 = vsubq_s16(v1863, v1910);
+    int16x8_t v3699_tmp = vqrdmulhq_n_s16(v3698, 19146);
+    int16x8_t v3699 = vmlaq_n_s16(v3699_tmp, v3698, 13);
+    int16x8_t v3700 = vaddq_s16(v3697, v3699);
+    int16x8_t v3701 = vqrdmulhq_n_s16(v3700, 22755);
+    int16x8_t v3702 = vaddq_s16(v3696, v3701);
+    int16x8_t v3703 = vsubq_s16(v141, v316);
+    int16x8_t v3704 = vsubq_s16(v522, v698);
+    int16x8_t v3705_tmp = vqrdmulhq_n_s16(v3704, 24402);
+    int16x8_t v3705 = vmlaq_n_s16(v3705_tmp, v3704, 40);
+    int16x8_t v3706 = vaddq_s16(v3703, v3705);
+    int16x8_t v3707 = vsubq_s16(v906, v1145);
+    int16x8_t v3708 = vsubq_s16(v1351, v1528);
+    int16x8_t v3709_tmp = vqrdmulhq_n_s16(v3708, 24402);
+    int16x8_t v3709 = vmlaq_n_s16(v3709_tmp, v3708, 40);
+    int16x8_t v3710 = vaddq_s16(v3707, v3709);
+    int16x8_t v3711 = vqrdmulhq_n_s16(v3710, 23030);
+    int16x8_t v3712 = vaddq_s16(v3706, v3711);
+    int16x8_t v3713 = vsubq_s16(v3703, v3705);
+    int16x8_t v3714 = vsubq_s16(v3707, v3709);
+    int16x8_t v3715 = vqrdmulhq_n_s16(v3714, 23314);
+    int16x8_t v3716 = vaddq_s16(v3713, v3715);
+    int16x8_t v3717 = vsubq_s16(v3693, v3695);
+    int16x8_t v3718 = vsubq_s16(v3697, v3699);
+    int16x8_t v3719 = vqrdmulhq_n_s16(v3718, 23609);
+    int16x8_t v3720 = vaddq_s16(v3717, v3719);
+    int16x8_t v3721 = vsubq_s16(v3683, v3685);
+    int16x8_t v3722 = vsubq_s16(v3687, v3689);
+    int16x8_t v3723 = vqrdmulhq_n_s16(v3722, 23915);
+    int16x8_t v3724 = vaddq_s16(v3721, v3723);
+    int16x8_t v3725 = vsubq_s16(v3673, v3675);
+    int16x8_t v3726 = vsubq_s16(v3677, v3679);
+    int16x8_t v3727 = vqrdmulhq_n_s16(v3726, 24233);
+    int16x8_t v3728 = vaddq_s16(v3725, v3727);
+    int16x8_t v3729 = vsubq_s16(v3663, v3665);
+    int16x8_t v3730 = vsubq_s16(v3667, v3669);
+    int16x8_t v3731 = vqrdmulhq_n_s16(v3730, 24564);
+    int16x8_t v3732 = vaddq_s16(v3729, v3731);
+    int16x8_t v3733 = vsubq_s16(v3653, v3655);
+    int16x8_t v3734 = vsubq_s16(v3657, v3659);
+    int16x8_t v3735 = vqrdmulhq_n_s16(v3734, 24907);
+    int16x8_t v3736 = vaddq_s16(v3733, v3735);
+    int16x8_t v3737 = vsubq_s16(v3643, v3645);
+    int16x8_t v3738 = vsubq_s16(v3647, v3649);
+    int16x8_t v3739 = vqrdmulhq_n_s16(v3738, 25264);
+    int16x8_t v3740 = vaddq_s16(v3737, v3739);
+    int16x8_t v3741 = vsubq_s16(v3633, v3635);
+    int16x8_t v3742 = vsubq_s16(v3637, v3639);
+    int16x8_t v3743 = vqrdmulhq_n_s16(v3742, 25635);
+    int16x8_t v3744 = vaddq_s16(v3741, v3743);
+    int16x8_t v3745 = vsubq_s16(v3623, v3625);
+    int16x8_t v3746 = vsubq_s16(v3627, v3629);
+    int16x8_t v3747 = vqrdmulhq_n_s16(v3746, 26021);
+    int16x8_t v3748 = vaddq_s16(v3745, v3747);
+    int16x8_t v3749 = vsubq_s16(v3613, v3615);
+    int16x8_t v3750 = vsubq_s16(v3617, v3619);
+    int16x8_t v3751 = vqrdmulhq_n_s16(v3750, 26423);
+    int16x8_t v3752 = vaddq_s16(v3749, v3751);
+    int16x8_t v3753 = vsubq_s16(v3603, v3605);
+    int16x8_t v3754 = vsubq_s16(v3607, v3609);
+    int16x8_t v3755 = vqrdmulhq_n_s16(v3754, 26842);
+    int16x8_t v3756 = vaddq_s16(v3753, v3755);
+    int16x8_t v3757 = vsubq_s16(v3593, v3595);
+    int16x8_t v3758 = vsubq_s16(v3597, v3599);
+    int16x8_t v3759 = vqrdmulhq_n_s16(v3758, 27279);
+    int16x8_t v3760 = vaddq_s16(v3757, v3759);
+    int16x8_t v3761 = vsubq_s16(v3583, v3585);
+    int16x8_t v3762 = vsubq_s16(v3587, v3589);
+    int16x8_t v3763 = vqrdmulhq_n_s16(v3762, 27734);
+    int16x8_t v3764 = vaddq_s16(v3761, v3763);
+    int16x8_t v3765 = vsubq_s16(v3573, v3575);
+    int16x8_t v3766 = vsubq_s16(v3577, v3579);
+    int16x8_t v3767 = vqrdmulhq_n_s16(v3766, 28209);
+    int16x8_t v3768 = vaddq_s16(v3765, v3767);
+    int16x8_t v3769 = vsubq_s16(v3563, v3565);
+    int16x8_t v3770 = vsubq_s16(v3567, v3569);
+    int16x8_t v3771 = vqrdmulhq_n_s16(v3770, 28705);
+    int16x8_t v3772 = vaddq_s16(v3769, v3771);
+    int16x8_t v3773 = vsubq_s16(v3553, v3555);
+    int16x8_t v3774 = vsubq_s16(v3557, v3559);
+    int16x8_t v3775 = vqrdmulhq_n_s16(v3774, 29223);
+    int16x8_t v3776 = vaddq_s16(v3773, v3775);
+    int16x8_t v3777 = vsubq_s16(v3543, v3545);
+    int16x8_t v3778 = vsubq_s16(v3547, v3549);
+    int16x8_t v3779 = vqrdmulhq_n_s16(v3778, 29764);
+    int16x8_t v3780 = vaddq_s16(v3777, v3779);
+    int16x8_t v3781 = vsubq_s16(v3533, v3535);
+    int16x8_t v3782 = vsubq_s16(v3537, v3539);
+    int16x8_t v3783 = vqrdmulhq_n_s16(v3782, 30331);
+    int16x8_t v3784 = vaddq_s16(v3781, v3783);
+    int16x8_t v3785 = vsubq_s16(v3523, v3525);
+    int16x8_t v3786 = vsubq_s16(v3527, v3529);
+    int16x8_t v3787 = vqrdmulhq_n_s16(v3786, 30925);
+    int16x8_t v3788 = vaddq_s16(v3785, v3787);
+    int16x8_t v3789 = vsubq_s16(v3513, v3515);
+    int16x8_t v3790 = vsubq_s16(v3517, v3519);
+    int16x8_t v3791 = vqrdmulhq_n_s16(v3790, 31547);
+    int16x8_t v3792 = vaddq_s16(v3789, v3791);
+    int16x8_t v3793 = vsubq_s16(v3503, v3505);
+    int16x8_t v3794 = vsubq_s16(v3507, v3509);
+    int16x8_t v3795 = vqrdmulhq_n_s16(v3794, 32199);
+    int16x8_t v3796 = vaddq_s16(v3793, v3795);
+    int16x8_t v3797 = vsubq_s16(v3493, v3495);
+    int16x8_t v3798 = vsubq_s16(v3497, v3499);
+    int16x8_t v3799_tmp = vqrdmulhq_n_s16(v3798, 117);
+    int16x8_t v3799 = vaddq_s16(v3799_tmp, v3798);
+    int16x8_t v3800 = vaddq_s16(v3797, v3799);
+    int16x8_t v3801 = vsubq_s16(v3483, v3485);
+    int16x8_t v3802 = vsubq_s16(v3487, v3489);
+    int16x8_t v3803_tmp = vqrdmulhq_n_s16(v3802, 837);
+    int16x8_t v3803 = vaddq_s16(v3803_tmp, v3802);
+    int16x8_t v3804 = vaddq_s16(v3801, v3803);
+    int16x8_t v3805 = vsubq_s16(v3473, v3475);
+    int16x8_t v3806 = vsubq_s16(v3477, v3479);
+    int16x8_t v3807_tmp = vqrdmulhq_n_s16(v3806, 1594);
+    int16x8_t v3807 = vaddq_s16(v3807_tmp, v3806);
+    int16x8_t v3808 = vaddq_s16(v3805, v3807);
+    int16x8_t v3809 = vsubq_s16(v3463, v3465);
+    int16x8_t v3810 = vsubq_s16(v3467, v3469);
+    int16x8_t v3811_tmp = vqrdmulhq_n_s16(v3810, 2393);
+    int16x8_t v3811 = vaddq_s16(v3811_tmp, v3810);
+    int16x8_t v3812 = vaddq_s16(v3809, v3811);
+    int16x8_t v3813 = vsubq_s16(v3453, v3455);
+    int16x8_t v3814 = vsubq_s16(v3457, v3459);
+    int16x8_t v3815_tmp = vqrdmulhq_n_s16(v3814, 3234);
+    int16x8_t v3815 = vaddq_s16(v3815_tmp, v3814);
+    int16x8_t v3816 = vaddq_s16(v3813, v3815);
+    int16x8_t v3817 = vsubq_s16(v3443, v3445);
+    int16x8_t v3818 = vsubq_s16(v3447, v3449);
+    int16x8_t v3819_tmp = vqrdmulhq_n_s16(v3818, 4123);
+    int16x8_t v3819 = vaddq_s16(v3819_tmp, v3818);
+    int16x8_t v3820 = vaddq_s16(v3817, v3819);
+    int16x8_t v3821 = vsubq_s16(v3433, v3435);
+    int16x8_t v3822 = vsubq_s16(v3437, v3439);
+    int16x8_t v3823_tmp = vqrdmulhq_n_s16(v3822, 5062);
+    int16x8_t v3823 = vaddq_s16(v3823_tmp, v3822);
+    int16x8_t v3824 = vaddq_s16(v3821, v3823);
+    int16x8_t v3825 = vsubq_s16(v3423, v3425);
+    int16x8_t v3826 = vsubq_s16(v3427, v3429);
+    int16x8_t v3827_tmp = vqrdmulhq_n_s16(v3826, 6057);
+    int16x8_t v3827 = vaddq_s16(v3827_tmp, v3826);
+    int16x8_t v3828 = vaddq_s16(v3825, v3827);
+    int16x8_t v3829 = vsubq_s16(v3413, v3415);
+    int16x8_t v3830 = vsubq_s16(v3417, v3419);
+    int16x8_t v3831_tmp = vqrdmulhq_n_s16(v3830, 7111);
+    int16x8_t v3831 = vaddq_s16(v3831_tmp, v3830);
+    int16x8_t v3832 = vaddq_s16(v3829, v3831);
+    int16x8_t v3833 = vsubq_s16(v3403, v3405);
+    int16x8_t v3834 = vsubq_s16(v3407, v3409);
+    int16x8_t v3835_tmp = vqrdmulhq_n_s16(v3834, 8231);
+    int16x8_t v3835 = vaddq_s16(v3835_tmp, v3834);
+    int16x8_t v3836 = vaddq_s16(v3833, v3835);
+    int16x8_t v3837 = vsubq_s16(v3393, v3395);
+    int16x8_t v3838 = vsubq_s16(v3397, v3399);
+    int16x8_t v3839_tmp = vqrdmulhq_n_s16(v3838, 9421);
+    int16x8_t v3839 = vaddq_s16(v3839_tmp, v3838);
+    int16x8_t v3840 = vaddq_s16(v3837, v3839);
+    int16x8_t v3841 = vsubq_s16(v3374, v3379);
+    int16x8_t v3842 = vsubq_s16(v3384, v3389);
+    int16x8_t v3843_tmp = vqrdmulhq_n_s16(v3842, 10690);
+    int16x8_t v3843 = vaddq_s16(v3843_tmp, v3842);
+    int16x8_t v3844 = vaddq_s16(v3841, v3843);
+    int16x8_t v3845 = vsubq_s16(v3352, v3357);
+    int16x8_t v3846 = vsubq_s16(v3362, v3367);
+    int16x8_t v3847_tmp = vqrdmulhq_n_s16(v3846, 12044);
+    int16x8_t v3847 = vaddq_s16(v3847_tmp, v3846);
+    int16x8_t v3848 = vaddq_s16(v3845, v3847);
+    int16x8_t v3849 = vsubq_s16(v3330, v3335);
+    int16x8_t v3850 = vsubq_s16(v3340, v3345);
+    int16x8_t v3851_tmp = vqrdmulhq_n_s16(v3850, 13493);
+    int16x8_t v3851 = vaddq_s16(v3851_tmp, v3850);
+    int16x8_t v3852 = vaddq_s16(v3849, v3851);
+    int16x8_t v3853 = vsubq_s16(v3308, v3313);
+    int16x8_t v3854 = vsubq_s16(v3318, v3323);
+    int16x8_t v3855_tmp = vqrdmulhq_n_s16(v3854, 15046);
+    int16x8_t v3855 = vaddq_s16(v3855_tmp, v3854);
+    int16x8_t v3856 = vaddq_s16(v3853, v3855);
+    int16x8_t v3857 = vsubq_s16(v3286, v3291);
+    int16x8_t v3858 = vsubq_s16(v3296, v3301);
+    int16x8_t v3859_tmp = vqrdmulhq_n_s16(v3858, 16715);
+    int16x8_t v3859 = vaddq_s16(v3859_tmp, v3858);
+    int16x8_t v3860 = vaddq_s16(v3857, v3859);
+    int16x8_t v3861 = vsubq_s16(v3264, v3269);
+    int16x8_t v3862 = vsubq_s16(v3274, v3279);
+    int16x8_t v3863_tmp = vqrdmulhq_n_s16(v3862, 18512);
+    int16x8_t v3863 = vaddq_s16(v3863_tmp, v3862);
+    int16x8_t v3864 = vaddq_s16(v3861, v3863);
+    int16x8_t v3865 = vsubq_s16(v3242, v3247);
+    int16x8_t v3866 = vsubq_s16(v3252, v3257);
+    int16x8_t v3867_tmp = vqrdmulhq_n_s16(v3866, 20453);
+    int16x8_t v3867 = vaddq_s16(v3867_tmp, v3866);
+    int16x8_t v3868 = vaddq_s16(v3865, v3867);
+    int16x8_t v3869 = vsubq_s16(v3220, v3225);
+    int16x8_t v3870 = vsubq_s16(v3230, v3235);
+    int16x8_t v3871_tmp = vqrdmulhq_n_s16(v3870, 22555);
+    int16x8_t v3871 = vaddq_s16(v3871_tmp, v3870);
+    int16x8_t v3872 = vaddq_s16(v3869, v3871);
+    int16x8_t v3873 = vsubq_s16(v3198, v3203);
+    int16x8_t v3874 = vsubq_s16(v3208, v3213);
+    int16x8_t v3875_tmp = vqrdmulhq_n_s16(v3874, 24839);
+    int16x8_t v3875 = vaddq_s16(v3875_tmp, v3874);
+    int16x8_t v3876 = vaddq_s16(v3873, v3875);
+    int16x8_t v3877 = vsubq_s16(v3176, v3181);
+    int16x8_t v3878 = vsubq_s16(v3186, v3191);
+    int16x8_t v3879_tmp = vqrdmulhq_n_s16(v3878, 27330);
+    int16x8_t v3879 = vaddq_s16(v3879_tmp, v3878);
+    int16x8_t v3880 = vaddq_s16(v3877, v3879);
+    int16x8_t v3881 = vsubq_s16(v3154, v3159);
+    int16x8_t v3882 = vsubq_s16(v3164, v3169);
+    int16x8_t v3883_tmp = vqrdmulhq_n_s16(v3882, 30056);
+    int16x8_t v3883 = vaddq_s16(v3883_tmp, v3882);
+    int16x8_t v3884 = vaddq_s16(v3881, v3883);
+    int16x8_t v3885 = vsubq_s16(v3132, v3137);
+    int16x8_t v3886 = vsubq_s16(v3142, v3147);
+    int16x8_t v3887_tmp = vqrdmulhq_n_s16(v3886, 282);
+    int16x8_t v3887 = vmlaq_n_s16(v3887_tmp, v3886, 2);
+    int16x8_t v3888 = vaddq_s16(v3885, v3887);
+    int16x8_t v3889 = vsubq_s16(v3110, v3115);
+    int16x8_t v3890 = vsubq_s16(v3120, v3125);
+    int16x8_t v3891_tmp = vqrdmulhq_n_s16(v3890, 3588);
+    int16x8_t v3891 = vmlaq_n_s16(v3891_tmp, v3890, 2);
+    int16x8_t v3892 = vaddq_s16(v3889, v3891);
+    int16x8_t v3893 = vsubq_s16(v3088, v3093);
+    int16x8_t v3894 = vsubq_s16(v3098, v3103);
+    int16x8_t v3895_tmp = vqrdmulhq_n_s16(v3894, 7255);
+    int16x8_t v3895 = vmlaq_n_s16(v3895_tmp, v3894, 2);
+    int16x8_t v3896 = vaddq_s16(v3893, v3895);
+    int16x8_t v3897 = vsubq_s16(v3066, v3071);
+    int16x8_t v3898 = vsubq_s16(v3076, v3081);
+    int16x8_t v3899_tmp = vqrdmulhq_n_s16(v3898, 11344);
+    int16x8_t v3899 = vmlaq_n_s16(v3899_tmp, v3898, 2);
+    int16x8_t v3900 = vaddq_s16(v3897, v3899);
+    int16x8_t v3901 = vsubq_s16(v3044, v3049);
+    int16x8_t v3902 = vsubq_s16(v3054, v3059);
+    int16x8_t v3903_tmp = vqrdmulhq_n_s16(v3902, 15934);
+    int16x8_t v3903 = vmlaq_n_s16(v3903_tmp, v3902, 2);
+    int16x8_t v3904 = vaddq_s16(v3901, v3903);
+    int16x8_t v3905 = vsubq_s16(v3004, v3015);
+    int16x8_t v3906 = vsubq_s16(v3026, v3037);
+    int16x8_t v3907_tmp = vqrdmulhq_n_s16(v3906, 21120);
+    int16x8_t v3907 = vmlaq_n_s16(v3907_tmp, v3906, 2);
+    int16x8_t v3908 = vaddq_s16(v3905, v3907);
+    int16x8_t v3909 = vsubq_s16(v2958, v2969);
+    int16x8_t v3910 = vsubq_s16(v2980, v2991);
+    int16x8_t v3911_tmp = vqrdmulhq_n_s16(v3910, 27027);
+    int16x8_t v3911 = vmlaq_n_s16(v3911_tmp, v3910, 2);
+    int16x8_t v3912 = vaddq_s16(v3909, v3911);
+    int16x8_t v3913 = vsubq_s16(v2912, v2923);
+    int16x8_t v3914 = vsubq_s16(v2934, v2945);
+    int16x8_t v3915_tmp = vqrdmulhq_n_s16(v3914, 1045);
+    int16x8_t v3915 = vmlaq_n_s16(v3915_tmp, v3914, 3);
+    int16x8_t v3916 = vaddq_s16(v3913, v3915);
+    int16x8_t v3917 = vsubq_s16(v2866, v2877);
+    int16x8_t v3918 = vsubq_s16(v2888, v2899);
+    int16x8_t v3919_tmp = vqrdmulhq_n_s16(v3918, 8923);
+    int16x8_t v3919 = vmlaq_n_s16(v3919_tmp, v3918, 3);
+    int16x8_t v3920 = vaddq_s16(v3917, v3919);
+    int16x8_t v3921 = vsubq_s16(v2820, v2831);
+    int16x8_t v3922 = vsubq_s16(v2842, v2853);
+    int16x8_t v3923_tmp = vqrdmulhq_n_s16(v3922, 18177);
+    int16x8_t v3923 = vmlaq_n_s16(v3923_tmp, v3922, 3);
+    int16x8_t v3924 = vaddq_s16(v3921, v3923);
+    int16x8_t v3925 = vsubq_s16(v2774, v2785);
+    int16x8_t v3926 = vsubq_s16(v2796, v2807);
+    int16x8_t v3927_tmp = vqrdmulhq_n_s16(v3926, 29200);
+    int16x8_t v3927 = vmlaq_n_s16(v3927_tmp, v3926, 3);
+    int16x8_t v3928 = vaddq_s16(v3925, v3927);
+    int16x8_t v3929 = vsubq_s16(v2728, v2739);
+    int16x8_t v3930 = vsubq_s16(v2750, v2761);
+    int16x8_t v3931_tmp = vqrdmulhq_n_s16(v3930, 9782);
+    int16x8_t v3931 = vmlaq_n_s16(v3931_tmp, v3930, 4);
+    int16x8_t v3932 = vaddq_s16(v3929, v3931);
+    int16x8_t v3933 = vsubq_s16(v2682, v2693);
+    int16x8_t v3934 = vsubq_s16(v2704, v2715);
+    int16x8_t v3935_tmp = vqrdmulhq_n_s16(v3934, 26282);
+    int16x8_t v3935 = vmlaq_n_s16(v3935_tmp, v3934, 4);
+    int16x8_t v3936 = vaddq_s16(v3933, v3935);
+    int16x8_t v3937 = vsubq_s16(v2600, v2623);
+    int16x8_t v3938 = vsubq_s16(v2646, v2669);
+    int16x8_t v3939_tmp = vqrdmulhq_n_s16(v3938, 14423);
+    int16x8_t v3939 = vmlaq_n_s16(v3939_tmp, v3938, 5);
+    int16x8_t v3940 = vaddq_s16(v3937, v3939);
+    int16x8_t v3941 = vsubq_s16(v2506, v2529);
+    int16x8_t v3942 = vsubq_s16(v2552, v2575);
+    int16x8_t v3943_tmp = vqrdmulhq_n_s16(v3942, 9008);
+    int16x8_t v3943 = vmlaq_n_s16(v3943_tmp, v3942, 6);
+    int16x8_t v3944 = vaddq_s16(v3941, v3943);
+    int16x8_t v3945 = vsubq_s16(v2411, v2434);
+    int16x8_t v3946 = vsubq_s16(v2457, v2481);
+    int16x8_t v3947_tmp = vqrdmulhq_n_s16(v3946, 13552);
+    int16x8_t v3947 = vmlaq_n_s16(v3947_tmp, v3946, 7);
+    int16x8_t v3948 = vaddq_s16(v3945, v3947);
+    int16x8_t v3949 = vsubq_s16(v2317, v2340);
+    int16x8_t v3950 = vsubq_s16(v2363, v2386);
+    int16x8_t v3951_tmp = vqrdmulhq_n_s16(v3950, 1925);
+    int16x8_t v3951 = vmlaq_n_s16(v3951_tmp, v3950, 9);
+    int16x8_t v3952 = vaddq_s16(v3949, v3951);
+    int16x8_t v3953 = vsubq_s16(v2151, v2198);
+    int16x8_t v3954 = vsubq_s16(v2245, v2292);
+    int16x8_t v3955_tmp = vqrdmulhq_n_s16(v3954, 21123);
+    int16x8_t v3955 = vmlaq_n_s16(v3955_tmp, v3954, 11);
+    int16x8_t v3956 = vaddq_s16(v3953, v3955);
+    int16x8_t v3957 = vsubq_s16(v1961, v2008);
+    int16x8_t v3958 = vsubq_s16(v2055, v2102);
+    int16x8_t v3959_tmp = vqrdmulhq_n_s16(v3958, 9831);
+    int16x8_t v3959 = vmlaq_n_s16(v3959_tmp, v3958, 16);
+    int16x8_t v3960 = vaddq_s16(v3957, v3959);
+    int16x8_t v3961 = vsubq_s16(v1627, v1722);
+    int16x8_t v3962 = vsubq_s16(v1817, v1912);
+    int16x8_t v3963_tmp = vqrdmulhq_n_s16(v3962, 5373);
+    int16x8_t v3963 = vmlaq_n_s16(v3963_tmp, v3962, 27);
+    int16x8_t v3964 = vaddq_s16(v3961, v3963);
+    int16x8_t v3965 = vsubq_s16(v317, v700);
+    int16x8_t v3966 = vsubq_s16(v1146, v1530);
+    int16x8_t v3967_tmp = vqrdmulhq_n_s16(v3966, 15986);
+    int16x8_t v3967 = vmlaq_n_s16(v3967_tmp, v3966, 81);
+    int16x8_t v3968 = vaddq_s16(v3965, v3967);
+    int16x8_t v3969 = vsubq_s16(v3965, v3967);
+    int16x8_t v3970 = vsubq_s16(v3961, v3963);
+    int16x8_t v3971 = vsubq_s16(v3957, v3959);
+    int16x8_t v3972 = vsubq_s16(v3953, v3955);
+    int16x8_t v3973 = vsubq_s16(v3949, v3951);
+    int16x8_t v3974 = vsubq_s16(v3945, v3947);
+    int16x8_t v3975 = vsubq_s16(v3941, v3943);
+    int16x8_t v3976 = vsubq_s16(v3937, v3939);
+    int16x8_t v3977 = vsubq_s16(v3933, v3935);
+    int16x8_t v3978 = vsubq_s16(v3929, v3931);
+    int16x8_t v3979 = vsubq_s16(v3925, v3927);
+    int16x8_t v3980 = vsubq_s16(v3921, v3923);
+    int16x8_t v3981 = vsubq_s16(v3917, v3919);
+    int16x8_t v3982 = vsubq_s16(v3913, v3915);
+    int16x8_t v3983 = vsubq_s16(v3909, v3911);
+    int16x8_t v3984 = vsubq_s16(v3905, v3907);
+    int16x8_t v3985 = vsubq_s16(v3901, v3903);
+    int16x8_t v3986 = vsubq_s16(v3897, v3899);
+    int16x8_t v3987 = vsubq_s16(v3893, v3895);
+    int16x8_t v3988 = vsubq_s16(v3889, v3891);
+    int16x8_t v3989 = vsubq_s16(v3885, v3887);
+    int16x8_t v3990 = vsubq_s16(v3881, v3883);
+    int16x8_t v3991 = vsubq_s16(v3877, v3879);
+    int16x8_t v3992 = vsubq_s16(v3873, v3875);
+    int16x8_t v3993 = vsubq_s16(v3869, v3871);
+    int16x8_t v3994 = vsubq_s16(v3865, v3867);
+    int16x8_t v3995 = vsubq_s16(v3861, v3863);
+    int16x8_t v3996 = vsubq_s16(v3857, v3859);
+    int16x8_t v3997 = vsubq_s16(v3853, v3855);
+    int16x8_t v3998 = vsubq_s16(v3849, v3851);
+    int16x8_t v3999 = vsubq_s16(v3845, v3847);
+    int16x8_t v4000 = vsubq_s16(v3841, v3843);
+    int16x8_t v4001 = vsubq_s16(v3837, v3839);
+    int16x8_t v4002 = vsubq_s16(v3833, v3835);
+    int16x8_t v4003 = vsubq_s16(v3829, v3831);
+    int16x8_t v4004 = vsubq_s16(v3825, v3827);
+    int16x8_t v4005 = vsubq_s16(v3821, v3823);
+    int16x8_t v4006 = vsubq_s16(v3817, v3819);
+    int16x8_t v4007 = vsubq_s16(v3813, v3815);
+    int16x8_t v4008 = vsubq_s16(v3809, v3811);
+    int16x8_t v4009 = vsubq_s16(v3805, v3807);
+    int16x8_t v4010 = vsubq_s16(v3801, v3803);
+    int16x8_t v4011 = vsubq_s16(v3797, v3799);
+    int16x8_t v4012 = vsubq_s16(v3793, v3795);
+    int16x8_t v4013 = vsubq_s16(v3789, v3791);
+    int16x8_t v4014 = vsubq_s16(v3785, v3787);
+    int16x8_t v4015 = vsubq_s16(v3781, v3783);
+    int16x8_t v4016 = vsubq_s16(v3777, v3779);
+    int16x8_t v4017 = vsubq_s16(v3773, v3775);
+    int16x8_t v4018 = vsubq_s16(v3769, v3771);
+    int16x8_t v4019 = vsubq_s16(v3765, v3767);
+    int16x8_t v4020 = vsubq_s16(v3761, v3763);
+    int16x8_t v4021 = vsubq_s16(v3757, v3759);
+    int16x8_t v4022 = vsubq_s16(v3753, v3755);
+    int16x8_t v4023 = vsubq_s16(v3749, v3751);
+    int16x8_t v4024 = vsubq_s16(v3745, v3747);
+    int16x8_t v4025 = vsubq_s16(v3741, v3743);
+    int16x8_t v4026 = vsubq_s16(v3737, v3739);
+    int16x8_t v4027 = vsubq_s16(v3733, v3735);
+    int16x8_t v4028 = vsubq_s16(v3729, v3731);
+    int16x8_t v4029 = vsubq_s16(v3725, v3727);
+    int16x8_t v4030 = vsubq_s16(v3721, v3723);
+    int16x8_t v4031 = vsubq_s16(v3717, v3719);
+    int16x8_t v4032 = vsubq_s16(v3713, v3715);
+    int16x8_t v4033 = vsubq_s16(v3706, v3711);
+    int16x8_t v4034 = vsubq_s16(v3696, v3701);
+    int16x8_t v4035 = vsubq_s16(v3686, v3691);
+    int16x8_t v4036 = vsubq_s16(v3676, v3681);
+    int16x8_t v4037 = vsubq_s16(v3666, v3671);
+    int16x8_t v4038 = vsubq_s16(v3656, v3661);
+    int16x8_t v4039 = vsubq_s16(v3646, v3651);
+    int16x8_t v4040 = vsubq_s16(v3636, v3641);
+    int16x8_t v4041 = vsubq_s16(v3626, v3631);
+    int16x8_t v4042 = vsubq_s16(v3616, v3621);
+    int16x8_t v4043 = vsubq_s16(v3606, v3611);
+    int16x8_t v4044 = vsubq_s16(v3596, v3601);
+    int16x8_t v4045 = vsubq_s16(v3586, v3591);
+    int16x8_t v4046 = vsubq_s16(v3576, v3581);
+    int16x8_t v4047 = vsubq_s16(v3566, v3571);
+    int16x8_t v4048 = vsubq_s16(v3556, v3561);
+    int16x8_t v4049 = vsubq_s16(v3546, v3551);
+    int16x8_t v4050 = vsubq_s16(v3536, v3541);
+    int16x8_t v4051 = vsubq_s16(v3526, v3531);
+    int16x8_t v4052 = vsubq_s16(v3516, v3521);
+    int16x8_t v4053 = vsubq_s16(v3506, v3511);
+    int16x8_t v4054 = vsubq_s16(v3496, v3501);
+    int16x8_t v4055 = vsubq_s16(v3486, v3491);
+    int16x8_t v4056 = vsubq_s16(v3476, v3481);
+    int16x8_t v4057 = vsubq_s16(v3466, v3471);
+    int16x8_t v4058 = vsubq_s16(v3456, v3461);
+    int16x8_t v4059 = vsubq_s16(v3446, v3451);
+    int16x8_t v4060 = vsubq_s16(v3436, v3441);
+    int16x8_t v4061 = vsubq_s16(v3426, v3431);
+    int16x8_t v4062 = vsubq_s16(v3416, v3421);
+    int16x8_t v4063 = vsubq_s16(v3406, v3411);
+    int16x8_t v4064 = vsubq_s16(v3396, v3401);
+    int16x8_t v4065 = vsubq_s16(v3380, v3391);
+    int16x8_t v4066 = vsubq_s16(v3358, v3369);
+    int16x8_t v4067 = vsubq_s16(v3336, v3347);
+    int16x8_t v4068 = vsubq_s16(v3314, v3325);
+    int16x8_t v4069 = vsubq_s16(v3292, v3303);
+    int16x8_t v4070 = vsubq_s16(v3270, v3281);
+    int16x8_t v4071 = vsubq_s16(v3248, v3259);
+    int16x8_t v4072 = vsubq_s16(v3226, v3237);
+    int16x8_t v4073 = vsubq_s16(v3204, v3215);
+    int16x8_t v4074 = vsubq_s16(v3182, v3193);
+    int16x8_t v4075 = vsubq_s16(v3160, v3171);
+    int16x8_t v4076 = vsubq_s16(v3138, v3149);
+    int16x8_t v4077 = vsubq_s16(v3116, v3127);
+    int16x8_t v4078 = vsubq_s16(v3094, v3105);
+    int16x8_t v4079 = vsubq_s16(v3072, v3083);
+    int16x8_t v4080 = vsubq_s16(v3050, v3061);
+    int16x8_t v4081 = vsubq_s16(v3016, v3039);
+    int16x8_t v4082 = vsubq_s16(v2970, v2993);
+    int16x8_t v4083 = vsubq_s16(v2924, v2947);
+    int16x8_t v4084 = vsubq_s16(v2878, v2901);
+    int16x8_t v4085 = vsubq_s16(v2832, v2855);
+    int16x8_t v4086 = vsubq_s16(v2786, v2809);
+    int16x8_t v4087 = vsubq_s16(v2740, v2763);
+    int16x8_t v4088 = vsubq_s16(v2694, v2717);
+    int16x8_t v4089 = vsubq_s16(v2624, v2671);
+    int16x8_t v4090 = vsubq_s16(v2530, v2577);
+    int16x8_t v4091 = vsubq_s16(v2435, v2483);
+    int16x8_t v4092 = vsubq_s16(v2341, v2388);
+    int16x8_t v4093 = vsubq_s16(v2199, v2294);
+    int16x8_t v4094 = vsubq_s16(v2009, v2104);
+    int16x8_t v4095 = vsubq_s16(v1723, v1914);
+    int16x8_t v4096 = vsubq_s16(v701, v1532);
+    vst1q_s16(out + out_stride * 0 + i, v1533);
+    vst1q_s16(out + out_stride * 1 + i, v1915);
+    vst1q_s16(out + out_stride * 2 + i, v2105);
+    vst1q_s16(out + out_stride * 3 + i, v2295);
+    vst1q_s16(out + out_stride * 4 + i, v2389);
+    vst1q_s16(out + out_stride * 5 + i, v2484);
+    vst1q_s16(out + out_stride * 6 + i, v2578);
+    vst1q_s16(out + out_stride * 7 + i, v2672);
+    vst1q_s16(out + out_stride * 8 + i, v2718);
+    vst1q_s16(out + out_stride * 9 + i, v2764);
+    vst1q_s16(out + out_stride * 10 + i, v2810);
+    vst1q_s16(out + out_stride * 11 + i, v2856);
+    vst1q_s16(out + out_stride * 12 + i, v2902);
+    vst1q_s16(out + out_stride * 13 + i, v2948);
+    vst1q_s16(out + out_stride * 14 + i, v2994);
+    vst1q_s16(out + out_stride * 15 + i, v3040);
+    vst1q_s16(out + out_stride * 16 + i, v3062);
+    vst1q_s16(out + out_stride * 17 + i, v3084);
+    vst1q_s16(out + out_stride * 18 + i, v3106);
+    vst1q_s16(out + out_stride * 19 + i, v3128);
+    vst1q_s16(out + out_stride * 20 + i, v3150);
+    vst1q_s16(out + out_stride * 21 + i, v3172);
+    vst1q_s16(out + out_stride * 22 + i, v3194);
+    vst1q_s16(out + out_stride * 23 + i, v3216);
+    vst1q_s16(out + out_stride * 24 + i, v3238);
+    vst1q_s16(out + out_stride * 25 + i, v3260);
+    vst1q_s16(out + out_stride * 26 + i, v3282);
+    vst1q_s16(out + out_stride * 27 + i, v3304);
+    vst1q_s16(out + out_stride * 28 + i, v3326);
+    vst1q_s16(out + out_stride * 29 + i, v3348);
+    vst1q_s16(out + out_stride * 30 + i, v3370);
+    vst1q_s16(out + out_stride * 31 + i, v3392);
+    vst1q_s16(out + out_stride * 32 + i, v3402);
+    vst1q_s16(out + out_stride * 33 + i, v3412);
+    vst1q_s16(out + out_stride * 34 + i, v3422);
+    vst1q_s16(out + out_stride * 35 + i, v3432);
+    vst1q_s16(out + out_stride * 36 + i, v3442);
+    vst1q_s16(out + out_stride * 37 + i, v3452);
+    vst1q_s16(out + out_stride * 38 + i, v3462);
+    vst1q_s16(out + out_stride * 39 + i, v3472);
+    vst1q_s16(out + out_stride * 40 + i, v3482);
+    vst1q_s16(out + out_stride * 41 + i, v3492);
+    vst1q_s16(out + out_stride * 42 + i, v3502);
+    vst1q_s16(out + out_stride * 43 + i, v3512);
+    vst1q_s16(out + out_stride * 44 + i, v3522);
+    vst1q_s16(out + out_stride * 45 + i, v3532);
+    vst1q_s16(out + out_stride * 46 + i, v3542);
+    vst1q_s16(out + out_stride * 47 + i, v3552);
+    vst1q_s16(out + out_stride * 48 + i, v3562);
+    vst1q_s16(out + out_stride * 49 + i, v3572);
+    vst1q_s16(out + out_stride * 50 + i, v3582);
+    vst1q_s16(out + out_stride * 51 + i, v3592);
+    vst1q_s16(out + out_stride * 52 + i, v3602);
+    vst1q_s16(out + out_stride * 53 + i, v3612);
+    vst1q_s16(out + out_stride * 54 + i, v3622);
+    vst1q_s16(out + out_stride * 55 + i, v3632);
+    vst1q_s16(out + out_stride * 56 + i, v3642);
+    vst1q_s16(out + out_stride * 57 + i, v3652);
+    vst1q_s16(out + out_stride * 58 + i, v3662);
+    vst1q_s16(out + out_stride * 59 + i, v3672);
+    vst1q_s16(out + out_stride * 60 + i, v3682);
+    vst1q_s16(out + out_stride * 61 + i, v3692);
+    vst1q_s16(out + out_stride * 62 + i, v3702);
+    vst1q_s16(out + out_stride * 63 + i, v3712);
+    vst1q_s16(out + out_stride * 64 + i, v3716);
+    vst1q_s16(out + out_stride * 65 + i, v3720);
+    vst1q_s16(out + out_stride * 66 + i, v3724);
+    vst1q_s16(out + out_stride * 67 + i, v3728);
+    vst1q_s16(out + out_stride * 68 + i, v3732);
+    vst1q_s16(out + out_stride * 69 + i, v3736);
+    vst1q_s16(out + out_stride * 70 + i, v3740);
+    vst1q_s16(out + out_stride * 71 + i, v3744);
+    vst1q_s16(out + out_stride * 72 + i, v3748);
+    vst1q_s16(out + out_stride * 73 + i, v3752);
+    vst1q_s16(out + out_stride * 74 + i, v3756);
+    vst1q_s16(out + out_stride * 75 + i, v3760);
+    vst1q_s16(out + out_stride * 76 + i, v3764);
+    vst1q_s16(out + out_stride * 77 + i, v3768);
+    vst1q_s16(out + out_stride * 78 + i, v3772);
+    vst1q_s16(out + out_stride * 79 + i, v3776);
+    vst1q_s16(out + out_stride * 80 + i, v3780);
+    vst1q_s16(out + out_stride * 81 + i, v3784);
+    vst1q_s16(out + out_stride * 82 + i, v3788);
+    vst1q_s16(out + out_stride * 83 + i, v3792);
+    vst1q_s16(out + out_stride * 84 + i, v3796);
+    vst1q_s16(out + out_stride * 85 + i, v3800);
+    vst1q_s16(out + out_stride * 86 + i, v3804);
+    vst1q_s16(out + out_stride * 87 + i, v3808);
+    vst1q_s16(out + out_stride * 88 + i, v3812);
+    vst1q_s16(out + out_stride * 89 + i, v3816);
+    vst1q_s16(out + out_stride * 90 + i, v3820);
+    vst1q_s16(out + out_stride * 91 + i, v3824);
+    vst1q_s16(out + out_stride * 92 + i, v3828);
+    vst1q_s16(out + out_stride * 93 + i, v3832);
+    vst1q_s16(out + out_stride * 94 + i, v3836);
+    vst1q_s16(out + out_stride * 95 + i, v3840);
+    vst1q_s16(out + out_stride * 96 + i, v3844);
+    vst1q_s16(out + out_stride * 97 + i, v3848);
+    vst1q_s16(out + out_stride * 98 + i, v3852);
+    vst1q_s16(out + out_stride * 99 + i, v3856);
+    vst1q_s16(out + out_stride * 100 + i, v3860);
+    vst1q_s16(out + out_stride * 101 + i, v3864);
+    vst1q_s16(out + out_stride * 102 + i, v3868);
+    vst1q_s16(out + out_stride * 103 + i, v3872);
+    vst1q_s16(out + out_stride * 104 + i, v3876);
+    vst1q_s16(out + out_stride * 105 + i, v3880);
+    vst1q_s16(out + out_stride * 106 + i, v3884);
+    vst1q_s16(out + out_stride * 107 + i, v3888);
+    vst1q_s16(out + out_stride * 108 + i, v3892);
+    vst1q_s16(out + out_stride * 109 + i, v3896);
+    vst1q_s16(out + out_stride * 110 + i, v3900);
+    vst1q_s16(out + out_stride * 111 + i, v3904);
+    vst1q_s16(out + out_stride * 112 + i, v3908);
+    vst1q_s16(out + out_stride * 113 + i, v3912);
+    vst1q_s16(out + out_stride * 114 + i, v3916);
+    vst1q_s16(out + out_stride * 115 + i, v3920);
+    vst1q_s16(out + out_stride * 116 + i, v3924);
+    vst1q_s16(out + out_stride * 117 + i, v3928);
+    vst1q_s16(out + out_stride * 118 + i, v3932);
+    vst1q_s16(out + out_stride * 119 + i, v3936);
+    vst1q_s16(out + out_stride * 120 + i, v3940);
+    vst1q_s16(out + out_stride * 121 + i, v3944);
+    vst1q_s16(out + out_stride * 122 + i, v3948);
+    vst1q_s16(out + out_stride * 123 + i, v3952);
+    vst1q_s16(out + out_stride * 124 + i, v3956);
+    vst1q_s16(out + out_stride * 125 + i, v3960);
+    vst1q_s16(out + out_stride * 126 + i, v3964);
+    vst1q_s16(out + out_stride * 127 + i, v3968);
+    vst1q_s16(out + out_stride * 128 + i, v3969);
+    vst1q_s16(out + out_stride * 129 + i, v3970);
+    vst1q_s16(out + out_stride * 130 + i, v3971);
+    vst1q_s16(out + out_stride * 131 + i, v3972);
+    vst1q_s16(out + out_stride * 132 + i, v3973);
+    vst1q_s16(out + out_stride * 133 + i, v3974);
+    vst1q_s16(out + out_stride * 134 + i, v3975);
+    vst1q_s16(out + out_stride * 135 + i, v3976);
+    vst1q_s16(out + out_stride * 136 + i, v3977);
+    vst1q_s16(out + out_stride * 137 + i, v3978);
+    vst1q_s16(out + out_stride * 138 + i, v3979);
+    vst1q_s16(out + out_stride * 139 + i, v3980);
+    vst1q_s16(out + out_stride * 140 + i, v3981);
+    vst1q_s16(out + out_stride * 141 + i, v3982);
+    vst1q_s16(out + out_stride * 142 + i, v3983);
+    vst1q_s16(out + out_stride * 143 + i, v3984);
+    vst1q_s16(out + out_stride * 144 + i, v3985);
+    vst1q_s16(out + out_stride * 145 + i, v3986);
+    vst1q_s16(out + out_stride * 146 + i, v3987);
+    vst1q_s16(out + out_stride * 147 + i, v3988);
+    vst1q_s16(out + out_stride * 148 + i, v3989);
+    vst1q_s16(out + out_stride * 149 + i, v3990);
+    vst1q_s16(out + out_stride * 150 + i, v3991);
+    vst1q_s16(out + out_stride * 151 + i, v3992);
+    vst1q_s16(out + out_stride * 152 + i, v3993);
+    vst1q_s16(out + out_stride * 153 + i, v3994);
+    vst1q_s16(out + out_stride * 154 + i, v3995);
+    vst1q_s16(out + out_stride * 155 + i, v3996);
+    vst1q_s16(out + out_stride * 156 + i, v3997);
+    vst1q_s16(out + out_stride * 157 + i, v3998);
+    vst1q_s16(out + out_stride * 158 + i, v3999);
+    vst1q_s16(out + out_stride * 159 + i, v4000);
+    vst1q_s16(out + out_stride * 160 + i, v4001);
+    vst1q_s16(out + out_stride * 161 + i, v4002);
+    vst1q_s16(out + out_stride * 162 + i, v4003);
+    vst1q_s16(out + out_stride * 163 + i, v4004);
+    vst1q_s16(out + out_stride * 164 + i, v4005);
+    vst1q_s16(out + out_stride * 165 + i, v4006);
+    vst1q_s16(out + out_stride * 166 + i, v4007);
+    vst1q_s16(out + out_stride * 167 + i, v4008);
+    vst1q_s16(out + out_stride * 168 + i, v4009);
+    vst1q_s16(out + out_stride * 169 + i, v4010);
+    vst1q_s16(out + out_stride * 170 + i, v4011);
+    vst1q_s16(out + out_stride * 171 + i, v4012);
+    vst1q_s16(out + out_stride * 172 + i, v4013);
+    vst1q_s16(out + out_stride * 173 + i, v4014);
+    vst1q_s16(out + out_stride * 174 + i, v4015);
+    vst1q_s16(out + out_stride * 175 + i, v4016);
+    vst1q_s16(out + out_stride * 176 + i, v4017);
+    vst1q_s16(out + out_stride * 177 + i, v4018);
+    vst1q_s16(out + out_stride * 178 + i, v4019);
+    vst1q_s16(out + out_stride * 179 + i, v4020);
+    vst1q_s16(out + out_stride * 180 + i, v4021);
+    vst1q_s16(out + out_stride * 181 + i, v4022);
+    vst1q_s16(out + out_stride * 182 + i, v4023);
+    vst1q_s16(out + out_stride * 183 + i, v4024);
+    vst1q_s16(out + out_stride * 184 + i, v4025);
+    vst1q_s16(out + out_stride * 185 + i, v4026);
+    vst1q_s16(out + out_stride * 186 + i, v4027);
+    vst1q_s16(out + out_stride * 187 + i, v4028);
+    vst1q_s16(out + out_stride * 188 + i, v4029);
+    vst1q_s16(out + out_stride * 189 + i, v4030);
+    vst1q_s16(out + out_stride * 190 + i, v4031);
+    vst1q_s16(out + out_stride * 191 + i, v4032);
+    vst1q_s16(out + out_stride * 192 + i, v4033);
+    vst1q_s16(out + out_stride * 193 + i, v4034);
+    vst1q_s16(out + out_stride * 194 + i, v4035);
+    vst1q_s16(out + out_stride * 195 + i, v4036);
+    vst1q_s16(out + out_stride * 196 + i, v4037);
+    vst1q_s16(out + out_stride * 197 + i, v4038);
+    vst1q_s16(out + out_stride * 198 + i, v4039);
+    vst1q_s16(out + out_stride * 199 + i, v4040);
+    vst1q_s16(out + out_stride * 200 + i, v4041);
+    vst1q_s16(out + out_stride * 201 + i, v4042);
+    vst1q_s16(out + out_stride * 202 + i, v4043);
+    vst1q_s16(out + out_stride * 203 + i, v4044);
+    vst1q_s16(out + out_stride * 204 + i, v4045);
+    vst1q_s16(out + out_stride * 205 + i, v4046);
+    vst1q_s16(out + out_stride * 206 + i, v4047);
+    vst1q_s16(out + out_stride * 207 + i, v4048);
+    vst1q_s16(out + out_stride * 208 + i, v4049);
+    vst1q_s16(out + out_stride * 209 + i, v4050);
+    vst1q_s16(out + out_stride * 210 + i, v4051);
+    vst1q_s16(out + out_stride * 211 + i, v4052);
+    vst1q_s16(out + out_stride * 212 + i, v4053);
+    vst1q_s16(out + out_stride * 213 + i, v4054);
+    vst1q_s16(out + out_stride * 214 + i, v4055);
+    vst1q_s16(out + out_stride * 215 + i, v4056);
+    vst1q_s16(out + out_stride * 216 + i, v4057);
+    vst1q_s16(out + out_stride * 217 + i, v4058);
+    vst1q_s16(out + out_stride * 218 + i, v4059);
+    vst1q_s16(out + out_stride * 219 + i, v4060);
+    vst1q_s16(out + out_stride * 220 + i, v4061);
+    vst1q_s16(out + out_stride * 221 + i, v4062);
+    vst1q_s16(out + out_stride * 222 + i, v4063);
+    vst1q_s16(out + out_stride * 223 + i, v4064);
+    vst1q_s16(out + out_stride * 224 + i, v4065);
+    vst1q_s16(out + out_stride * 225 + i, v4066);
+    vst1q_s16(out + out_stride * 226 + i, v4067);
+    vst1q_s16(out + out_stride * 227 + i, v4068);
+    vst1q_s16(out + out_stride * 228 + i, v4069);
+    vst1q_s16(out + out_stride * 229 + i, v4070);
+    vst1q_s16(out + out_stride * 230 + i, v4071);
+    vst1q_s16(out + out_stride * 231 + i, v4072);
+    vst1q_s16(out + out_stride * 232 + i, v4073);
+    vst1q_s16(out + out_stride * 233 + i, v4074);
+    vst1q_s16(out + out_stride * 234 + i, v4075);
+    vst1q_s16(out + out_stride * 235 + i, v4076);
+    vst1q_s16(out + out_stride * 236 + i, v4077);
+    vst1q_s16(out + out_stride * 237 + i, v4078);
+    vst1q_s16(out + out_stride * 238 + i, v4079);
+    vst1q_s16(out + out_stride * 239 + i, v4080);
+    vst1q_s16(out + out_stride * 240 + i, v4081);
+    vst1q_s16(out + out_stride * 241 + i, v4082);
+    vst1q_s16(out + out_stride * 242 + i, v4083);
+    vst1q_s16(out + out_stride * 243 + i, v4084);
+    vst1q_s16(out + out_stride * 244 + i, v4085);
+    vst1q_s16(out + out_stride * 245 + i, v4086);
+    vst1q_s16(out + out_stride * 246 + i, v4087);
+    vst1q_s16(out + out_stride * 247 + i, v4088);
+    vst1q_s16(out + out_stride * 248 + i, v4089);
+    vst1q_s16(out + out_stride * 249 + i, v4090);
+    vst1q_s16(out + out_stride * 250 + i, v4091);
+    vst1q_s16(out + out_stride * 251 + i, v4092);
+    vst1q_s16(out + out_stride * 252 + i, v4093);
+    vst1q_s16(out + out_stride * 253 + i, v4094);
+    vst1q_s16(out + out_stride * 254 + i, v4095);
+    vst1q_s16(out + out_stride * 255 + i, v4096);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h
new file mode 100644
index 0000000000..0f3b31cfea
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h
@@ -0,0 +1,419 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<32>) { return 1; }  // Tag-selected constant (1) for the 32-row kernel; its meaning is defined by the includer (fast_dct-inl.h), not visible here.
+
+void FastIDCT(FastDCTTag<32>, const int16_t* in, size_t in_stride, int16_t* out,  // 32-row int16 NEON kernel: reads rows 0..31 of `in`, writes rows 0..31 of `out`
+              size_t out_stride, size_t count) {  // strides are in int16 elements (used in pointer arithmetic); count = number of columns
+  JXL_ASSERT(count % 8 == 0);  // every iteration below consumes exactly 8 int16 columns
+  for (size_t i = 0; i < count; i += 8) {  // i = first column of the current 8-lane group
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);  // v0..v61 are built from the even-numbered input rows only
+    int16x8_t v1 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);  // vqrdmulhq_n_s16(x, c): saturating, rounded x * c / 32768
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);  // tmp + x: effective multiplier 1 + 13573/32768 (a factor > 1 cannot be a single Q15 constant)
+    int16x8_t v5 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v17 = vaddq_s16(v16, v12);
+    int16x8_t v18 = vaddq_s16(v13, v10);
+    int16x8_t v19 = vaddq_s16(v17, v18);
+    int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
+    int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
+    int16x8_t v22 = vaddq_s16(v20, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
+    int16x8_t v41 = vaddq_s16(v35, v40);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v29, v32);
+    int16x8_t v46 = vaddq_s16(v37, v28);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
+    int16x8_t v50 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v51 = vaddq_s16(v50, v36);
+    int16x8_t v52 = vaddq_s16(v51, v46);
+    int16x8_t v53 = vqrdmulhq_n_s16(v52, 17734);
+    int16x8_t v54 = vaddq_s16(v45, v43);
+    int16x8_t v55_tmp = vqrdmulhq_n_s16(v54, 10045);
+    int16x8_t v55 = vaddq_s16(v55_tmp, v54);
+    int16x8_t v56 = vaddq_s16(v53, v55);
+    int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
+    int16x8_t v58 = vaddq_s16(v49, v57);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);  // v61: combined contribution of the 16 even input rows
+    int16x8_t v62 = vld1q_s16(in + in_stride * 13 + i);  // v62..v141 are built from the odd-numbered input rows only
+    int16x8_t v63 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v64 = vaddq_s16(v62, v63);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v66 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v67 = vaddq_s16(v65, v66);
+    int16x8_t v68 = vaddq_s16(v64, v67);
+    int16x8_t v69_tmp = vqrdmulhq_n_s16(v68, 10045);
+    int16x8_t v69 = vaddq_s16(v69_tmp, v68);
+    int16x8_t v70 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v71 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v72 = vaddq_s16(v70, v71);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v74 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v75 = vaddq_s16(v73, v74);
+    int16x8_t v76 = vaddq_s16(v72, v75);
+    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+    int16x8_t v78 = vaddq_s16(v69, v77);
+    int16x8_t v79 = vqrdmulhq_n_s16(v78, 16705);
+    int16x8_t v80_tmp = vqrdmulhq_n_s16(v67, 13573);
+    int16x8_t v80 = vaddq_s16(v80_tmp, v67);
+    int16x8_t v81 = vaddq_s16(v64, v72);
+    int16x8_t v82 = vaddq_s16(v80, v81);
+    int16x8_t v83 = vqrdmulhq_n_s16(v82, 16705);
+    int16x8_t v84 = vaddq_s16(v79, v83);
+    int16x8_t v85 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v86_tmp = vqrdmulhq_n_s16(v85, 13573);
+    int16x8_t v86 = vaddq_s16(v86_tmp, v85);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v88 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v89 = vaddq_s16(v87, v88);
+    int16x8_t v90 = vaddq_s16(v86, v89);
+    int16x8_t v91 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v92 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v93 = vaddq_s16(v91, v92);
+    int16x8_t v94 = vqrdmulhq_n_s16(v93, 25080);
+    int16x8_t v95 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v96 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v97 = vaddq_s16(v95, v96);
+    int16x8_t v98 = vaddq_s16(v97, v93);
+    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+    int16x8_t v100 = vaddq_s16(v94, v99);
+    int16x8_t v101 = vaddq_s16(v90, v100);
+    int16x8_t v102 = vaddq_s16(v84, v101);
+    int16x8_t v103 = vaddq_s16(v92, v65);
+    int16x8_t v104 = vaddq_s16(v66, v85);
+    int16x8_t v105 = vaddq_s16(v103, v104);
+    int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
+    int16x8_t v106 = vaddq_s16(v106_tmp, v105);
+    int16x8_t v107 = vaddq_s16(v96, v70);
+    int16x8_t v108 = vaddq_s16(v71, v87);
+    int16x8_t v109 = vaddq_s16(v107, v108);
+    int16x8_t v110 = vaddq_s16(v63, v91);
+    int16x8_t v111 = vaddq_s16(v88, v62);
+    int16x8_t v112 = vaddq_s16(v110, v111);
+    int16x8_t v113 = vaddq_s16(v109, v112);
+    int16x8_t v114 = vaddq_s16(v106, v113);
+    int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
+    int16x8_t v116 = vaddq_s16(v112, v105);
+    int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
+    int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
+    int16x8_t v119 = vaddq_s16(v74, v95);
+    int16x8_t v120 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v121 = vaddq_s16(v120, v73);
+    int16x8_t v122 = vaddq_s16(v119, v121);
+    int16x8_t v123 = vaddq_s16(v122, v109);
+    int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
+    int16x8_t v125 = vaddq_s16(v118, v124);
+    int16x8_t v126 = vaddq_s16(v117, v125);
+    int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
+    int16x8_t v128 = vaddq_s16(v115, v127);
+    int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
+    int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
+    int16x8_t v130 = vaddq_s16(v130_tmp, v104);
+    int16x8_t v131 = vaddq_s16(v108, v111);
+    int16x8_t v132 = vaddq_s16(v130, v131);
+    int16x8_t v133 = vaddq_s16(v119, v107);
+    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+    int16x8_t v135 = vaddq_s16(v110, v103);
+    int16x8_t v136_tmp = vqrdmulhq_n_s16(v135, 10045);
+    int16x8_t v136 = vaddq_s16(v136_tmp, v135);
+    int16x8_t v137 = vaddq_s16(v134, v136);
+    int16x8_t v138 = vaddq_s16(v132, v137);
+    int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
+    int16x8_t v140 = vaddq_s16(v129, v139);
+    int16x8_t v141 = vaddq_s16(v102, v140);
+    int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);  // v142: scaled combined contribution of the 16 odd input rows
+    int16x8_t v143 = vaddq_s16(v61, v142);  // output row 0 = even part + odd part (row 31 below is even part - odd part)
+    int16x8_t v144 = vsubq_s16(v0, v1);
+    int16x8_t v145 = vsubq_s16(v4, v6);
+    int16x8_t v146_tmp = vqrdmulhq_n_s16(v145, 10045);
+    int16x8_t v146 = vaddq_s16(v146_tmp, v145);
+    int16x8_t v147 = vaddq_s16(v144, v146);
+    int16x8_t v148 = vsubq_s16(v11, v14);
+    int16x8_t v149 = vqrdmulhq_n_s16(v18, 17734);
+    int16x8_t v150_tmp = vqrdmulhq_n_s16(v17, 10045);
+    int16x8_t v150 = vaddq_s16(v150_tmp, v17);
+    int16x8_t v151 = vsubq_s16(v149, v150);
+    int16x8_t v152 = vaddq_s16(v148, v151);
+    int16x8_t v153 = vqrdmulhq_n_s16(v152, 19705);
+    int16x8_t v154 = vaddq_s16(v147, v153);
+    int16x8_t v155 = vsubq_s16(v27, v30);
+    int16x8_t v156 = vqrdmulhq_n_s16(v34, 17734);
+    int16x8_t v157_tmp = vqrdmulhq_n_s16(v38, 10045);
+    int16x8_t v157 = vaddq_s16(v157_tmp, v38);
+    int16x8_t v158 = vsubq_s16(v156, v157);
+    int16x8_t v159 = vaddq_s16(v155, v158);
+    int16x8_t v160 = vqrdmulhq_n_s16(v54, 13573);
+    int16x8_t v161 = vsubq_s16(v160, v52);
+    int16x8_t v162 = vqrdmulhq_n_s16(v161, 25746);
+    int16x8_t v163 = vsubq_s16(v44, v47);
+    int16x8_t v164 = vqrdmulhq_n_s16(v163, 19705);
+    int16x8_t v165 = vaddq_s16(v162, v164);
+    int16x8_t v166 = vaddq_s16(v159, v165);
+    int16x8_t v167 = vqrdmulhq_n_s16(v166, 17121);
+    int16x8_t v168 = vaddq_s16(v154, v167);
+    int16x8_t v169 = vsubq_s16(v86, v89);
+    int16x8_t v170 = vqrdmulhq_n_s16(v93, 17734);
+    int16x8_t v171_tmp = vqrdmulhq_n_s16(v97, 10045);
+    int16x8_t v171 = vaddq_s16(v171_tmp, v97);
+    int16x8_t v172 = vsubq_s16(v170, v171);
+    int16x8_t v173 = vaddq_s16(v169, v172);
+    int16x8_t v174 = vsubq_s16(v80, v81);
+    int16x8_t v175 = vqrdmulhq_n_s16(v174, 19705);
+    int16x8_t v176 = vqrdmulhq_n_s16(v68, 13573);
+    int16x8_t v177 = vsubq_s16(v176, v76);
+    int16x8_t v178 = vqrdmulhq_n_s16(v177, 25746);
+    int16x8_t v179 = vaddq_s16(v175, v178);
+    int16x8_t v180 = vaddq_s16(v173, v179);
+    int16x8_t v181 = vsubq_s16(v130, v131);
+    int16x8_t v182 = vqrdmulhq_n_s16(v135, 13573);
+    int16x8_t v183 = vsubq_s16(v182, v133);
+    int16x8_t v184_tmp = vqrdmulhq_n_s16(v183, 10045);
+    int16x8_t v184 = vaddq_s16(v184_tmp, v183);
+    int16x8_t v185 = vaddq_s16(v181, v184);
+    int16x8_t v186 = vqrdmulhq_n_s16(v185, 17121);
+    int16x8_t v187 = vqrdmulhq_n_s16(v105, 27867);
+    int16x8_t v188 = vqrdmulhq_n_s16(v113, 19705);
+    int16x8_t v189 = vsubq_s16(v187, v188);
+    int16x8_t v190 = vqrdmulhq_n_s16(v116, 13573);
+    int16x8_t v191 = vsubq_s16(v190, v123);
+    int16x8_t v192 = vqrdmulhq_n_s16(v191, 25746);
+    int16x8_t v193 = vaddq_s16(v189, v192);
+    int16x8_t v194 = vqrdmulhq_n_s16(v193, 17121);
+    int16x8_t v195 = vaddq_s16(v186, v194);
+    int16x8_t v196 = vaddq_s16(v180, v195);
+    int16x8_t v197 = vqrdmulhq_n_s16(v196, 16563);
+    int16x8_t v198 = vaddq_s16(v168, v197);
+    int16x8_t v199 = vsubq_s16(v144, v146);
+    int16x8_t v200 = vsubq_s16(v148, v151);
+    int16x8_t v201 = vqrdmulhq_n_s16(v200, 29490);
+    int16x8_t v202 = vaddq_s16(v199, v201);
+    int16x8_t v203 = vsubq_s16(v155, v158);
+    int16x8_t v204 = vqrdmulhq_n_s16(v163, 29490);
+    int16x8_t v205_tmp = vqrdmulhq_n_s16(v161, 5763);
+    int16x8_t v205 = vaddq_s16(v205_tmp, v161);
+    int16x8_t v206 = vsubq_s16(v204, v205);
+    int16x8_t v207 = vaddq_s16(v203, v206);
+    int16x8_t v208 = vqrdmulhq_n_s16(v207, 18578);
+    int16x8_t v209 = vaddq_s16(v202, v208);
+    int16x8_t v210 = vsubq_s16(v169, v172);
+    int16x8_t v211 = vqrdmulhq_n_s16(v174, 29490);
+    int16x8_t v212_tmp = vqrdmulhq_n_s16(v177, 5763);
+    int16x8_t v212 = vaddq_s16(v212_tmp, v177);
+    int16x8_t v213 = vsubq_s16(v211, v212);
+    int16x8_t v214 = vaddq_s16(v210, v213);
+    int16x8_t v215 = vsubq_s16(v181, v184);
+    int16x8_t v216 = vqrdmulhq_n_s16(v215, 18578);
+    int16x8_t v217 = vqrdmulhq_n_s16(v189, 27803);
+    int16x8_t v218 = vqrdmulhq_n_s16(v191, 21845);
+    int16x8_t v219 = vsubq_s16(v217, v218);
+    int16x8_t v220 = vaddq_s16(v216, v219);
+    int16x8_t v221 = vaddq_s16(v214, v220);
+    int16x8_t v222 = vqrdmulhq_n_s16(v221, 16890);
+    int16x8_t v223 = vaddq_s16(v209, v222);
+    int16x8_t v224 = vsubq_s16(v2, v8);
+    int16x8_t v225 = vsubq_s16(v15, v22);
+    int16x8_t v226_tmp = vqrdmulhq_n_s16(v225, 18446);
+    int16x8_t v226 = vmlaq_n_s16(v226_tmp, v225, 2);  // vmlaq_n_s16(tmp, x, 2) = tmp + 2*x: effective multiplier 2 + 18446/32768
+    int16x8_t v227 = vaddq_s16(v224, v226);
+    int16x8_t v228 = vsubq_s16(v31, v41);
+    int16x8_t v229 = vsubq_s16(v48, v56);
+    int16x8_t v230_tmp = vqrdmulhq_n_s16(v229, 18446);
+    int16x8_t v230 = vmlaq_n_s16(v230_tmp, v229, 2);
+    int16x8_t v231 = vaddq_s16(v228, v230);
+    int16x8_t v232 = vqrdmulhq_n_s16(v231, 21195);
+    int16x8_t v233 = vaddq_s16(v227, v232);
+    int16x8_t v234 = vsubq_s16(v82, v78);
+    int16x8_t v235_tmp = vqrdmulhq_n_s16(v234, 18446);
+    int16x8_t v235 = vmlaq_n_s16(v235_tmp, v234, 2);
+    int16x8_t v236 = vsubq_s16(v90, v100);
+    int16x8_t v237 = vaddq_s16(v235, v236);
+    int16x8_t v238 = vsubq_s16(v132, v137);
+    int16x8_t v239 = vsubq_s16(v114, v126);
+    int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 18446);
+    int16x8_t v240 = vmlaq_n_s16(v240_tmp, v239, 2);
+    int16x8_t v241 = vaddq_s16(v238, v240);
+    int16x8_t v242 = vqrdmulhq_n_s16(v241, 21195);
+    int16x8_t v243 = vaddq_s16(v237, v242);
+    int16x8_t v244 = vqrdmulhq_n_s16(v243, 17401);
+    int16x8_t v245 = vaddq_s16(v233, v244);
+    int16x8_t v246 = vsubq_s16(v228, v230);
+    int16x8_t v247 = vqrdmulhq_n_s16(v246, 25826);
+    int16x8_t v248 = vsubq_s16(v224, v226);
+    int16x8_t v249 = vaddq_s16(v247, v248);
+    int16x8_t v250 = vsubq_s16(v238, v240);
+    int16x8_t v251 = vqrdmulhq_n_s16(v250, 25826);
+    int16x8_t v252 = vsubq_s16(v236, v235);
+    int16x8_t v253 = vaddq_s16(v251, v252);
+    int16x8_t v254 = vqrdmulhq_n_s16(v253, 18124);
+    int16x8_t v255 = vaddq_s16(v249, v254);
+    int16x8_t v256 = vsubq_s16(v199, v201);
+    int16x8_t v257 = vsubq_s16(v203, v206);
+    int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 1988);
+    int16x8_t v258 = vaddq_s16(v258_tmp, v257);
+    int16x8_t v259 = vaddq_s16(v256, v258);
+    int16x8_t v260 = vsubq_s16(v210, v213);
+    int16x8_t v261_tmp = vqrdmulhq_n_s16(v219, 25030);
+    int16x8_t v261 = vaddq_s16(v261_tmp, v219);
+    int16x8_t v262 = vsubq_s16(v215, v261);
+    int16x8_t v263_tmp = vqrdmulhq_n_s16(v262, 1988);
+    int16x8_t v263 = vaddq_s16(v263_tmp, v262);
+    int16x8_t v264 = vaddq_s16(v260, v263);
+    int16x8_t v265 = vqrdmulhq_n_s16(v264, 19102);
+    int16x8_t v266 = vaddq_s16(v259, v265);
+    int16x8_t v267 = vsubq_s16(v147, v153);
+    int16x8_t v268 = vsubq_s16(v159, v165);
+    int16x8_t v269_tmp = vqrdmulhq_n_s16(v268, 23673);
+    int16x8_t v269 = vaddq_s16(v269_tmp, v268);
+    int16x8_t v270 = vaddq_s16(v267, v269);
+    int16x8_t v271 = vsubq_s16(v173, v179);
+    int16x8_t v272 = vsubq_s16(v185, v193);
+    int16x8_t v273_tmp = vqrdmulhq_n_s16(v272, 23673);
+    int16x8_t v273 = vaddq_s16(v273_tmp, v272);
+    int16x8_t v274 = vaddq_s16(v271, v273);
+    int16x8_t v275 = vqrdmulhq_n_s16(v274, 20398);
+    int16x8_t v276 = vaddq_s16(v270, v275);
+    int16x8_t v277 = vsubq_s16(v9, v24);
+    int16x8_t v278 = vsubq_s16(v42, v58);
+    int16x8_t v279_tmp = vqrdmulhq_n_s16(v278, 3314);
+    int16x8_t v279 = vmlaq_n_s16(v279_tmp, v278, 5);
+    int16x8_t v280 = vaddq_s16(v277, v279);
+    int16x8_t v281 = vsubq_s16(v138, v128);
+    int16x8_t v282_tmp = vqrdmulhq_n_s16(v281, 3314);
+    int16x8_t v282 = vmlaq_n_s16(v282_tmp, v281, 5);
+    int16x8_t v283 = vsubq_s16(v101, v84);
+    int16x8_t v284 = vaddq_s16(v282, v283);
+    int16x8_t v285 = vqrdmulhq_n_s16(v284, 22112);
+    int16x8_t v286 = vaddq_s16(v280, v285);
+    int16x8_t v287 = vsubq_s16(v277, v279);
+    int16x8_t v288 = vsubq_s16(v283, v282);
+    int16x8_t v289 = vqrdmulhq_n_s16(v288, 24397);
+    int16x8_t v290 = vaddq_s16(v287, v289);
+    int16x8_t v291 = vsubq_s16(v267, v269);
+    int16x8_t v292 = vsubq_s16(v271, v273);
+    int16x8_t v293 = vqrdmulhq_n_s16(v292, 27504);
+    int16x8_t v294 = vaddq_s16(v291, v293);
+    int16x8_t v295 = vsubq_s16(v260, v263);
+    int16x8_t v296 = vqrdmulhq_n_s16(v295, 31869);
+    int16x8_t v297 = vsubq_s16(v256, v258);
+    int16x8_t v298 = vaddq_s16(v296, v297);
+    int16x8_t v299 = vsubq_s16(v248, v247);
+    int16x8_t v300 = vsubq_s16(v252, v251);
+    int16x8_t v301_tmp = vqrdmulhq_n_s16(v300, 5552);
+    int16x8_t v301 = vaddq_s16(v301_tmp, v300);
+    int16x8_t v302 = vaddq_s16(v299, v301);
+    int16x8_t v303 = vsubq_s16(v227, v232);
+    int16x8_t v304 = vsubq_s16(v237, v242);
+    int16x8_t v305_tmp = vqrdmulhq_n_s16(v304, 15865);
+    int16x8_t v305 = vaddq_s16(v305_tmp, v304);
+    int16x8_t v306 = vaddq_s16(v303, v305);
+    int16x8_t v307 = vsubq_s16(v202, v208);
+    int16x8_t v308 = vsubq_s16(v214, v220);
+    int16x8_t v309_tmp = vqrdmulhq_n_s16(v308, 1893);
+    int16x8_t v309 = vmlaq_n_s16(v309_tmp, v308, 2);
+    int16x8_t v310 = vaddq_s16(v307, v309);
+    int16x8_t v311 = vsubq_s16(v154, v167);
+    int16x8_t v312 = vsubq_s16(v180, v195);
+    int16x8_t v313_tmp = vqrdmulhq_n_s16(v312, 13357);
+    int16x8_t v313 = vmlaq_n_s16(v313_tmp, v312, 3);
+    int16x8_t v314 = vaddq_s16(v311, v313);
+    int16x8_t v315 = vsubq_s16(v102, v140);
+    int16x8_t v316_tmp = vqrdmulhq_n_s16(v315, 6226);
+    int16x8_t v316 = vmlaq_n_s16(v316_tmp, v315, 10);
+    int16x8_t v317 = vsubq_s16(v25, v60);
+    int16x8_t v318 = vaddq_s16(v316, v317);
+    int16x8_t v319 = vsubq_s16(v317, v316);  // v319..v334: difference counterparts of the sums stored to rows 15..0
+    int16x8_t v320 = vsubq_s16(v311, v313);
+    int16x8_t v321 = vsubq_s16(v307, v309);
+    int16x8_t v322 = vsubq_s16(v303, v305);
+    int16x8_t v323 = vsubq_s16(v299, v301);
+    int16x8_t v324 = vsubq_s16(v297, v296);
+    int16x8_t v325 = vsubq_s16(v291, v293);
+    int16x8_t v326 = vsubq_s16(v287, v289);
+    int16x8_t v327 = vsubq_s16(v280, v285);
+    int16x8_t v328 = vsubq_s16(v270, v275);
+    int16x8_t v329 = vsubq_s16(v259, v265);
+    int16x8_t v330 = vsubq_s16(v249, v254);
+    int16x8_t v331 = vsubq_s16(v233, v244);
+    int16x8_t v332 = vsubq_s16(v209, v222);
+    int16x8_t v333 = vsubq_s16(v168, v197);
+    int16x8_t v334 = vsubq_s16(v61, v142);
+    vst1q_s16(out + out_stride * 0 + i, v143);  // rows 0..15 receive the "a + b" terms
+    vst1q_s16(out + out_stride * 1 + i, v198);
+    vst1q_s16(out + out_stride * 2 + i, v223);
+    vst1q_s16(out + out_stride * 3 + i, v245);
+    vst1q_s16(out + out_stride * 4 + i, v255);
+    vst1q_s16(out + out_stride * 5 + i, v266);
+    vst1q_s16(out + out_stride * 6 + i, v276);
+    vst1q_s16(out + out_stride * 7 + i, v286);
+    vst1q_s16(out + out_stride * 8 + i, v290);
+    vst1q_s16(out + out_stride * 9 + i, v294);
+    vst1q_s16(out + out_stride * 10 + i, v298);
+    vst1q_s16(out + out_stride * 11 + i, v302);
+    vst1q_s16(out + out_stride * 12 + i, v306);
+    vst1q_s16(out + out_stride * 13 + i, v310);
+    vst1q_s16(out + out_stride * 14 + i, v314);
+    vst1q_s16(out + out_stride * 15 + i, v318);
+    vst1q_s16(out + out_stride * 16 + i, v319);  // rows 16..31 receive the matching "a - b" terms (row 31-k pairs with row k)
+    vst1q_s16(out + out_stride * 17 + i, v320);
+    vst1q_s16(out + out_stride * 18 + i, v321);
+    vst1q_s16(out + out_stride * 19 + i, v322);
+    vst1q_s16(out + out_stride * 20 + i, v323);
+    vst1q_s16(out + out_stride * 21 + i, v324);
+    vst1q_s16(out + out_stride * 22 + i, v325);
+    vst1q_s16(out + out_stride * 23 + i, v326);
+    vst1q_s16(out + out_stride * 24 + i, v327);
+    vst1q_s16(out + out_stride * 25 + i, v328);
+    vst1q_s16(out + out_stride * 26 + i, v329);
+    vst1q_s16(out + out_stride * 27 + i, v330);
+    vst1q_s16(out + out_stride * 28 + i, v331);
+    vst1q_s16(out + out_stride * 29 + i, v332);
+    vst1q_s16(out + out_stride * 30 + i, v333);
+    vst1q_s16(out + out_stride * 31 + i, v334);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h
new file mode 100644
index 0000000000..400da1a9de
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h
@@ -0,0 +1,985 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<64>) { return 1; }  // Compile-time constant selected by the 64-point tag; always 1. NOTE(review): presumably the integer-bit count of the fixed-point format used by the 64-point FastIDCT below - confirm against the including header.
+
+void FastIDCT(FastDCTTag<64>, const int16_t* in, size_t in_stride, int16_t* out,
+              size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 32 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 48 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 40 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vld1q_s16(in + in_stride * 56 + i);
+    int16x8_t v17 = vaddq_s16(v16, v12);
+    int16x8_t v18 = vaddq_s16(v13, v10);
+    int16x8_t v19 = vaddq_s16(v17, v18);
+    int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
+    int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
+    int16x8_t v22 = vaddq_s16(v20, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 36 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 52 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 44 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
+    int16x8_t v41 = vaddq_s16(v35, v40);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v37, v28);
+    int16x8_t v46 = vaddq_s16(v29, v32);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
+    int16x8_t v50 = vaddq_s16(v46, v43);
+    int16x8_t v51_tmp = vqrdmulhq_n_s16(v50, 10045);
+    int16x8_t v51 = vaddq_s16(v51_tmp, v50);
+    int16x8_t v52 = vld1q_s16(in + in_stride * 60 + i);
+    int16x8_t v53 = vaddq_s16(v52, v36);
+    int16x8_t v54 = vaddq_s16(v53, v45);
+    int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734);
+    int16x8_t v56 = vaddq_s16(v51, v55);
+    int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
+    int16x8_t v58 = vaddq_s16(v49, v57);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+    int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+    int16x8_t v64 = vld1q_s16(in + in_stride * 34 + i);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v66 = vaddq_s16(v64, v65);
+    int16x8_t v67 = vaddq_s16(v63, v66);
+    int16x8_t v68 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v69 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v70 = vaddq_s16(v68, v69);
+    int16x8_t v71 = vqrdmulhq_n_s16(v70, 25080);
+    int16x8_t v72 = vld1q_s16(in + in_stride * 50 + i);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 46 + i);
+    int16x8_t v74 = vaddq_s16(v72, v73);
+    int16x8_t v75 = vaddq_s16(v74, v70);
+    int16x8_t v76 = vqrdmulhq_n_s16(v75, 17734);
+    int16x8_t v77 = vaddq_s16(v71, v76);
+    int16x8_t v78 = vaddq_s16(v67, v77);
+    int16x8_t v79 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v80 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v81 = vaddq_s16(v79, v80);
+    int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+    int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+    int16x8_t v83 = vld1q_s16(in + in_stride * 42 + i);
+    int16x8_t v84 = vld1q_s16(in + in_stride * 38 + i);
+    int16x8_t v85 = vaddq_s16(v83, v84);
+    int16x8_t v86 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v88 = vaddq_s16(v86, v87);
+    int16x8_t v89 = vaddq_s16(v85, v88);
+    int16x8_t v90 = vaddq_s16(v82, v89);
+    int16x8_t v91 = vqrdmulhq_n_s16(v90, 16705);
+    int16x8_t v92 = vaddq_s16(v88, v81);
+    int16x8_t v93_tmp = vqrdmulhq_n_s16(v92, 10045);
+    int16x8_t v93 = vaddq_s16(v93_tmp, v92);
+    int16x8_t v94 = vld1q_s16(in + in_stride * 58 + i);
+    int16x8_t v95 = vld1q_s16(in + in_stride * 54 + i);
+    int16x8_t v96 = vaddq_s16(v94, v95);
+    int16x8_t v97 = vaddq_s16(v96, v85);
+    int16x8_t v98 = vqrdmulhq_n_s16(v97, 17734);
+    int16x8_t v99 = vaddq_s16(v93, v98);
+    int16x8_t v100 = vqrdmulhq_n_s16(v99, 16705);
+    int16x8_t v101 = vaddq_s16(v91, v100);
+    int16x8_t v102 = vaddq_s16(v78, v101);
+    int16x8_t v103 = vaddq_s16(v69, v79);
+    int16x8_t v104 = vaddq_s16(v80, v62);
+    int16x8_t v105 = vaddq_s16(v103, v104);
+    int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
+    int16x8_t v106 = vaddq_s16(v106_tmp, v105);
+    int16x8_t v107 = vaddq_s16(v73, v83);
+    int16x8_t v108 = vaddq_s16(v84, v64);
+    int16x8_t v109 = vaddq_s16(v107, v108);
+    int16x8_t v110 = vaddq_s16(v65, v86);
+    int16x8_t v111 = vaddq_s16(v87, v68);
+    int16x8_t v112 = vaddq_s16(v110, v111);
+    int16x8_t v113 = vaddq_s16(v109, v112);
+    int16x8_t v114 = vaddq_s16(v106, v113);
+    int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
+    int16x8_t v116 = vaddq_s16(v112, v105);
+    int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
+    int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
+    int16x8_t v119 = vld1q_s16(in + in_stride * 62 + i);
+    int16x8_t v120 = vaddq_s16(v119, v94);
+    int16x8_t v121 = vaddq_s16(v95, v72);
+    int16x8_t v122 = vaddq_s16(v120, v121);
+    int16x8_t v123 = vaddq_s16(v122, v109);
+    int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
+    int16x8_t v125 = vaddq_s16(v118, v124);
+    int16x8_t v126 = vaddq_s16(v117, v125);
+    int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
+    int16x8_t v128 = vaddq_s16(v115, v127);
+    int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
+    int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
+    int16x8_t v130 = vaddq_s16(v130_tmp, v104);
+    int16x8_t v131 = vaddq_s16(v108, v110);
+    int16x8_t v132 = vaddq_s16(v130, v131);
+    int16x8_t v133 = vaddq_s16(v111, v103);
+    int16x8_t v134_tmp = vqrdmulhq_n_s16(v133, 10045);
+    int16x8_t v134 = vaddq_s16(v134_tmp, v133);
+    int16x8_t v135 = vaddq_s16(v121, v107);
+    int16x8_t v136 = vqrdmulhq_n_s16(v135, 17734);
+    int16x8_t v137 = vaddq_s16(v134, v136);
+    int16x8_t v138 = vaddq_s16(v132, v137);
+    int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
+    int16x8_t v140 = vaddq_s16(v129, v139);
+    int16x8_t v141 = vaddq_s16(v102, v140);
+    int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
+    int16x8_t v143 = vaddq_s16(v61, v142);
+    int16x8_t v144 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v145_tmp = vqrdmulhq_n_s16(v144, 13573);
+    int16x8_t v145 = vaddq_s16(v145_tmp, v144);
+    int16x8_t v146 = vld1q_s16(in + in_stride * 33 + i);
+    int16x8_t v147 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v148 = vaddq_s16(v146, v147);
+    int16x8_t v149 = vaddq_s16(v145, v148);
+    int16x8_t v150 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v151 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v152 = vaddq_s16(v150, v151);
+    int16x8_t v153 = vqrdmulhq_n_s16(v152, 25080);
+    int16x8_t v154 = vld1q_s16(in + in_stride * 49 + i);
+    int16x8_t v155 = vld1q_s16(in + in_stride * 47 + i);
+    int16x8_t v156 = vaddq_s16(v154, v155);
+    int16x8_t v157 = vaddq_s16(v156, v152);
+    int16x8_t v158 = vqrdmulhq_n_s16(v157, 17734);
+    int16x8_t v159 = vaddq_s16(v153, v158);
+    int16x8_t v160 = vaddq_s16(v149, v159);
+    int16x8_t v161 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v162 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v163 = vaddq_s16(v161, v162);
+    int16x8_t v164_tmp = vqrdmulhq_n_s16(v163, 13573);
+    int16x8_t v164 = vaddq_s16(v164_tmp, v163);
+    int16x8_t v165 = vld1q_s16(in + in_stride * 41 + i);
+    int16x8_t v166 = vld1q_s16(in + in_stride * 39 + i);
+    int16x8_t v167 = vaddq_s16(v165, v166);
+    int16x8_t v168 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v169 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v170 = vaddq_s16(v168, v169);
+    int16x8_t v171 = vaddq_s16(v167, v170);
+    int16x8_t v172 = vaddq_s16(v164, v171);
+    int16x8_t v173 = vqrdmulhq_n_s16(v172, 16705);
+    int16x8_t v174 = vaddq_s16(v170, v163);
+    int16x8_t v175_tmp = vqrdmulhq_n_s16(v174, 10045);
+    int16x8_t v175 = vaddq_s16(v175_tmp, v174);
+    int16x8_t v176 = vld1q_s16(in + in_stride * 57 + i);
+    int16x8_t v177 = vld1q_s16(in + in_stride * 55 + i);
+    int16x8_t v178 = vaddq_s16(v176, v177);
+    int16x8_t v179 = vaddq_s16(v178, v167);
+    int16x8_t v180 = vqrdmulhq_n_s16(v179, 17734);
+    int16x8_t v181 = vaddq_s16(v175, v180);
+    int16x8_t v182 = vqrdmulhq_n_s16(v181, 16705);
+    int16x8_t v183 = vaddq_s16(v173, v182);
+    int16x8_t v184 = vaddq_s16(v160, v183);
+    int16x8_t v185 = vld1q_s16(in + in_stride * 37 + i);
+    int16x8_t v186 = vld1q_s16(in + in_stride * 35 + i);
+    int16x8_t v187 = vaddq_s16(v185, v186);
+    int16x8_t v188 = vld1q_s16(in + in_stride * 45 + i);
+    int16x8_t v189 = vld1q_s16(in + in_stride * 43 + i);
+    int16x8_t v190 = vaddq_s16(v188, v189);
+    int16x8_t v191 = vaddq_s16(v187, v190);
+    int16x8_t v192 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v193 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v194 = vaddq_s16(v192, v193);
+    int16x8_t v195 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v196 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v197 = vaddq_s16(v195, v196);
+    int16x8_t v198 = vaddq_s16(v194, v197);
+    int16x8_t v199 = vaddq_s16(v191, v198);
+    int16x8_t v200 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v201 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v202 = vaddq_s16(v200, v201);
+    int16x8_t v203 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v204 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v205 = vaddq_s16(v203, v204);
+    int16x8_t v206 = vaddq_s16(v202, v205);
+    int16x8_t v207_tmp = vqrdmulhq_n_s16(v206, 13573);
+    int16x8_t v207 = vaddq_s16(v207_tmp, v206);
+    int16x8_t v208 = vaddq_s16(v199, v207);
+    int16x8_t v209 = vqrdmulhq_n_s16(v208, 16705);
+    int16x8_t v210 = vaddq_s16(v198, v206);
+    int16x8_t v211 = vqrdmulhq_n_s16(v210, 25080);
+    int16x8_t v212 = vqrdmulhq_n_s16(v210, 17734);
+    int16x8_t v213 = vld1q_s16(in + in_stride * 53 + i);
+    int16x8_t v214 = vld1q_s16(in + in_stride * 51 + i);
+    int16x8_t v215 = vaddq_s16(v213, v214);
+    int16x8_t v216 = vld1q_s16(in + in_stride * 61 + i);
+    int16x8_t v217 = vld1q_s16(in + in_stride * 59 + i);
+    int16x8_t v218 = vaddq_s16(v216, v217);
+    int16x8_t v219 = vaddq_s16(v215, v218);
+    int16x8_t v220 = vaddq_s16(v219, v191);
+    int16x8_t v221 = vqrdmulhq_n_s16(v220, 17734);
+    int16x8_t v222 = vaddq_s16(v212, v221);
+    int16x8_t v223 = vaddq_s16(v211, v222);
+    int16x8_t v224 = vqrdmulhq_n_s16(v223, 16705);
+    int16x8_t v225 = vaddq_s16(v209, v224);
+    int16x8_t v226 = vqrdmulhq_n_s16(v225, 16463);
+    int16x8_t v227_tmp = vqrdmulhq_n_s16(v202, 13573);
+    int16x8_t v227 = vaddq_s16(v227_tmp, v202);
+    int16x8_t v228 = vaddq_s16(v187, v194);
+    int16x8_t v229 = vaddq_s16(v227, v228);
+    int16x8_t v230 = vaddq_s16(v215, v190);
+    int16x8_t v231 = vqrdmulhq_n_s16(v230, 17734);
+    int16x8_t v232 = vaddq_s16(v197, v205);
+    int16x8_t v233_tmp = vqrdmulhq_n_s16(v232, 10045);
+    int16x8_t v233 = vaddq_s16(v233_tmp, v232);
+    int16x8_t v234 = vaddq_s16(v231, v233);
+    int16x8_t v235 = vaddq_s16(v229, v234);
+    int16x8_t v236 = vqrdmulhq_n_s16(v235, 16463);
+    int16x8_t v237 = vaddq_s16(v226, v236);
+    int16x8_t v238 = vaddq_s16(v184, v237);
+    int16x8_t v239 = vaddq_s16(v201, v144);
+    int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 13573);
+    int16x8_t v240 = vaddq_s16(v240_tmp, v239);
+    int16x8_t v241 = vaddq_s16(v186, v146);
+    int16x8_t v242 = vaddq_s16(v147, v192);
+    int16x8_t v243 = vaddq_s16(v241, v242);
+    int16x8_t v244 = vaddq_s16(v240, v243);
+    int16x8_t v245 = vaddq_s16(v196, v150);
+    int16x8_t v246 = vaddq_s16(v151, v203);
+    int16x8_t v247 = vaddq_s16(v245, v246);
+    int16x8_t v248_tmp = vqrdmulhq_n_s16(v247, 10045);
+    int16x8_t v248 = vaddq_s16(v248_tmp, v247);
+    int16x8_t v249 = vaddq_s16(v155, v188);
+    int16x8_t v250 = vaddq_s16(v214, v154);
+    int16x8_t v251 = vaddq_s16(v249, v250);
+    int16x8_t v252 = vqrdmulhq_n_s16(v251, 17734);
+    int16x8_t v253 = vaddq_s16(v248, v252);
+    int16x8_t v254 = vaddq_s16(v244, v253);
+    int16x8_t v255 = vaddq_s16(v204, v161);
+    int16x8_t v256 = vaddq_s16(v162, v200);
+    int16x8_t v257 = vaddq_s16(v255, v256);
+    int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 13573);
+    int16x8_t v258 = vaddq_s16(v258_tmp, v257);
+    int16x8_t v259 = vaddq_s16(v189, v165);
+    int16x8_t v260 = vaddq_s16(v166, v185);
+    int16x8_t v261 = vaddq_s16(v259, v260);
+    int16x8_t v262 = vaddq_s16(v169, v195);
+    int16x8_t v263 = vaddq_s16(v193, v168);
+    int16x8_t v264 = vaddq_s16(v262, v263);
+    int16x8_t v265 = vaddq_s16(v261, v264);
+    int16x8_t v266 = vaddq_s16(v258, v265);
+    int16x8_t v267 = vqrdmulhq_n_s16(v266, 16705);
+    int16x8_t v268 = vaddq_s16(v264, v257);
+    int16x8_t v269 = vqrdmulhq_n_s16(v268, 25080);
+    int16x8_t v270 = vaddq_s16(v217, v176);
+    int16x8_t v271 = vaddq_s16(v177, v213);
+    int16x8_t v272 = vaddq_s16(v270, v271);
+    int16x8_t v273 = vaddq_s16(v272, v261);
+    int16x8_t v274 = vqrdmulhq_n_s16(v273, 17734);
+    int16x8_t v275 = vqrdmulhq_n_s16(v268, 17734);
+    int16x8_t v276 = vaddq_s16(v274, v275);
+    int16x8_t v277 = vaddq_s16(v269, v276);
+    int16x8_t v278 = vqrdmulhq_n_s16(v277, 16705);
+    int16x8_t v279 = vaddq_s16(v267, v278);
+    int16x8_t v280 = vaddq_s16(v254, v279);
+    int16x8_t v281 = vqrdmulhq_n_s16(v280, 16404);
+    int16x8_t v282 = vaddq_s16(v256, v239);
+    int16x8_t v283_tmp = vqrdmulhq_n_s16(v282, 13573);
+    int16x8_t v283 = vaddq_s16(v283_tmp, v282);
+    int16x8_t v284 = vaddq_s16(v260, v241);
+    int16x8_t v285 = vaddq_s16(v242, v263);
+    int16x8_t v286 = vaddq_s16(v284, v285);
+    int16x8_t v287 = vaddq_s16(v283, v286);
+    int16x8_t v288 = vaddq_s16(v262, v245);
+    int16x8_t v289 = vaddq_s16(v246, v255);
+    int16x8_t v290 = vaddq_s16(v288, v289);
+    int16x8_t v291 = vqrdmulhq_n_s16(v290, 25080);
+    int16x8_t v292 = vqrdmulhq_n_s16(v290, 17734);
+    int16x8_t v293 = vaddq_s16(v271, v250);
+    int16x8_t v294 = vaddq_s16(v249, v259);
+    int16x8_t v295 = vaddq_s16(v293, v294);
+    int16x8_t v296 = vqrdmulhq_n_s16(v295, 17734);
+    int16x8_t v297 = vaddq_s16(v292, v296);
+    int16x8_t v298 = vaddq_s16(v291, v297);
+    int16x8_t v299 = vaddq_s16(v287, v298);
+    int16x8_t v300 = vqrdmulhq_n_s16(v299, 16463);
+    int16x8_t v301 = vaddq_s16(v289, v282);
+    int16x8_t v302 = vqrdmulhq_n_s16(v301, 23624);
+    int16x8_t v303 = vaddq_s16(v294, v284);
+    int16x8_t v304 = vqrdmulhq_n_s16(v303, 19705);
+    int16x8_t v305 = vaddq_s16(v285, v288);
+    int16x8_t v306 = vqrdmulhq_n_s16(v305, 19705);
+    int16x8_t v307 = vaddq_s16(v304, v306);
+    int16x8_t v308 = vqrdmulhq_n_s16(v307, 27779);
+    int16x8_t v309 = vaddq_s16(v302, v308);
+    int16x8_t v310 = vaddq_s16(v305, v301);
+    int16x8_t v311 = vqrdmulhq_n_s16(v310, 25080);
+    int16x8_t v312 = vqrdmulhq_n_s16(v310, 17734);
+    int16x8_t v313 = vld1q_s16(in + in_stride * 63 + i);
+    int16x8_t v314 = vaddq_s16(v313, v216);
+    int16x8_t v315 = vaddq_s16(v314, v270);
+    int16x8_t v316 = vaddq_s16(v315, v293);
+    int16x8_t v317 = vqrdmulhq_n_s16(v316, 25746);
+    int16x8_t v318 = vqrdmulhq_n_s16(v303, 25746);
+    int16x8_t v319 = vaddq_s16(v317, v318);
+    int16x8_t v320 = vqrdmulhq_n_s16(v319, 22571);
+    int16x8_t v321 = vaddq_s16(v312, v320);
+    int16x8_t v322 = vaddq_s16(v311, v321);
+    int16x8_t v323 = vqrdmulhq_n_s16(v322, 16705);
+    int16x8_t v324 = vaddq_s16(v309, v323);
+    int16x8_t v325 = vqrdmulhq_n_s16(v324, 16463);
+    int16x8_t v326 = vaddq_s16(v300, v325);
+    int16x8_t v327 = vqrdmulhq_n_s16(v326, 16404);
+    int16x8_t v328 = vaddq_s16(v281, v327);
+    int16x8_t v329 = vaddq_s16(v238, v328);
+    int16x8_t v330 = vqrdmulhq_n_s16(v329, 16389);
+    int16x8_t v331 = vaddq_s16(v143, v330);
+    int16x8_t v332 = vsubq_s16(v82, v89);
+    int16x8_t v333 = vqrdmulhq_n_s16(v332, 19705);
+    int16x8_t v334 = vqrdmulhq_n_s16(v92, 13573);
+    int16x8_t v335 = vsubq_s16(v334, v97);
+    int16x8_t v336 = vqrdmulhq_n_s16(v335, 25746);
+    int16x8_t v337 = vaddq_s16(v333, v336);
+    int16x8_t v338 = vsubq_s16(v63, v66);
+    int16x8_t v339 = vqrdmulhq_n_s16(v70, 17734);
+    int16x8_t v340_tmp = vqrdmulhq_n_s16(v74, 10045);
+    int16x8_t v340 = vaddq_s16(v340_tmp, v74);
+    int16x8_t v341 = vsubq_s16(v339, v340);
+    int16x8_t v342 = vaddq_s16(v338, v341);
+    int16x8_t v343 = vaddq_s16(v337, v342);
+    int16x8_t v344 = vsubq_s16(v130, v131);
+    int16x8_t v345 = vqrdmulhq_n_s16(v133, 13573);
+    int16x8_t v346 = vsubq_s16(v345, v135);
+    int16x8_t v347_tmp = vqrdmulhq_n_s16(v346, 10045);
+    int16x8_t v347 = vaddq_s16(v347_tmp, v346);
+    int16x8_t v348 = vaddq_s16(v344, v347);
+    int16x8_t v349 = vqrdmulhq_n_s16(v348, 17121);
+    int16x8_t v350 = vqrdmulhq_n_s16(v105, 27867);
+    int16x8_t v351 = vqrdmulhq_n_s16(v113, 19705);
+    int16x8_t v352 = vsubq_s16(v350, v351);
+    int16x8_t v353 = vqrdmulhq_n_s16(v116, 13573);
+    int16x8_t v354 = vsubq_s16(v353, v123);
+    int16x8_t v355 = vqrdmulhq_n_s16(v354, 25746);
+    int16x8_t v356 = vaddq_s16(v352, v355);
+    int16x8_t v357 = vqrdmulhq_n_s16(v356, 17121);
+    int16x8_t v358 = vaddq_s16(v349, v357);
+    int16x8_t v359 = vaddq_s16(v343, v358);
+    int16x8_t v360 = vqrdmulhq_n_s16(v359, 16563);
+    int16x8_t v361 = vsubq_s16(v27, v30);
+    int16x8_t v362 = vqrdmulhq_n_s16(v34, 17734);
+    int16x8_t v363_tmp = vqrdmulhq_n_s16(v38, 10045);
+    int16x8_t v363 = vaddq_s16(v363_tmp, v38);
+    int16x8_t v364 = vsubq_s16(v362, v363);
+    int16x8_t v365 = vaddq_s16(v361, v364);
+    int16x8_t v366 = vsubq_s16(v44, v47);
+    int16x8_t v367 = vqrdmulhq_n_s16(v366, 19705);
+    int16x8_t v368 = vqrdmulhq_n_s16(v50, 13573);
+    int16x8_t v369 = vsubq_s16(v368, v54);
+    int16x8_t v370 = vqrdmulhq_n_s16(v369, 25746);
+    int16x8_t v371 = vaddq_s16(v367, v370);
+    int16x8_t v372 = vaddq_s16(v365, v371);
+    int16x8_t v373 = vqrdmulhq_n_s16(v372, 17121);
+    int16x8_t v374 = vsubq_s16(v0, v1);
+    int16x8_t v375 = vsubq_s16(v4, v6);
+    int16x8_t v376_tmp = vqrdmulhq_n_s16(v375, 10045);
+    int16x8_t v376 = vaddq_s16(v376_tmp, v375);
+    int16x8_t v377 = vaddq_s16(v374, v376);
+    int16x8_t v378 = vsubq_s16(v11, v14);
+    int16x8_t v379 = vqrdmulhq_n_s16(v18, 17734);
+    int16x8_t v380_tmp = vqrdmulhq_n_s16(v17, 10045);
+    int16x8_t v380 = vaddq_s16(v380_tmp, v17);
+    int16x8_t v381 = vsubq_s16(v379, v380);
+    int16x8_t v382 = vaddq_s16(v378, v381);
+    int16x8_t v383 = vqrdmulhq_n_s16(v382, 19705);
+    int16x8_t v384 = vaddq_s16(v377, v383);
+    int16x8_t v385 = vaddq_s16(v373, v384);
+    int16x8_t v386 = vaddq_s16(v360, v385);
+    int16x8_t v387 = vsubq_s16(v145, v148);
+    int16x8_t v388 = vqrdmulhq_n_s16(v152, 17734);
+    int16x8_t v389_tmp = vqrdmulhq_n_s16(v156, 10045);
+    int16x8_t v389 = vaddq_s16(v389_tmp, v156);
+    int16x8_t v390 = vsubq_s16(v388, v389);
+    int16x8_t v391 = vaddq_s16(v387, v390);
+    int16x8_t v392 = vsubq_s16(v164, v171);
+    int16x8_t v393 = vqrdmulhq_n_s16(v392, 19705);
+    int16x8_t v394 = vqrdmulhq_n_s16(v174, 13573);
+    int16x8_t v395 = vsubq_s16(v394, v179);
+    int16x8_t v396 = vqrdmulhq_n_s16(v395, 25746);
+    int16x8_t v397 = vaddq_s16(v393, v396);
+    int16x8_t v398 = vaddq_s16(v391, v397);
+    int16x8_t v399 = vsubq_s16(v227, v228);
+    int16x8_t v400 = vqrdmulhq_n_s16(v232, 13573);
+    int16x8_t v401 = vsubq_s16(v400, v230);
+    int16x8_t v402_tmp = vqrdmulhq_n_s16(v401, 10045);
+    int16x8_t v402 = vaddq_s16(v402_tmp, v401);
+    int16x8_t v403 = vaddq_s16(v399, v402);
+    int16x8_t v404 = vqrdmulhq_n_s16(v403, 17121);
+    int16x8_t v405 = vqrdmulhq_n_s16(v206, 27867);
+    int16x8_t v406 = vqrdmulhq_n_s16(v199, 19705);
+    int16x8_t v407 = vsubq_s16(v405, v406);
+    int16x8_t v408 = vqrdmulhq_n_s16(v210, 13573);
+    int16x8_t v409 = vsubq_s16(v408, v220);
+    int16x8_t v410 = vqrdmulhq_n_s16(v409, 25746);
+    int16x8_t v411 = vaddq_s16(v407, v410);
+    int16x8_t v412 = vqrdmulhq_n_s16(v411, 17121);
+    int16x8_t v413 = vaddq_s16(v404, v412);
+    int16x8_t v414 = vaddq_s16(v398, v413);
+    int16x8_t v415 = vsubq_s16(v240, v243);
+    int16x8_t v416 = vqrdmulhq_n_s16(v247, 13573);
+    int16x8_t v417 = vsubq_s16(v416, v251);
+    int16x8_t v418_tmp = vqrdmulhq_n_s16(v417, 10045);
+    int16x8_t v418 = vaddq_s16(v418_tmp, v417);
+    int16x8_t v419 = vaddq_s16(v415, v418);
+    int16x8_t v420 = vqrdmulhq_n_s16(v257, 27867);
+    int16x8_t v421 = vqrdmulhq_n_s16(v265, 19705);
+    int16x8_t v422 = vsubq_s16(v420, v421);
+    int16x8_t v423 = vqrdmulhq_n_s16(v268, 13573);
+    int16x8_t v424 = vsubq_s16(v423, v273);
+    int16x8_t v425 = vqrdmulhq_n_s16(v424, 25746);
+    int16x8_t v426 = vaddq_s16(v422, v425);
+    int16x8_t v427 = vaddq_s16(v419, v426);
+    int16x8_t v428 = vqrdmulhq_n_s16(v427, 16563);
+    int16x8_t v429 = vqrdmulhq_n_s16(v301, 27867);
+    int16x8_t v430 = vsubq_s16(v429, v307);
+    int16x8_t v431 = vqrdmulhq_n_s16(v310, 10664);
+    int16x8_t v432 = vsubq_s16(v431, v319);
+    int16x8_t v433 = vaddq_s16(v430, v432);
+    int16x8_t v434 = vqrdmulhq_n_s16(v433, 17121);
+    int16x8_t v435 = vsubq_s16(v283, v286);
+    int16x8_t v436 = vqrdmulhq_n_s16(v290, 13573);
+    int16x8_t v437 = vsubq_s16(v436, v295);
+    int16x8_t v438_tmp = vqrdmulhq_n_s16(v437, 10045);
+    int16x8_t v438 = vaddq_s16(v438_tmp, v437);
+    int16x8_t v439 = vaddq_s16(v435, v438);
+    int16x8_t v440 = vqrdmulhq_n_s16(v439, 17121);
+    int16x8_t v441 = vaddq_s16(v434, v440);
+    int16x8_t v442 = vqrdmulhq_n_s16(v441, 16563);
+    int16x8_t v443 = vaddq_s16(v428, v442);
+    int16x8_t v444 = vaddq_s16(v414, v443);
+    int16x8_t v445 = vqrdmulhq_n_s16(v444, 16429);
+    int16x8_t v446 = vaddq_s16(v386, v445);
+    int16x8_t v447 = vsubq_s16(v374, v376);
+    int16x8_t v448 = vsubq_s16(v378, v381);
+    int16x8_t v449 = vqrdmulhq_n_s16(v448, 29490);
+    int16x8_t v450 = vaddq_s16(v447, v449);
+    int16x8_t v451 = vsubq_s16(v361, v364);
+    int16x8_t v452 = vqrdmulhq_n_s16(v366, 29490);
+    int16x8_t v453_tmp = vqrdmulhq_n_s16(v369, 5763);
+    int16x8_t v453 = vaddq_s16(v453_tmp, v369);
+    int16x8_t v454 = vsubq_s16(v452, v453);
+    int16x8_t v455 = vaddq_s16(v451, v454);
+    int16x8_t v456 = vqrdmulhq_n_s16(v455, 18578);
+    int16x8_t v457 = vaddq_s16(v450, v456);
+    int16x8_t v458 = vsubq_s16(v338, v341);
+    int16x8_t v459 = vqrdmulhq_n_s16(v332, 29490);
+    int16x8_t v460_tmp = vqrdmulhq_n_s16(v335, 5763);
+    int16x8_t v460 = vaddq_s16(v460_tmp, v335);
+    int16x8_t v461 = vsubq_s16(v459, v460);
+    int16x8_t v462 = vaddq_s16(v458, v461);
+    int16x8_t v463 = vqrdmulhq_n_s16(v352, 27803);
+    int16x8_t v464 = vqrdmulhq_n_s16(v354, 21845);
+    int16x8_t v465 = vsubq_s16(v463, v464);
+    int16x8_t v466 = vsubq_s16(v344, v347);
+    int16x8_t v467 = vqrdmulhq_n_s16(v466, 18578);
+    int16x8_t v468 = vaddq_s16(v465, v467);
+    int16x8_t v469 = vaddq_s16(v462, v468);
+    int16x8_t v470 = vqrdmulhq_n_s16(v469, 16890);
+    int16x8_t v471 = vaddq_s16(v457, v470);
+    int16x8_t v472 = vsubq_s16(v415, v418);
+    int16x8_t v473_tmp = vqrdmulhq_n_s16(v422, 16273);
+    int16x8_t v473 = vaddq_s16(v473_tmp, v422);
+    int16x8_t v474_tmp = vqrdmulhq_n_s16(v424, 5763);
+    int16x8_t v474 = vaddq_s16(v474_tmp, v424);
+    int16x8_t v475 = vsubq_s16(v473, v474);
+    int16x8_t v476 = vaddq_s16(v472, v475);
+    int16x8_t v477 = vqrdmulhq_n_s16(v476, 16890);
+    int16x8_t v478 = vqrdmulhq_n_s16(v435, 20261);
+    int16x8_t v479 = vqrdmulhq_n_s16(v437, 26472);
+    int16x8_t v480 = vsubq_s16(v478, v479);
+    int16x8_t v481 = vqrdmulhq_n_s16(v480, 30046);
+    int16x8_t v482 = vqrdmulhq_n_s16(v430, 30322);
+    int16x8_t v483 = vqrdmulhq_n_s16(v432, 30322);
+    int16x8_t v484 = vsubq_s16(v482, v483);
+    int16x8_t v485 = vqrdmulhq_n_s16(v484, 30046);
+    int16x8_t v486 = vaddq_s16(v481, v485);
+    int16x8_t v487 = vqrdmulhq_n_s16(v486, 16890);
+    int16x8_t v488 = vaddq_s16(v477, v487);
+    int16x8_t v489 = vsubq_s16(v387, v390);
+    int16x8_t v490 = vqrdmulhq_n_s16(v392, 29490);
+    int16x8_t v491_tmp = vqrdmulhq_n_s16(v395, 5763);
+    int16x8_t v491 = vaddq_s16(v491_tmp, v395);
+    int16x8_t v492 = vsubq_s16(v490, v491);
+    int16x8_t v493 = vaddq_s16(v489, v492);
+    int16x8_t v494 = vsubq_s16(v399, v402);
+    int16x8_t v495 = vqrdmulhq_n_s16(v494, 18578);
+    int16x8_t v496 = vqrdmulhq_n_s16(v407, 27803);
+    int16x8_t v497 = vqrdmulhq_n_s16(v409, 21845);
+    int16x8_t v498 = vsubq_s16(v496, v497);
+    int16x8_t v499 = vaddq_s16(v495, v498);
+    int16x8_t v500 = vaddq_s16(v493, v499);
+    int16x8_t v501 = vaddq_s16(v488, v500);
+    int16x8_t v502 = vqrdmulhq_n_s16(v501, 16508);
+    int16x8_t v503 = vaddq_s16(v471, v502);
+    int16x8_t v504 = vsubq_s16(v2, v8);
+    int16x8_t v505 = vsubq_s16(v15, v22);
+    int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 18446);
+    int16x8_t v506 = vmlaq_n_s16(v506_tmp, v505, 2);
+    int16x8_t v507 = vaddq_s16(v504, v506);
+    int16x8_t v508 = vsubq_s16(v31, v41);
+    int16x8_t v509 = vsubq_s16(v48, v56);
+    int16x8_t v510_tmp = vqrdmulhq_n_s16(v509, 18446);
+    int16x8_t v510 = vmlaq_n_s16(v510_tmp, v509, 2);
+    int16x8_t v511 = vaddq_s16(v508, v510);
+    int16x8_t v512 = vqrdmulhq_n_s16(v511, 21195);
+    int16x8_t v513 = vaddq_s16(v507, v512);
+    int16x8_t v514 = vsubq_s16(v67, v77);
+    int16x8_t v515 = vsubq_s16(v90, v99);
+    int16x8_t v516_tmp = vqrdmulhq_n_s16(v515, 18446);
+    int16x8_t v516 = vmlaq_n_s16(v516_tmp, v515, 2);
+    int16x8_t v517 = vaddq_s16(v514, v516);
+    int16x8_t v518 = vsubq_s16(v114, v126);
+    int16x8_t v519_tmp = vqrdmulhq_n_s16(v518, 18446);
+    int16x8_t v519 = vmlaq_n_s16(v519_tmp, v518, 2);
+    int16x8_t v520 = vsubq_s16(v132, v137);
+    int16x8_t v521 = vaddq_s16(v519, v520);
+    int16x8_t v522 = vqrdmulhq_n_s16(v521, 21195);
+    int16x8_t v523 = vaddq_s16(v517, v522);
+    int16x8_t v524 = vqrdmulhq_n_s16(v523, 17401);
+    int16x8_t v525 = vaddq_s16(v513, v524);
+    int16x8_t v526 = vsubq_s16(v172, v181);
+    int16x8_t v527_tmp = vqrdmulhq_n_s16(v526, 18446);
+    int16x8_t v527 = vmlaq_n_s16(v527_tmp, v526, 2);
+    int16x8_t v528 = vsubq_s16(v149, v159);
+    int16x8_t v529 = vaddq_s16(v527, v528);
+    int16x8_t v530 = vsubq_s16(v229, v234);
+    int16x8_t v531 = vsubq_s16(v208, v223);
+    int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 18446);
+    int16x8_t v532 = vmlaq_n_s16(v532_tmp, v531, 2);
+    int16x8_t v533 = vaddq_s16(v530, v532);
+    int16x8_t v534 = vqrdmulhq_n_s16(v533, 21195);
+    int16x8_t v535 = vaddq_s16(v529, v534);
+    int16x8_t v536 = vsubq_s16(v244, v253);
+    int16x8_t v537 = vsubq_s16(v266, v277);
+    int16x8_t v538_tmp = vqrdmulhq_n_s16(v537, 18446);
+    int16x8_t v538 = vmlaq_n_s16(v538_tmp, v537, 2);
+    int16x8_t v539 = vaddq_s16(v536, v538);
+    int16x8_t v540 = vqrdmulhq_n_s16(v539, 17401);
+    int16x8_t v541 = vqrdmulhq_n_s16(v287, 25826);
+    int16x8_t v542 = vqrdmulhq_n_s16(v298, 25826);
+    int16x8_t v543 = vsubq_s16(v541, v542);
+    int16x8_t v544 = vqrdmulhq_n_s16(v543, 14281);
+    int16x8_t v545_tmp = vqrdmulhq_n_s16(v309, 31509);
+    int16x8_t v545 = vaddq_s16(v545_tmp, v309);
+    int16x8_t v546 = vsubq_s16(v545, v322);
+    int16x8_t v547 = vqrdmulhq_n_s16(v546, 28847);
+    int16x8_t v548 = vaddq_s16(v544, v547);
+    int16x8_t v549 = vaddq_s16(v540, v548);
+    int16x8_t v550 = vaddq_s16(v535, v549);
+    int16x8_t v551 = vqrdmulhq_n_s16(v550, 16629);
+    int16x8_t v552 = vaddq_s16(v525, v551);
+    int16x8_t v553 = vsubq_s16(v504, v506);
+    int16x8_t v554 = vsubq_s16(v508, v510);
+    int16x8_t v555 = vqrdmulhq_n_s16(v554, 25826);
+    int16x8_t v556 = vaddq_s16(v553, v555);
+    int16x8_t v557 = vsubq_s16(v514, v516);
+    int16x8_t v558 = vsubq_s16(v520, v519);
+    int16x8_t v559 = vqrdmulhq_n_s16(v558, 25826);
+    int16x8_t v560 = vaddq_s16(v557, v559);
+    int16x8_t v561 = vqrdmulhq_n_s16(v560, 18124);
+    int16x8_t v562 = vaddq_s16(v556, v561);
+    int16x8_t v563 = vsubq_s16(v528, v527);
+    int16x8_t v564 = vsubq_s16(v530, v532);
+    int16x8_t v565 = vqrdmulhq_n_s16(v564, 25826);
+    int16x8_t v566 = vaddq_s16(v563, v565);
+    int16x8_t v567 = vsubq_s16(v536, v538);
+    int16x8_t v568 = vqrdmulhq_n_s16(v567, 18124);
+    int16x8_t v569_tmp = vqrdmulhq_n_s16(v546, 654);
+    int16x8_t v569 = vmlaq_n_s16(v569_tmp, v546, 2);
+    int16x8_t v570 = vsubq_s16(v543, v569);
+    int16x8_t v571 = vqrdmulhq_n_s16(v570, 18124);
+    int16x8_t v572 = vaddq_s16(v568, v571);
+    int16x8_t v573 = vaddq_s16(v566, v572);
+    int16x8_t v574 = vqrdmulhq_n_s16(v573, 16792);
+    int16x8_t v575 = vaddq_s16(v562, v574);
+    int16x8_t v576 = vsubq_s16(v458, v461);
+    int16x8_t v577_tmp = vqrdmulhq_n_s16(v465, 25030);
+    int16x8_t v577 = vaddq_s16(v577_tmp, v465);
+    int16x8_t v578 = vsubq_s16(v466, v577);
+    int16x8_t v579_tmp = vqrdmulhq_n_s16(v578, 1988);
+    int16x8_t v579 = vaddq_s16(v579_tmp, v578);
+    int16x8_t v580 = vaddq_s16(v576, v579);
+    int16x8_t v581 = vqrdmulhq_n_s16(v580, 19102);
+    int16x8_t v582 = vsubq_s16(v447, v449);
+    int16x8_t v583 = vsubq_s16(v451, v454);
+    int16x8_t v584_tmp = vqrdmulhq_n_s16(v583, 1988);
+    int16x8_t v584 = vaddq_s16(v584_tmp, v583);
+    int16x8_t v585 = vaddq_s16(v582, v584);
+    int16x8_t v586 = vaddq_s16(v581, v585);
+    int16x8_t v587 = vsubq_s16(v489, v492);
+    int16x8_t v588_tmp = vqrdmulhq_n_s16(v498, 25030);
+    int16x8_t v588 = vaddq_s16(v588_tmp, v498);
+    int16x8_t v589 = vsubq_s16(v494, v588);
+    int16x8_t v590_tmp = vqrdmulhq_n_s16(v589, 1988);
+    int16x8_t v590 = vaddq_s16(v590_tmp, v589);
+    int16x8_t v591 = vaddq_s16(v587, v590);
+    int16x8_t v592 = vsubq_s16(v472, v475);
+    int16x8_t v593 = vqrdmulhq_n_s16(v592, 19102);
+    int16x8_t v594 = vsubq_s16(v480, v484);
+    int16x8_t v595 = vaddq_s16(v593, v594);
+    int16x8_t v596 = vaddq_s16(v591, v595);
+    int16x8_t v597 = vqrdmulhq_n_s16(v596, 17000);
+    int16x8_t v598 = vaddq_s16(v586, v597);
+    int16x8_t v599 = vsubq_s16(v365, v371);
+    int16x8_t v600_tmp = vqrdmulhq_n_s16(v599, 23673);
+    int16x8_t v600 = vaddq_s16(v600_tmp, v599);
+    int16x8_t v601 = vsubq_s16(v377, v383);
+    int16x8_t v602 = vaddq_s16(v600, v601);
+    int16x8_t v603 = vsubq_s16(v348, v356);
+    int16x8_t v604_tmp = vqrdmulhq_n_s16(v603, 23673);
+    int16x8_t v604 = vaddq_s16(v604_tmp, v603);
+    int16x8_t v605 = vsubq_s16(v342, v337);
+    int16x8_t v606 = vaddq_s16(v604, v605);
+    int16x8_t v607 = vqrdmulhq_n_s16(v606, 20398);
+    int16x8_t v608 = vaddq_s16(v602, v607);
+    int16x8_t v609 = vsubq_s16(v391, v397);
+    int16x8_t v610 = vsubq_s16(v403, v411);
+    int16x8_t v611_tmp = vqrdmulhq_n_s16(v610, 23673);
+    int16x8_t v611 = vaddq_s16(v611_tmp, v610);
+    int16x8_t v612 = vaddq_s16(v609, v611);
+    int16x8_t v613 = vsubq_s16(v419, v426);
+    int16x8_t v614 = vqrdmulhq_n_s16(v613, 20398);
+    int16x8_t v615 = vsubq_s16(v439, v433);
+    int16x8_t v616_tmp = vqrdmulhq_n_s16(v615, 2367);
+    int16x8_t v616 = vaddq_s16(v616_tmp, v615);
+    int16x8_t v617 = vaddq_s16(v614, v616);
+    int16x8_t v618 = vaddq_s16(v612, v617);
+    int16x8_t v619 = vqrdmulhq_n_s16(v618, 17255);
+    int16x8_t v620 = vaddq_s16(v608, v619);
+    int16x8_t v621 = vsubq_s16(v160, v183);
+    int16x8_t v622 = vsubq_s16(v235, v225);
+    int16x8_t v623_tmp = vqrdmulhq_n_s16(v622, 3314);
+    int16x8_t v623 = vmlaq_n_s16(v623_tmp, v622, 5);
+    int16x8_t v624 = vaddq_s16(v621, v623);
+    int16x8_t v625 = vsubq_s16(v254, v279);
+    int16x8_t v626 = vsubq_s16(v299, v324);
+    int16x8_t v627_tmp = vqrdmulhq_n_s16(v626, 3314);
+    int16x8_t v627 = vmlaq_n_s16(v627_tmp, v626, 5);
+    int16x8_t v628 = vaddq_s16(v625, v627);
+    int16x8_t v629 = vqrdmulhq_n_s16(v628, 22112);
+    int16x8_t v630 = vaddq_s16(v624, v629);
+    int16x8_t v631 = vqrdmulhq_n_s16(v630, 17561);
+    int16x8_t v632 = vsubq_s16(v9, v24);
+    int16x8_t v633 = vsubq_s16(v42, v58);
+    int16x8_t v634_tmp = vqrdmulhq_n_s16(v633, 3314);
+    int16x8_t v634 = vmlaq_n_s16(v634_tmp, v633, 5);
+    int16x8_t v635 = vaddq_s16(v632, v634);
+    int16x8_t v636 = vsubq_s16(v78, v101);
+    int16x8_t v637 = vsubq_s16(v138, v128);
+    int16x8_t v638_tmp = vqrdmulhq_n_s16(v637, 3314);
+    int16x8_t v638 = vmlaq_n_s16(v638_tmp, v637, 5);
+    int16x8_t v639 = vaddq_s16(v636, v638);
+    int16x8_t v640 = vqrdmulhq_n_s16(v639, 22112);
+    int16x8_t v641 = vaddq_s16(v635, v640);
+    int16x8_t v642 = vaddq_s16(v631, v641);
+    int16x8_t v643 = vsubq_s16(v632, v634);
+    int16x8_t v644 = vsubq_s16(v636, v638);
+    int16x8_t v645 = vqrdmulhq_n_s16(v644, 24397);
+    int16x8_t v646 = vaddq_s16(v643, v645);
+    int16x8_t v647 = vsubq_s16(v621, v623);
+    int16x8_t v648 = vsubq_s16(v625, v627);
+    int16x8_t v649 = vqrdmulhq_n_s16(v648, 24397);
+    int16x8_t v650 = vaddq_s16(v647, v649);
+    int16x8_t v651 = vqrdmulhq_n_s16(v650, 17921);
+    int16x8_t v652 = vaddq_s16(v646, v651);
+    int16x8_t v653 = vsubq_s16(v601, v600);
+    int16x8_t v654 = vsubq_s16(v605, v604);
+    int16x8_t v655 = vqrdmulhq_n_s16(v654, 27504);
+    int16x8_t v656 = vaddq_s16(v653, v655);
+    int16x8_t v657 = vsubq_s16(v609, v611);
+    int16x8_t v658 = vqrdmulhq_n_s16(v613, 27504);
+    int16x8_t v659_tmp = vqrdmulhq_n_s16(v615, 14606);
+    int16x8_t v659 = vaddq_s16(v659_tmp, v615);
+    int16x8_t v660 = vsubq_s16(v658, v659);
+    int16x8_t v661 = vaddq_s16(v657, v660);
+    int16x8_t v662 = vqrdmulhq_n_s16(v661, 18343);
+    int16x8_t v663 = vaddq_s16(v656, v662);
+    int16x8_t v664 = vsubq_s16(v582, v584);
+    int16x8_t v665 = vsubq_s16(v576, v579);
+    int16x8_t v666 = vqrdmulhq_n_s16(v665, 31869);
+    int16x8_t v667 = vaddq_s16(v664, v666);
+    int16x8_t v668 = vsubq_s16(v587, v590);
+    int16x8_t v669_tmp = vqrdmulhq_n_s16(v594, 23444);
+    int16x8_t v669 = vaddq_s16(v669_tmp, v594);
+    int16x8_t v670 = vsubq_s16(v592, v669);
+    int16x8_t v671 = vqrdmulhq_n_s16(v670, 31869);
+    int16x8_t v672 = vaddq_s16(v668, v671);
+    int16x8_t v673 = vqrdmulhq_n_s16(v672, 18830);
+    int16x8_t v674 = vaddq_s16(v667, v673);
+    int16x8_t v675 = vsubq_s16(v553, v555);
+    int16x8_t v676 = vsubq_s16(v557, v559);
+    int16x8_t v677_tmp = vqrdmulhq_n_s16(v676, 5552);
+    int16x8_t v677 = vaddq_s16(v677_tmp, v676);
+    int16x8_t v678 = vaddq_s16(v675, v677);
+    int16x8_t v679 = vsubq_s16(v563, v565);
+    int16x8_t v680 = vsubq_s16(v567, v570);
+    int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 5552);
+    int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+    int16x8_t v682 = vaddq_s16(v679, v681);
+    int16x8_t v683 = vqrdmulhq_n_s16(v682, 19393);
+    int16x8_t v684 = vaddq_s16(v678, v683);
+    int16x8_t v685 = vsubq_s16(v507, v512);
+    int16x8_t v686 = vsubq_s16(v517, v522);
+    int16x8_t v687_tmp = vqrdmulhq_n_s16(v686, 15865);
+    int16x8_t v687 = vaddq_s16(v687_tmp, v686);
+    int16x8_t v688 = vaddq_s16(v685, v687);
+    int16x8_t v689 = vsubq_s16(v529, v534);
+    int16x8_t v690_tmp = vqrdmulhq_n_s16(v548, 28937);
+    int16x8_t v690 = vaddq_s16(v690_tmp, v548);
+    int16x8_t v691 = vsubq_s16(v539, v690);
+    int16x8_t v692_tmp = vqrdmulhq_n_s16(v691, 15865);
+    int16x8_t v692 = vaddq_s16(v692_tmp, v691);
+    int16x8_t v693 = vaddq_s16(v689, v692);
+    int16x8_t v694 = vqrdmulhq_n_s16(v693, 20040);
+    int16x8_t v695 = vaddq_s16(v688, v694);
+    int16x8_t v696 = vsubq_s16(v476, v486);
+    int16x8_t v697_tmp = vqrdmulhq_n_s16(v696, 1893);
+    int16x8_t v697 = vmlaq_n_s16(v697_tmp, v696, 2);
+    int16x8_t v698 = vsubq_s16(v493, v499);
+    int16x8_t v699 = vaddq_s16(v697, v698);
+    int16x8_t v700 = vqrdmulhq_n_s16(v699, 20783);
+    int16x8_t v701 = vsubq_s16(v450, v456);
+    int16x8_t v702 = vsubq_s16(v462, v468);
+    int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 1893);
+    int16x8_t v703 = vmlaq_n_s16(v703_tmp, v702, 2);
+    int16x8_t v704 = vaddq_s16(v701, v703);
+    int16x8_t v705 = vaddq_s16(v700, v704);
+    int16x8_t v706 = vsubq_s16(v384, v373);
+    int16x8_t v707 = vsubq_s16(v343, v358);
+    int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 13357);
+    int16x8_t v708 = vmlaq_n_s16(v708_tmp, v707, 3);
+    int16x8_t v709 = vaddq_s16(v706, v708);
+    int16x8_t v710 = vsubq_s16(v398, v413);
+    int16x8_t v711 = vsubq_s16(v427, v441);
+    int16x8_t v712_tmp = vqrdmulhq_n_s16(v711, 13357);
+    int16x8_t v712 = vmlaq_n_s16(v712_tmp, v711, 3);
+    int16x8_t v713 = vaddq_s16(v710, v712);
+    int16x8_t v714 = vqrdmulhq_n_s16(v713, 21637);
+    int16x8_t v715 = vaddq_s16(v709, v714);
+    int16x8_t v716 = vsubq_s16(v25, v60);
+    int16x8_t v717 = vsubq_s16(v102, v140);
+    int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 6226);
+    int16x8_t v718 = vmlaq_n_s16(v718_tmp, v717, 10);
+    int16x8_t v719 = vaddq_s16(v716, v718);
+    int16x8_t v720 = vsubq_s16(v280, v326);
+    int16x8_t v721_tmp = vqrdmulhq_n_s16(v720, 6226);
+    int16x8_t v721 = vmlaq_n_s16(v721_tmp, v720, 10);
+    int16x8_t v722 = vsubq_s16(v184, v237);
+    int16x8_t v723 = vaddq_s16(v721, v722);
+    int16x8_t v724 = vqrdmulhq_n_s16(v723, 22622);
+    int16x8_t v725 = vaddq_s16(v719, v724);
+    int16x8_t v726 = vsubq_s16(v716, v718);
+    int16x8_t v727 = vsubq_s16(v722, v721);
+    int16x8_t v728 = vqrdmulhq_n_s16(v727, 23761);
+    int16x8_t v729 = vaddq_s16(v726, v728);
+    int16x8_t v730 = vsubq_s16(v706, v708);
+    int16x8_t v731 = vsubq_s16(v710, v712);
+    int16x8_t v732 = vqrdmulhq_n_s16(v731, 25084);
+    int16x8_t v733 = vaddq_s16(v730, v732);
+    int16x8_t v734 = vsubq_s16(v701, v703);
+    int16x8_t v735 = vsubq_s16(v698, v697);
+    int16x8_t v736 = vqrdmulhq_n_s16(v735, 26631);
+    int16x8_t v737 = vaddq_s16(v734, v736);
+    int16x8_t v738 = vsubq_s16(v685, v687);
+    int16x8_t v739 = vsubq_s16(v689, v692);
+    int16x8_t v740 = vqrdmulhq_n_s16(v739, 28454);
+    int16x8_t v741 = vaddq_s16(v738, v740);
+    int16x8_t v742 = vsubq_s16(v675, v677);
+    int16x8_t v743 = vsubq_s16(v679, v681);
+    int16x8_t v744 = vqrdmulhq_n_s16(v743, 30624);
+    int16x8_t v745 = vaddq_s16(v742, v744);
+    int16x8_t v746 = vsubq_s16(v664, v666);
+    int16x8_t v747 = vsubq_s16(v668, v671);
+    int16x8_t v748_tmp = vqrdmulhq_n_s16(v747, 472);
+    int16x8_t v748 = vaddq_s16(v748_tmp, v747);
+    int16x8_t v749 = vaddq_s16(v746, v748);
+    int16x8_t v750 = vsubq_s16(v653, v655);
+    int16x8_t v751 = vsubq_s16(v657, v660);
+    int16x8_t v752_tmp = vqrdmulhq_n_s16(v751, 3672);
+    int16x8_t v752 = vaddq_s16(v752_tmp, v751);
+    int16x8_t v753 = vaddq_s16(v750, v752);
+    int16x8_t v754 = vsubq_s16(v643, v645);
+    int16x8_t v755 = vsubq_s16(v647, v649);
+    int16x8_t v756_tmp = vqrdmulhq_n_s16(v755, 7662);
+    int16x8_t v756 = vaddq_s16(v756_tmp, v755);
+    int16x8_t v757 = vaddq_s16(v754, v756);
+    int16x8_t v758 = vsubq_s16(v635, v640);
+    int16x8_t v759 = vsubq_s16(v624, v629);
+    int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 12756);
+    int16x8_t v760 = vaddq_s16(v760_tmp, v759);
+    int16x8_t v761 = vaddq_s16(v758, v760);
+    int16x8_t v762 = vsubq_s16(v602, v607);
+    int16x8_t v763 = vsubq_s16(v612, v617);
+    int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 19463);
+    int16x8_t v764 = vaddq_s16(v764_tmp, v763);
+    int16x8_t v765 = vaddq_s16(v762, v764);
+    int16x8_t v766 = vsubq_s16(v585, v581);
+    int16x8_t v767 = vsubq_s16(v591, v595);
+    int16x8_t v768_tmp = vqrdmulhq_n_s16(v767, 28661);
+    int16x8_t v768 = vaddq_s16(v768_tmp, v767);
+    int16x8_t v769 = vaddq_s16(v766, v768);
+    int16x8_t v770 = vsubq_s16(v556, v561);
+    int16x8_t v771 = vsubq_s16(v566, v572);
+    int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 9242);
+    int16x8_t v772 = vmlaq_n_s16(v772_tmp, v771, 2);
+    int16x8_t v773 = vaddq_s16(v770, v772);
+    int16x8_t v774 = vsubq_s16(v513, v524);
+    int16x8_t v775 = vsubq_s16(v535, v549);
+    int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 30298);
+    int16x8_t v776 = vmlaq_n_s16(v776_tmp, v775, 2);
+    int16x8_t v777 = vaddq_s16(v774, v776);
+    int16x8_t v778 = vsubq_s16(v457, v470);
+    int16x8_t v779 = vsubq_s16(v500, v488);
+    int16x8_t v780_tmp = vqrdmulhq_n_s16(v779, 2773);
+    int16x8_t v780 = vmlaq_n_s16(v780_tmp, v779, 4);
+    int16x8_t v781 = vaddq_s16(v778, v780);
+    int16x8_t v782 = vsubq_s16(v385, v360);
+    int16x8_t v783 = vsubq_s16(v414, v443);
+    int16x8_t v784_tmp = vqrdmulhq_n_s16(v783, 26108);
+    int16x8_t v784 = vmlaq_n_s16(v784_tmp, v783, 6);
+    int16x8_t v785 = vaddq_s16(v782, v784);
+    int16x8_t v786 = vsubq_s16(v61, v142);
+    int16x8_t v787 = vsubq_s16(v238, v328);
+    int16x8_t v788_tmp = vqrdmulhq_n_s16(v787, 12251);
+    int16x8_t v788 = vmlaq_n_s16(v788_tmp, v787, 20);
+    int16x8_t v789 = vaddq_s16(v786, v788);
+    int16x8_t v790 = vsubq_s16(v786, v788);
+    int16x8_t v791 = vsubq_s16(v782, v784);
+    int16x8_t v792 = vsubq_s16(v778, v780);
+    int16x8_t v793 = vsubq_s16(v774, v776);
+    int16x8_t v794 = vsubq_s16(v770, v772);
+    int16x8_t v795 = vsubq_s16(v766, v768);
+    int16x8_t v796 = vsubq_s16(v762, v764);
+    int16x8_t v797 = vsubq_s16(v758, v760);
+    int16x8_t v798 = vsubq_s16(v754, v756);
+    int16x8_t v799 = vsubq_s16(v750, v752);
+    int16x8_t v800 = vsubq_s16(v746, v748);
+    int16x8_t v801 = vsubq_s16(v742, v744);
+    int16x8_t v802 = vsubq_s16(v738, v740);
+    int16x8_t v803 = vsubq_s16(v734, v736);
+    int16x8_t v804 = vsubq_s16(v730, v732);
+    int16x8_t v805 = vsubq_s16(v726, v728);
+    int16x8_t v806 = vsubq_s16(v719, v724);
+    int16x8_t v807 = vsubq_s16(v709, v714);
+    int16x8_t v808 = vsubq_s16(v704, v700);
+    int16x8_t v809 = vsubq_s16(v688, v694);
+    int16x8_t v810 = vsubq_s16(v678, v683);
+    int16x8_t v811 = vsubq_s16(v667, v673);
+    int16x8_t v812 = vsubq_s16(v656, v662);
+    int16x8_t v813 = vsubq_s16(v646, v651);
+    int16x8_t v814 = vsubq_s16(v641, v631);
+    int16x8_t v815 = vsubq_s16(v608, v619);
+    int16x8_t v816 = vsubq_s16(v586, v597);
+    int16x8_t v817 = vsubq_s16(v562, v574);
+    int16x8_t v818 = vsubq_s16(v525, v551);
+    int16x8_t v819 = vsubq_s16(v471, v502);
+    int16x8_t v820 = vsubq_s16(v386, v445);
+    int16x8_t v821 = vsubq_s16(v143, v330);
+    vst1q_s16(out + out_stride * 0 + i, v331);
+    vst1q_s16(out + out_stride * 1 + i, v446);
+    vst1q_s16(out + out_stride * 2 + i, v503);
+    vst1q_s16(out + out_stride * 3 + i, v552);
+    vst1q_s16(out + out_stride * 4 + i, v575);
+    vst1q_s16(out + out_stride * 5 + i, v598);
+    vst1q_s16(out + out_stride * 6 + i, v620);
+    vst1q_s16(out + out_stride * 7 + i, v642);
+    vst1q_s16(out + out_stride * 8 + i, v652);
+    vst1q_s16(out + out_stride * 9 + i, v663);
+    vst1q_s16(out + out_stride * 10 + i, v674);
+    vst1q_s16(out + out_stride * 11 + i, v684);
+    vst1q_s16(out + out_stride * 12 + i, v695);
+    vst1q_s16(out + out_stride * 13 + i, v705);
+    vst1q_s16(out + out_stride * 14 + i, v715);
+    vst1q_s16(out + out_stride * 15 + i, v725);
+    vst1q_s16(out + out_stride * 16 + i, v729);
+    vst1q_s16(out + out_stride * 17 + i, v733);
+    vst1q_s16(out + out_stride * 18 + i, v737);
+    vst1q_s16(out + out_stride * 19 + i, v741);
+    vst1q_s16(out + out_stride * 20 + i, v745);
+    vst1q_s16(out + out_stride * 21 + i, v749);
+    vst1q_s16(out + out_stride * 22 + i, v753);
+    vst1q_s16(out + out_stride * 23 + i, v757);
+    vst1q_s16(out + out_stride * 24 + i, v761);
+    vst1q_s16(out + out_stride * 25 + i, v765);
+    vst1q_s16(out + out_stride * 26 + i, v769);
+    vst1q_s16(out + out_stride * 27 + i, v773);
+    vst1q_s16(out + out_stride * 28 + i, v777);
+    vst1q_s16(out + out_stride * 29 + i, v781);
+    vst1q_s16(out + out_stride * 30 + i, v785);
+    vst1q_s16(out + out_stride * 31 + i, v789);
+    vst1q_s16(out + out_stride * 32 + i, v790);
+    vst1q_s16(out + out_stride * 33 + i, v791);
+    vst1q_s16(out + out_stride * 34 + i, v792);
+    vst1q_s16(out + out_stride * 35 + i, v793);
+    vst1q_s16(out + out_stride * 36 + i, v794);
+    vst1q_s16(out + out_stride * 37 + i, v795);
+    vst1q_s16(out + out_stride * 38 + i, v796);
+    vst1q_s16(out + out_stride * 39 + i, v797);
+    vst1q_s16(out + out_stride * 40 + i, v798);
+    vst1q_s16(out + out_stride * 41 + i, v799);
+    vst1q_s16(out + out_stride * 42 + i, v800);
+    vst1q_s16(out + out_stride * 43 + i, v801);
+    vst1q_s16(out + out_stride * 44 + i, v802);
+    vst1q_s16(out + out_stride * 45 + i, v803);
+    vst1q_s16(out + out_stride * 46 + i, v804);
+    vst1q_s16(out + out_stride * 47 + i, v805);
+    vst1q_s16(out + out_stride * 48 + i, v806);
+    vst1q_s16(out + out_stride * 49 + i, v807);
+    vst1q_s16(out + out_stride * 50 + i, v808);
+    vst1q_s16(out + out_stride * 51 + i, v809);
+    vst1q_s16(out + out_stride * 52 + i, v810);
+    vst1q_s16(out + out_stride * 53 + i, v811);
+    vst1q_s16(out + out_stride * 54 + i, v812);
+    vst1q_s16(out + out_stride * 55 + i, v813);
+    vst1q_s16(out + out_stride * 56 + i, v814);
+    vst1q_s16(out + out_stride * 57 + i, v815);
+    vst1q_s16(out + out_stride * 58 + i, v816);
+    vst1q_s16(out + out_stride * 59 + i, v817);
+    vst1q_s16(out + out_stride * 60 + i, v818);
+    vst1q_s16(out + out_stride * 61 + i, v819);
+    vst1q_s16(out + out_stride * 62 + i, v820);
+    vst1q_s16(out + out_stride * 63 + i, v821);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h
new file mode 100644
index 0000000000..946ace4a0c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h
@@ -0,0 +1,80 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<8>) { return 1; }  // NOTE(review): presumably consumed by fast_dct-inl.h; confirm meaning there
+
+void FastIDCT(FastDCTTag<8>, const int16_t* in, size_t in_stride, int16_t* out,
+              size_t out_stride, size_t count) {  // 8 rows of in -> 8 rows of out
+  JXL_ASSERT(count % 8 == 0);  // each loop step handles 8 int16 lanes
+  for (size_t i = 0; i < count; i += 8) {  // i: element offset inside each row
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);  // ~= v3 * 13573 / 32768
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);  // ~= v3 * (1 + 13573 / 32768)
+    int16x8_t v5 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v16, v19);
+    int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734);
+    int16x8_t v22 = vaddq_s16(v17, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vsubq_s16(v0, v1);
+    int16x8_t v27 = vsubq_s16(v4, v6);
+    int16x8_t v28_tmp = vqrdmulhq_n_s16(v27, 10045);
+    int16x8_t v28 = vaddq_s16(v28_tmp, v27);
+    int16x8_t v29 = vaddq_s16(v26, v28);
+    int16x8_t v30 = vsubq_s16(v11, v14);
+    int16x8_t v31 = vqrdmulhq_n_s16(v16, 17734);
+    int16x8_t v32_tmp = vqrdmulhq_n_s16(v19, 10045);
+    int16x8_t v32 = vaddq_s16(v32_tmp, v19);
+    int16x8_t v33 = vsubq_s16(v31, v32);
+    int16x8_t v34 = vaddq_s16(v30, v33);
+    int16x8_t v35 = vqrdmulhq_n_s16(v34, 19705);
+    int16x8_t v36 = vaddq_s16(v29, v35);
+    int16x8_t v37 = vsubq_s16(v26, v28);
+    int16x8_t v38 = vsubq_s16(v30, v33);
+    int16x8_t v39 = vqrdmulhq_n_s16(v38, 29490);
+    int16x8_t v40 = vaddq_s16(v37, v39);
+    int16x8_t v41 = vsubq_s16(v2, v8);
+    int16x8_t v42 = vsubq_s16(v15, v22);
+    int16x8_t v43_tmp = vqrdmulhq_n_s16(v42, 18446);
+    int16x8_t v43 = vmlaq_n_s16(v43_tmp, v42, 2);  // v43_tmp + 2 * v42
+    int16x8_t v44 = vaddq_s16(v41, v43);
+    int16x8_t v45 = vsubq_s16(v41, v43);
+    int16x8_t v46 = vsubq_s16(v37, v39);
+    int16x8_t v47 = vsubq_s16(v29, v35);
+    int16x8_t v48 = vsubq_s16(v9, v24);
+    vst1q_s16(out + out_stride * 0 + i, v25);  // results go to output rows 0..7
+    vst1q_s16(out + out_stride * 1 + i, v36);
+    vst1q_s16(out + out_stride * 2 + i, v40);
+    vst1q_s16(out + out_stride * 3 + i, v44);
+    vst1q_s16(out + out_stride * 4 + i, v45);
+    vst1q_s16(out + out_stride * 5 + i, v46);
+    vst1q_s16(out + out_stride * 6 + i, v47);
+    vst1q_s16(out + out_stride * 7 + i, v48);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc b/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc
new file mode 100644
index 0000000000..5bb1a79cc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <numeric>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/fast_dct_test.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/fast_dct-inl.h"
+#include "lib/jxl/fast_dct.h"
+#include "lib/jxl/transpose-inl.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFastTranspose() {
+#if HWY_TARGET == HWY_NEON  // body is compiled for the NEON target only
+  auto src_storage = hwy::AllocateAligned<int16_t>(N * M);
+  auto dst_storage = hwy::AllocateAligned<int16_t>(N * M);
+  int16_t* array = src_storage.get();
+  int16_t* transposed = dst_storage.get();
+  std::iota(array, array + N * M, 0);  // fill with 0, 1, 2, ...
+  for (size_t reps = 100000000 / (N * M); reps != 0; --reps) {
+    FastTransposeBlock(array, M, N, M, transposed, N);  // ~1e8 elements total
+  }
+  for (size_t i = 0; i != M; ++i) {
+    for (size_t j = 0; j != N; ++j) {
+      EXPECT_EQ(array[j * M + i], transposed[i * N + j]);
+    }
+  }
+#endif  // HWY_TARGET == HWY_NEON
+}
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFloatTranspose() {
+  auto src_storage = hwy::AllocateAligned<float>(N * M);
+  auto dst_storage = hwy::AllocateAligned<float>(N * M);
+  float* array = src_storage.get();
+  float* transposed = dst_storage.get();
+  std::iota(array, array + N * M, 0);  // fill with 0, 1, 2, ...
+  for (size_t reps = 100000000 / (N * M); reps != 0; --reps) {
+    Transpose<N, M>::Run(DCTFrom(array, M), DCTTo(transposed, N));
+  }
+  for (size_t i = 0; i != M; ++i) {
+    for (size_t j = 0; j != N; ++j) {
+      EXPECT_EQ(array[j * M + i], transposed[i * N + j]);
+    }
+  }
+}
+
+// TODO(sboukortt): re-enable the FloatIDCT tests once we find out why they fail
+// in ASAN mode in the CI runners and seemingly not locally.
+
+HWY_NOINLINE void TestFastTranspose8x8() { TestFastTranspose<8, 8>(); }
+HWY_NOINLINE void TestFloatTranspose8x8() { TestFloatTranspose<8, 8>(); }
+HWY_NOINLINE void TestFastIDCT8x8() { TestFastIDCT<8, 8>(); }
+HWY_NOINLINE void TestFloatIDCT8x8() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<8, 8>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose8x16() { TestFastTranspose<8, 16>(); }
+HWY_NOINLINE void TestFloatTranspose8x16() { TestFloatTranspose<8, 16>(); }
+HWY_NOINLINE void TestFastIDCT8x16() { TestFastIDCT<8, 16>(); }
+HWY_NOINLINE void TestFloatIDCT8x16() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<8, 16>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose8x32() { TestFastTranspose<8, 32>(); }
+HWY_NOINLINE void TestFloatTranspose8x32() { TestFloatTranspose<8, 32>(); }
+HWY_NOINLINE void TestFastIDCT8x32() { TestFastIDCT<8, 32>(); }
+HWY_NOINLINE void TestFloatIDCT8x32() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<8, 32>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose16x8() { TestFastTranspose<16, 8>(); }
+HWY_NOINLINE void TestFloatTranspose16x8() { TestFloatTranspose<16, 8>(); }
+HWY_NOINLINE void TestFastIDCT16x8() { TestFastIDCT<16, 8>(); }
+HWY_NOINLINE void TestFloatIDCT16x8() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<16, 8>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose16x16() { TestFastTranspose<16, 16>(); }
+HWY_NOINLINE void TestFloatTranspose16x16() { TestFloatTranspose<16, 16>(); }
+HWY_NOINLINE void TestFastIDCT16x16() { TestFastIDCT<16, 16>(); }
+HWY_NOINLINE void TestFloatIDCT16x16() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<16, 16>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose16x32() { TestFastTranspose<16, 32>(); }
+HWY_NOINLINE void TestFloatTranspose16x32() { TestFloatTranspose<16, 32>(); }
+HWY_NOINLINE void TestFastIDCT16x32() { TestFastIDCT<16, 32>(); }
+HWY_NOINLINE void TestFloatIDCT16x32() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<16, 32>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x8() { TestFastTranspose<32, 8>(); }
+HWY_NOINLINE void TestFloatTranspose32x8() { TestFloatTranspose<32, 8>(); }
+HWY_NOINLINE void TestFastIDCT32x8() { TestFastIDCT<32, 8>(); }
+HWY_NOINLINE void TestFloatIDCT32x8() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<32, 8>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x16() { TestFastTranspose<32, 16>(); }
+HWY_NOINLINE void TestFloatTranspose32x16() { TestFloatTranspose<32, 16>(); }
+HWY_NOINLINE void TestFastIDCT32x16() { TestFastIDCT<32, 16>(); }
+HWY_NOINLINE void TestFloatIDCT32x16() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<32, 16>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x32() { TestFastTranspose<32, 32>(); }
+HWY_NOINLINE void TestFloatTranspose32x32() { TestFloatTranspose<32, 32>(); }
+HWY_NOINLINE void TestFastIDCT32x32() { TestFastIDCT<32, 32>(); }
+HWY_NOINLINE void TestFloatIDCT32x32() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<32, 32>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x64() { TestFastTranspose<32, 64>(); }
+HWY_NOINLINE void TestFloatTranspose32x64() { TestFloatTranspose<32, 64>(); }
+HWY_NOINLINE void TestFastIDCT32x64() { TestFastIDCT<32, 64>(); }
+HWY_NOINLINE void TestFloatIDCT32x64() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<32, 64>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose64x32() { TestFastTranspose<64, 32>(); }
+HWY_NOINLINE void TestFloatTranspose64x32() { TestFloatTranspose<64, 32>(); }
+HWY_NOINLINE void TestFastIDCT64x32() { TestFastIDCT<64, 32>(); }
+HWY_NOINLINE void TestFloatIDCT64x32() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<64, 32>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose64x64() { TestFastTranspose<64, 64>(); }
+HWY_NOINLINE void TestFloatTranspose64x64() { TestFloatTranspose<64, 64>(); }
+HWY_NOINLINE void TestFastIDCT64x64() { TestFastIDCT<64, 64>(); }
+HWY_NOINLINE void TestFloatIDCT64x64() {
+#if HWY_TARGET != HWY_SCALAR || \
+    !(defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  TestFloatIDCT<64, 64>();
+#else  // scalar target built with ASAN or MSAN
+  GTEST_SKIP();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose64x128() { TestFastTranspose<64, 128>(); }
+HWY_NOINLINE void TestFloatTranspose64x128() { TestFloatTranspose<64, 128>(); }
+/* Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_NOINLINE void TestFastIDCT64x128() { TestFastIDCT<64, 128>(); }
+HWY_NOINLINE void TestFloatIDCT64x128() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<64, 128>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose128x64() { TestFastTranspose<128, 64>(); }
+HWY_NOINLINE void TestFloatTranspose128x64() { TestFloatTranspose<128, 64>(); }
+/* Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_NOINLINE void TestFastIDCT128x64() { TestFastIDCT<128, 64>(); }
+HWY_NOINLINE void TestFloatIDCT128x64() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<128, 64>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose128x128() { TestFastTranspose<128, 128>(); }
+HWY_NOINLINE void TestFloatTranspose128x128() {
+  TestFloatTranspose<128, 128>();
+}
+/* Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_NOINLINE void TestFastIDCT128x128() { TestFastIDCT<128, 128>(); }
+HWY_NOINLINE void TestFloatIDCT128x128() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<128, 128>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose128x256() { TestFastTranspose<128, 256>(); }
+HWY_NOINLINE void TestFloatTranspose128x256() {
+  TestFloatTranspose<128, 256>();
+}
+/* Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_NOINLINE void TestFastIDCT128x256() { TestFastIDCT<128, 256>(); }
+HWY_NOINLINE void TestFloatIDCT128x256() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<128, 256>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose256x128() { TestFastTranspose<256, 128>(); }
+HWY_NOINLINE void TestFloatTranspose256x128() {
+  TestFloatTranspose<256, 128>();
+}
+/* Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_NOINLINE void TestFastIDCT256x128() { TestFastIDCT<256, 128>(); }
+HWY_NOINLINE void TestFloatIDCT256x128() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<256, 128>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose256x256() { TestFastTranspose<256, 256>(); }
+HWY_NOINLINE void TestFloatTranspose256x256() {
+  TestFloatTranspose<256, 256>();
+}
+/* Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_NOINLINE void TestFastIDCT256x256() { TestFastIDCT<256, 256>(); }
+HWY_NOINLINE void TestFloatIDCT256x256() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<256, 256>();
+#endif
+}
+*/
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class FastDCTTargetTest : public hwy::TestWithParamTarget {};  // fixture for the tests below
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastDCTTargetTest);
+
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose256x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose256x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x64);
+/*
+ * Disabled: DCT-128 and above have very large errors just by rounding inputs.
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT256x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT256x256);
+*/
+
+TEST(FastDCTTest, TestWrapperFloat) { BenchmarkFloatIDCT32x32(); }
+TEST(FastDCTTest, TestWrapperFast) { BenchmarkFastIDCT32x32(); }
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/fast_math-inl.h b/third_party/jpeg-xl/lib/jxl/fast_math-inl.h
new file mode 100644
index 0000000000..5c48034290
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_math-inl.h
@@ -0,0 +1,236 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD math ops (log2, encoder only, cos, erf for splines)
+
+#if defined(LIB_JXL_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_FAST_MATH_INL_H_
+#undef LIB_JXL_FAST_MATH_INL_H_
+#else
+#define LIB_JXL_FAST_MATH_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/common.h"
+#include "lib/jxl/rational_polynomial-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Le;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Xor;
+
+// SIMD approximation of std::log2 for positive, non-NaN inputs (results for
+// negative or NaN lanes are undefined). L1 error ~3.9E-6.
+template <class DF, class V>
+V FastLog2f(const DF df, V x) {
+  // Coefficients of a (2,2) rational polynomial fitted to log1p(x) / log(2),
+  // each replicated four times via HWY_REP4.
+  HWY_ALIGN const float numer[4 * (2 + 1)] = {
+      HWY_REP4(-1.8503833400518310E-06f), HWY_REP4(1.4287160470083755E+00f),
+      HWY_REP4(7.4245873327820566E-01f)};
+  HWY_ALIGN const float denom[4 * (2 + 1)] = {
+      HWY_REP4(9.9032814277590719E-01f), HWY_REP4(1.0096718572241148E+00f),
+      HWY_REP4(1.7409343003366853E-01f)};
+  const Rebind<int32_t, DF> di;
+  const auto bits = BitCast(di, x);
+  // Rebase on the bit pattern of 2/3 so that mantissa - 1 is in [-1/3, 1/3].
+  const auto rebased = Sub(bits, Set(di, 0x3f2aaaab));
+  const auto exponent = ShiftRight<23>(rebased);  // integer part of log2
+  // Subtract the exponent from the raw bits to leave the reduced mantissa.
+  const auto reduced = BitCast(df, Sub(bits, ShiftLeft<23>(exponent)));
+  const auto frac = Sub(reduced, Set(df, 1.0f));
+  // log2(x) = exponent + log2(1 + frac).
+  return Add(EvalRationalPolynomial(df, frac, numer, denom),
+             ConvertTo(df, exponent));
+}
+
+// Computes 2^x as 2^floor(x) * P(frac) / Q(frac); max relative error ~3e-7
+template <class DF, class V>
+V FastPow2f(const DF df, V x) {
+  const Rebind<int32_t, DF> di;
+  const auto whole = Floor(x);
+  const auto frac = Sub(x, whole);
+  const auto biased = Add(ConvertTo(di, whole), Set(di, 127));
+  const auto pow2_whole = BitCast(df, ShiftLeft<23>(biased));  // 2^floor(x)
+  const auto n1 = Add(frac, Set(df, 1.01749063e+01));
+  const auto n2 = MulAdd(n1, frac, Set(df, 4.88687798e+01));
+  const auto n3 = MulAdd(n2, frac, Set(df, 9.85506591e+01));
+  const auto d1 =
+      MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
+  const auto d2 = MulAdd(d1, frac, Set(df, -1.94414990e+01));
+  const auto d3 = MulAdd(d2, frac, Set(df, 9.85506633e+01));
+  return Div(Mul(n3, pow2_whole), d3);
+}
+
+// Computes base^exponent as 2^(log2(base) * exponent); max rel. error ~3e-5
+template <class DF, class V>
+V FastPowf(const DF df, V base, V exponent) {
+  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
+}
+
+// Approximates std::cos with an L1 error of 7e-5.
+// Strategy: fold x into [0, pi/2], evaluate a polynomial for cos(x / 4), then
+// apply the double-angle identity twice and restore the sign.
+template <class DF, class V>
+V FastCosf(const DF df, V x) {
+  // Fold the argument into one period [0, 2pi).
+  const auto two_pi = Set(df, kPi * 2.0f);
+  const auto inv_two_pi = Set(df, 0.5f / kPi);
+  const auto periods = Floor(Mul(x, inv_two_pi));
+  const auto in_period = Sub(x, Mul(periods, two_pi));
+  // cos(2pi - t) = cos(t): mirror the upper half of the period onto [0, pi].
+  const auto in_half = Min(in_period, Sub(two_pi, in_period));
+  // cos(pi - t) = -cos(t): mirror onto [0, pi/2] and remember the sign flip.
+  const auto negate = Ge(in_half, Set(df, kPi / 2.0f));
+  const auto folded = IfThenElse(negate, Sub(Set(df, kPi), in_half), in_half);
+  // Taylor-like polynomial for cos(folded / 4), pre-scaled by 2**0.75 so that
+  // the two angle-duplication steps below need no extra multiplications.
+  const auto quarter = Mul(folded, Set(df, 0.25f));
+  const auto q2 = Mul(quarter, quarter);
+  const auto q4 = Mul(q2, q2);
+  const auto low_terms = MulAdd(q2, Set(df, -0.84087373), Set(df, 1.68179268));
+  const auto scaled_cos = MulAdd(q4, Set(df, 0.06960438), low_terms);
+  // Two applications of cos(2t) = 2cos(t)^2 - 1, absorbing the pre-scaling.
+  const auto doubled = MulAdd(scaled_cos, scaled_cos, Set(df, -1.414213562));
+  const auto result = MulAdd(doubled, doubled, Set(df, -1));
+  // Shift the mask vector left by 31 so that only a sign bit remains, then
+  // XOR it onto the result to negate the lanes that were mirrored above.
+  const Rebind<uint32_t, DF> du;
+  const auto sign = ShiftLeft<31>(BitCast(du, VecFromMask(df, negate)));
+  return BitCast(df, Xor(sign, BitCast(du, result)));
+}
+
+// Approximates std::erf with an L1 error of 7e-4.
+// The magnitude is computed from |x|; the sign is applied at the end.
+template <class DF, class V>
+V FastErff(const DF df, V x) {
+  // Uses the formula listed at
+  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
+  // with recomputed constants:
+  //   1 - 1 / ((((t * a + b) * t + c) * t + d) * t + 1)**4, where t = |x|.
+  const auto t = Abs(x);
+  auto poly = MulAdd(t, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
+  poly = MulAdd(poly, t, Set(df, 2.32120216e-01));
+  poly = MulAdd(poly, t, Set(df, 2.77820801e-01));
+  poly = MulAdd(poly, t, Set(df, 1.0f));
+  const auto poly_sq = Mul(poly, poly);
+  const auto inv_sq = Div(Set(df, 1.0f), poly_sq);
+  // inv_sq * inv_sq is the fourth power of the reciprocal.
+  const auto magnitude = NegMulAdd(inv_sq, inv_sq, Set(df, 1.0f));
+  // Change the sign for lanes where x <= 0.
+  const Rebind<uint32_t, DF> du;
+  const auto non_positive = Le(x, Zero(df));
+  const auto sign = ShiftLeft<31>(BitCast(du, VecFromMask(df, non_positive)));
+  return BitCast(df, Xor(sign, BitCast(du, magnitude)));
+}
+
+inline float FastLog2f(float f) {
+  const HWY_CAPPED(float, 1) d1;  // vector capped to one lane
+  return GetLane(FastLog2f(d1, Set(d1, f)));
+}
+
+inline float FastPow2f(float f) {
+  const HWY_CAPPED(float, 1) d1;  // vector capped to one lane
+  return GetLane(FastPow2f(d1, Set(d1, f)));
+}
+
+inline float FastPowf(float b, float e) {
+  const HWY_CAPPED(float, 1) d1;  // vector capped to one lane
+  return GetLane(FastPowf(d1, Set(d1, b), Set(d1, e)));
+}
+
+inline float FastCosf(float f) {
+  const HWY_CAPPED(float, 1) d1;  // vector capped to one lane
+  return GetLane(FastCosf(d1, Set(d1, f)));
+}
+
+inline float FastErff(float f) {
+  const HWY_CAPPED(float, 1) d1;  // vector capped to one lane
+  return GetLane(FastErff(d1, Set(d1, f)));
+}
+
+// Returns cbrt(x) + add with 6 ulp max error. Assumes x is never negative.
+// Modified from vectormath_exp.h, Apache 2 license.
+// https://www.agner.org/optimize/vectorclass.zip
+template <class V>
+V CubeRootAndAdd(const V x, const V add) {
+  const HWY_FULL(float) df;
+  const HWY_FULL(int32_t) di;
+
+  const auto one_third = Set(df, 1.0f / 3);
+  const auto four_thirds = Set(df, 4.0f / 3);
+  const auto exp_bias = Set(di, 0x54800000);   // cast(1.) + cast(1.) / 3
+  const auto exp_scale = Set(di, 0x002AAAAA);  // shifted 1/3
+
+  // Initial estimate of x^(-1/3): multiply the exponent by -1/3 using integer
+  // arithmetic on the bit pattern of the float.
+  const auto x_bits = BitCast(di, x);
+  const auto scaled_exp = Mul((ShiftRight<23>(x_bits)), exp_scale);
+  // Special case for 0. 0 is represented with an exponent of 0, so
+  // "exp_bias - 1/3 * exp" gives the wrong result. IfThenZeroElse() sets those
+  // lanes to 0, which prevents having NaNs in the computations below.
+  // TODO(eustas): use fused op
+  const auto seed_bits =
+      IfThenZeroElse(Eq(x_bits, Zero(di)), Sub(exp_bias, scaled_exp));
+  auto inv_root = BitCast(df, seed_bits);
+
+  // Three Newton-Raphson steps: r <- 4/3 * r - x/3 * r^4.
+  const auto x_third = Mul(one_third, x);
+  for (int step = 0; step < 3; ++step) {
+    const auto sq = Mul(inv_root, inv_root);
+    inv_root = NegMulAdd(x_third, Mul(sq, sq), Mul(four_thirds, inv_root));
+  }
+  // Final iteration, written as r <- r + (r - x * r^4) / 3.
+  const auto sq = Mul(inv_root, inv_root);
+  const auto residual = NegMulAdd(x, Mul(sq, sq), inv_root);
+  inv_root = MulAdd(one_third, residual, inv_root);
+
+  // With r = x^(-1/3), r^2 * x is the cube root; the addend is applied in the
+  // same multiply-add.
+  return MulAdd(Mul(inv_root, inv_root), x, add);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_FAST_MATH_INL_H_
+
+#if HWY_ONCE
+#ifndef FAST_MATH_ONCE
+#define FAST_MATH_ONCE
+
+namespace jxl {  // Scalar entry points outside the per-target namespace.
+inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); }
+inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); }
+inline float FastPowf(float b, float e) {
+  return HWY_STATIC_DISPATCH(FastPowf)(b, e);
+}
+inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); }
+inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); }
+}  // namespace jxl
+
+#endif  // FAST_MATH_ONCE
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/fast_math_test.cc b/third_party/jpeg-xl/lib/jxl/fast_math_test.cc
new file mode 100644
index 0000000000..897aadc120
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_math_test.cc
@@ -0,0 +1,288 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/fast_math_test.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+HWY_NOINLINE void TestFastLog2() {
+  // Checks the absolute error of FastLog2f against std::log2 on random inputs.
+  constexpr size_t kNumTrials = 1 << 23;
+  const HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(1e-7f, 1e3f);
+    const float approx = GetLane(FastLog2f(d, Set(d, f)));
+    const float abs_err = std::abs(std::log2(f) - approx);
+    EXPECT_LT(abs_err, 3.1E-6) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFastPow2() {
+  // Checks the relative error of FastPow2f against std::pow(2, f).
+  constexpr size_t kNumTrials = 1 << 23;
+  const HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_rel_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(-100, 100);
+    const float approx = GetLane(FastPow2f(d, Set(d, f)));
+    const float expected = std::pow(2, f);
+    const float rel_err = std::abs(expected - approx) / expected;
+    EXPECT_LT(rel_err, 3.1E-6) << "f = " << f;
+    worst_rel_err = std::max(worst_rel_err, rel_err);
+  }
+  printf("max rel err %e\n", static_cast<double>(worst_rel_err));
+}
+
+HWY_NOINLINE void TestFastPow() {
+  // Checks the relative error of FastPowf against std::pow(b, e).
+  constexpr size_t kNumTrials = 1 << 23;
+  const HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_rel_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float b = rng.UniformF(1e-3f, 1e3f);
+    const float e = rng.UniformF(-10, 10);
+    const float approx = GetLane(FastPowf(d, Set(d, b), Set(d, e)));
+    const float expected = std::pow(b, e);
+    const float rel_err = std::abs(expected - approx) / expected;
+    EXPECT_LT(rel_err, 3E-5) << "b = " << b << " e = " << e;
+    worst_rel_err = std::max(worst_rel_err, rel_err);
+  }
+  printf("max rel err %e\n", static_cast<double>(worst_rel_err));
+}
+
+HWY_NOINLINE void TestFastCos() {
+  // Checks the absolute error of FastCosf against std::cos on random inputs.
+  constexpr size_t kNumTrials = 1 << 23;
+  const HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(-1e3f, 1e3f);
+    const float approx = GetLane(FastCosf(d, Set(d, f)));
+    const float abs_err = std::abs(std::cos(f) - approx);
+    EXPECT_LT(abs_err, 7E-5) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFastErf() {
+  // Checks the absolute error of FastErff against std::erf on random inputs.
+  constexpr size_t kNumTrials = 1 << 23;
+  const HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(-5.f, 5.f);
+    const float approx = GetLane(FastErff(d, Set(d, f)));
+    const float abs_err = std::abs(std::erf(f) - approx);
+    EXPECT_LT(abs_err, 7E-4) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestCubeRoot() {
+  const HWY_FULL(float) d;
+  // Sweeps x from 0 up to (but excluding) 20 in steps of 1e-5.
+  for (uint64_t step = 0; step < 2000000; ++step) {
+    const float x = step * 1E-5f;
+    HWY_ALIGN float lanes[MaxLanes(d)];
+    Store(CubeRootAndAdd(Set(d, x), Zero(d)), d, lanes);
+    // All lanes are same
+    for (size_t lane = 1; lane < Lanes(d); ++lane) {
+      EXPECT_NEAR(lanes[0], lanes[lane], 5E-7f);
+    }
+    // With a zero addend the result is the plain cube root.
+    EXPECT_NEAR(lanes[0], cbrtf(x), 8E-7f);
+  }
+}
+
+HWY_NOINLINE void TestFastSRGB() {
+  // Compares FastLinearToSRGB against TF_SRGB().EncodedFromDisplay.
+  constexpr size_t kNumTrials = 1 << 23;
+  HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float fast = GetLane(FastLinearToSRGB(d, Set(d, f)));
+    const float want = GetLane(TF_SRGB().EncodedFromDisplay(d, Set(d, f)));
+    const float abs_err = std::abs(want - fast);
+    EXPECT_LT(abs_err, 1.2E-4) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFastPQEFD() {  // vector vs. float-argument overload
+  constexpr size_t kNumTrials = 1 << 23;
+  HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float simd = GetLane(TF_PQ().EncodedFromDisplay(d, Set(d, f)));
+    const float scalar = TF_PQ().EncodedFromDisplay(f);
+    const float abs_err = std::abs(scalar - simd);
+    EXPECT_LT(abs_err, 7e-7) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFastHLGEFD() {  // vector vs. float-argument overload
+  constexpr size_t kNumTrials = 1 << 23;
+  HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float simd = GetLane(TF_HLG().EncodedFromDisplay(d, Set(d, f)));
+    const float scalar = TF_HLG().EncodedFromDisplay(f);
+    const float abs_err = std::abs(scalar - simd);
+    EXPECT_LT(abs_err, 5e-7) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFast709EFD() {  // vector vs. float-argument overload
+  constexpr size_t kNumTrials = 1 << 23;
+  HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float simd = GetLane(TF_709().EncodedFromDisplay(d, Set(d, f)));
+    const float scalar = TF_709().EncodedFromDisplay(f);
+    const float abs_err = std::abs(scalar - simd);
+    EXPECT_LT(abs_err, 2e-6) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFastPQDFE() {  // vector vs. float-argument overload
+  constexpr size_t kNumTrials = 1 << 23;
+  HWY_FULL(float) d;
+  Rng rng(1);
+  float worst_abs_err = 0;
+  for (size_t trial = 0; trial < kNumTrials; ++trial) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float simd = GetLane(TF_PQ().DisplayFromEncoded(d, Set(d, f)));
+    const float scalar = TF_PQ().DisplayFromEncoded(f);
+    const float abs_err = std::abs(scalar - simd);
+    EXPECT_LT(abs_err, 3E-6) << "f = " << f;
+    worst_abs_err = std::max(worst_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(worst_abs_err));
+}
+
+HWY_NOINLINE void TestFastXYB() {
+  // Round-trips all 8-bit sRGB colours via ToXYB + FastXYBTosRGB8 (error < 2).
+  if (!HasFastXYBTosRGB8()) return;
+  ImageMetadata metadata;
+  ImageBundle bundle(&metadata);
+  constexpr int kScaling = 1;
+  constexpr int kSize = 256 * kScaling;
+  constexpr float kInvScaling = 1.0f / kScaling;
+  constexpr int kChunk = 32;
+  // The image is divided in chunks to reduce total memory usage.
+  for (int cr = 0; cr < kSize; cr += kChunk) {
+    for (int cg = 0; cg < kSize; cg += kChunk) {
+      for (int cb = 0; cb < kSize; cb += kChunk) {
+        Image3F chunk(kChunk * kChunk, kChunk);
+        for (int ir = 0; ir < kChunk; ir++) {
+          for (int ig = 0; ig < kChunk; ig++) {
+            for (int ib = 0; ib < kChunk; ib++) {
+              const float r = (cr + ir) * kInvScaling;
+              const float g = (cg + ig) * kInvScaling;
+              const float b = (cb + ib) * kInvScaling;
+              chunk.PlaneRow(0, ir)[ig * kChunk + ib] = r * (1.0f / 255);
+              chunk.PlaneRow(1, ir)[ig * kChunk + ib] = g * (1.0f / 255);
+              chunk.PlaneRow(2, ir)[ig * kChunk + ib] = b * (1.0f / 255);
+            }
+          }
+        }
+        bundle.SetFromImage(std::move(chunk), ColorEncoding::SRGB());
+        Image3F xyb(kChunk * kChunk, kChunk);
+        std::vector<uint8_t> roundtrip(kChunk * kChunk * kChunk * 3);
+        ToXYB(bundle, nullptr, &xyb, GetJxlCms());
+        for (int y = 0; y < kChunk; y++) {
+          const float* xyba[4] = {xyb.PlaneRow(0, y), xyb.PlaneRow(1, y),
+                                  xyb.PlaneRow(2, y), nullptr};
+          jxl::HWY_NAMESPACE::FastXYBTosRGB8(
+              xyba, roundtrip.data() + 3 * xyb.xsize() * y, false, xyb.xsize());
+        }
+        for (int ir = 0; ir < kChunk; ir++) {
+          for (int ig = 0; ig < kChunk; ig++) {
+            for (int ib = 0; ib < kChunk; ib++) {
+              const size_t idx = ir * kChunk * kChunk + ig * kChunk + ib;
+              const float want[3] = {(cr + ir) * kInvScaling,
+                                     (cg + ig) * kInvScaling,
+                                     (cb + ib) * kInvScaling};
+              for (size_t c = 0; c < 3; c++) {
+                const int got = roundtrip[3 * idx + c];
+                EXPECT_LT(std::abs(want[c] - got), 2)
+                    << "expected " << want[c] << " got " << got;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class FastMathTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastMathTargetTest);
+
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastLog2);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow2);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastCos);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastErf);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestCubeRoot);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastSRGB);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQDFE);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQEFD);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastHLGEFD);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFast709EFD);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastXYB);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/field_encodings.h b/third_party/jpeg-xl/lib/jxl/field_encodings.h
new file mode 100644
index 0000000000..613e8fad33
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/field_encodings.h
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FIELD_ENCODINGS_H_
+#define LIB_JXL_FIELD_ENCODINGS_H_
+
+// Constants needed to encode/decode fields; avoids including the full fields.h.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/base.h>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Macro to define the Fields' derived class Name when compiling with debug
+// names.
+#if JXL_IS_DEBUG_BUILD
+#define JXL_FIELDS_NAME(X) \
+  const char* Name() const override { return #X; }
+#else
+#define JXL_FIELDS_NAME(X)
+#endif  // JXL_IS_DEBUG_BUILD
+
+class Visitor;
+class Fields {  // Abstract base for field bundles traversed by a Visitor.
+ public:
+  virtual ~Fields() = default;
+#if JXL_IS_DEBUG_BUILD
+  virtual const char* Name() const = 0;  // see JXL_FIELDS_NAME
+#endif  // JXL_IS_DEBUG_BUILD
+  virtual Status VisitFields(Visitor* JXL_RESTRICT visitor) = 0;
+};
+
+// Distribution of U32 values for one particular selector. Represents either a
+// power of two-sized range, or a single value. A separate type ensures this is
+// only passed to the U32Enc ctor.
+struct U32Distr {
+  // No need to validate - all `d` are legitimate.
+  constexpr explicit U32Distr(uint32_t d) : d(d) {}
+
+  // Top bit of `d`: set for a single direct value, clear for a bit range.
+  static constexpr uint32_t kDirect = 0x80000000u;
+  constexpr bool IsDirect() const { return (d & kDirect) != 0; }
+
+  // Only call if IsDirect(). Returns the direct value (low 31 bits of `d`).
+  constexpr uint32_t Direct() const { return d & (kDirect - 1); }
+
+  // Only call if !IsDirect(). ExtraBits() is in [1, 32] (low 5 bits of `d`,
+  // plus one); Offset() is the 26-bit value stored above them.
+  constexpr size_t ExtraBits() const { return (d & 0x1F) + 1; }
+  constexpr uint32_t Offset() const { return (d >> 5) & 0x3FFFFFF; }
+  uint32_t d;  // Packed representation; see Val() and BitsOffset().
+};
+
+// Encodes a single 31-bit value directly; it occupies 2 bits in the bitstream.
+constexpr U32Distr Val(uint32_t value) {
+  return U32Distr(U32Distr::kDirect | value);
+}
+
+// Value - `offset` is signaled in `bits` extra bits (packed as `bits - 1`).
+constexpr U32Distr BitsOffset(uint32_t bits, uint32_t offset) {
+  return U32Distr(((offset & 0x3FFFFFF) << 5) | ((bits - 1) & 0x1F));
+}
+
+// Value will be signaled in `bits` extra bits; same as BitsOffset(bits, 0).
+constexpr U32Distr Bits(uint32_t bits) { return BitsOffset(bits, 0); }
+
+// Four U32Distr, one per selector; see U32Coder documentation in fields.h.
+class U32Enc {
+ public:
+  constexpr U32Enc(const U32Distr d0, const U32Distr d1, const U32Distr d2,
+                   const U32Distr d3)
+      : d_{d0, d1, d2, d3} {}
+
+  // Returns the U32Distr at `selector` = 0..3, least-significant first.
+  U32Distr GetDistr(const uint32_t selector) const {
+    JXL_ASSERT(selector < 4);
+    return d_[selector];
+  }
+
+ private:
+  U32Distr d_[4];  // indexed by selector
+};
+
+// Returns a 64-bit mask with only bit `index` set (0 = least significant).
+template <typename T>
+static inline constexpr uint64_t MakeBit(T index) {
+  return uint64_t{1} << static_cast<uint32_t>(index);
+}
+
+// Returns vector of all possible values of an Enum type. Relies on each Enum
+// providing an overload of EnumBits() that returns a bit array of its values,
+// which implies values must be in [0, 64).
+template <typename Enum>
+std::vector<Enum> Values() {
+  const uint64_t all_bits = EnumBits(Enum());
+  std::vector<Enum> result;
+  result.reserve(hwy::PopCount(all_bits));
+
+  // For each 1-bit, lowest first: add its index as a value. The update
+  // `remaining &= remaining - 1` clears the least-significant set bit.
+  for (uint64_t remaining = all_bits; remaining != 0;
+       remaining &= remaining - 1) {
+    const int index = Num0BitsBelowLS1Bit_Nonzero(remaining);
+    result.push_back(static_cast<Enum>(index));
+  }
+  return result;
+}
+
+// Returns true if value is one of Values<Enum>().
+template <class Enum>
+Status EnumValid(const Enum value) {
+  const uint32_t raw = static_cast<uint32_t>(value);
+  // EnumBits() is a 64-bit mask, so anything from 64 upwards cannot be in it.
+  if (raw >= 64) {
+    return JXL_FAILURE("Value %u too large for %s\n", raw, EnumName(Enum()));
+  }
+  const bool is_known = (EnumBits(Enum()) & MakeBit(value)) != 0;
+  if (!is_known) {
+    return JXL_FAILURE("Invalid value %u for %s\n", raw, EnumName(Enum()));
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FIELD_ENCODINGS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fields.cc b/third_party/jpeg-xl/lib/jxl/fields.cc
new file mode 100644
index 0000000000..cd1e72bd94
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fields.cc
@@ -0,0 +1,642 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/fields.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <hwy/base.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/printf_macros.h"
+
+namespace jxl {
+
+namespace {
+
+using ::jxl::fields_internal::VisitorBase;
+
+struct InitVisitor : public VisitorBase {  // Sets each field to its default.
+  Status Bits(const size_t /*unused*/, const uint32_t default_value,
+              uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U32(const U32Enc /*unused*/, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U64(const uint64_t default_value,
+             uint64_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status F16(const float default_value, float* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  // Always visit conditional fields to ensure they are initialized.
+  Status Conditional(bool /*condition*/) override { return true; }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT all_default) override {
+    // Sets *all_default to true, then returns false so that the remaining
+    // fields are still visited and initialized rather than skipped.
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return false;
+  }
+
+  Status VisitNested(Fields* /*fields*/) override {
+    // Avoid re-initializing nested bundles (their ctors already called
+    // Bundle::Init for their fields).
+    return true;
+  }
+};
+
+// Similar to InitVisitor, but has no VisitNested override: nested fields are
+// initialized too.
+struct SetDefaultVisitor : public VisitorBase {
+  Status Bits(const size_t /*unused*/, const uint32_t default_value,
+              uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U32(const U32Enc /*unused*/, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U64(const uint64_t default_value,
+             uint64_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status F16(const float default_value, float* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+  // Always visit conditional fields to ensure they are initialized.
+  Status Conditional(bool /*condition*/) override { return true; }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT all_default) override {
+    // Just initialize this field and don't skip initializing others.
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return false;
+  }
+};
+
+class AllDefaultVisitor : public VisitorBase {  // Detects non-default fields.
+ public:
+  explicit AllDefaultVisitor() : VisitorBase() {}
+
+  Status Bits(const size_t /*bits*/, const uint32_t default_value,
+              uint32_t* JXL_RESTRICT value) override {
+    if (*value != default_value) all_default_ = false;
+    return true;
+  }
+
+  Status U32(const U32Enc /*enc*/, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) override {
+    if (*value != default_value) all_default_ = false;
+    return true;
+  }
+
+  Status U64(const uint64_t default_value,
+             uint64_t* JXL_RESTRICT value) override {
+    if (*value != default_value) all_default_ = false;
+    return true;
+  }
+
+  Status F16(const float default_value, float* JXL_RESTRICT value) override {
+    if (!(std::abs(*value - default_value) < 1E-6f)) all_default_ = false;
+    return true;
+  }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT /*all_default*/) override {
+    // Returning false means no field is skipped, so all_default_ is computed
+    // from every visited field.
+    return false;
+  }
+  bool AllDefault() const { return all_default_; }
+
+ private:
+  bool all_default_ = true;  // Cleared by the first non-default field.
+};
+
+class ReadVisitor : public VisitorBase {
+ public:
+  explicit ReadVisitor(BitReader* reader) : VisitorBase(), reader_(reader) {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    *value = BitsCoder::Read(bits, reader_);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  Status U32(const U32Enc dist, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    *value = U32Coder::Read(dist, reader_);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    *value = U64Coder::Read(reader_);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    ok_ &= F16Coder::Read(reader_, value);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  void SetDefault(Fields* fields) override { Bundle::SetDefault(fields); }
+
+  bool IsReading() const override { return true; }
+
+  // This never fails because visitors are expected to keep reading until
+  // EndExtensions, see comment there.
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    if (*extensions == 0) return true;
+
+    // For each nonzero bit, i.e. extension that is present:
+    for (uint64_t remaining_extensions = *extensions; remaining_extensions != 0;
+         remaining_extensions &= remaining_extensions - 1) {
+      const size_t idx_extension =
+          Num0BitsBelowLS1Bit_Nonzero(remaining_extensions);
+      // Read additional U64 (one per extension) indicating the number of bits
+      // (allows skipping individual extensions).
+      JXL_RETURN_IF_ERROR(U64(0, &extension_bits_[idx_extension]));
+      if (!SafeAdd(total_extension_bits_, extension_bits_[idx_extension],
+                   total_extension_bits_)) {
+        return JXL_FAILURE("Extension bits overflowed, invalid codestream");
+      }
+    }
+    // Used by EndExtensions to skip past any _remaining_ extensions.
+    pos_after_ext_size_ = reader_->TotalBitsConsumed();
+    JXL_ASSERT(pos_after_ext_size_ != 0);
+    return true;
+  }
+
+  Status EndExtensions() override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::EndExtensions());
+    // Happens if extensions == 0: don't read size, done.
+    if (pos_after_ext_size_ == 0) return true;
+
+    // Not enough bytes as set by BeginExtensions or earlier. Do not return
+    // this as a JXL_FAILURE or false (which can also propagate to error
+    // through e.g. JXL_RETURN_IF_ERROR), since this may be used while
+    // silently checking whether there are enough bytes. If this case must be
+    // treated as an error, reader_->Close() will do this, just like is
+    // already done for non-extension fields.
+    if (!enough_bytes_) return true;
+
+    // Skip new fields this (old?) decoder didn't know about, if any.
+    const size_t bits_read = reader_->TotalBitsConsumed();
+    uint64_t end;  // Bit position just past all extension payloads.
+    if (!SafeAdd(pos_after_ext_size_, total_extension_bits_, end)) {
+      return JXL_FAILURE("Invalid extension size, caused overflow");
+    }
+    if (bits_read > end) {
+      return JXL_FAILURE("Read more extension bits than budgeted");
+    }
+    const size_t remaining_bits = end - bits_read;
+    if (remaining_bits != 0) {
+      JXL_WARNING("Skipping %" PRIuS "-bit extension(s)", remaining_bits);
+      reader_->SkipBits(remaining_bits);
+      if (!reader_->AllReadsWithinBounds()) {
+        return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                          "Not enough bytes for header");
+      }
+    }
+    return true;
+  }
+
+  Status OK() const { return ok_; }  // ok_: no error besides missing bytes.
+
+ private:
+  // Whether any error other than not enough bytes occurred.
+  bool ok_ = true;
+
+  // Whether there are enough input bytes to read from.
+  bool enough_bytes_ = true;
+  BitReader* const reader_;
+  // May be 0 even if the corresponding extension is present.
+  uint64_t extension_bits_[Bundle::kMaxExtensions] = {0};
+  uint64_t total_extension_bits_ = 0;
+  size_t pos_after_ext_size_ = 0;  // 0 iff extensions == 0.
+};
+
+class MaxBitsVisitor : public VisitorBase {  // Sums worst-case field sizes.
+ public:
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT /*value*/) override {
+    max_bits_ += BitsCoder::MaxEncodedBits(bits);
+    return true;
+  }
+
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT /*value*/) override {
+    max_bits_ += U32Coder::MaxEncodedBits(enc);
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT /*value*/) override {
+    max_bits_ += U64Coder::MaxEncodedBits();
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT /*value*/) override {
+    max_bits_ += F16Coder::MaxEncodedBits();
+    return true;
+  }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT all_default) override {
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));  // Bills the 1-bit flag.
+    return false;  // For max bits, assume nothing is default
+  }
+
+  // Always visit conditional fields to get a (loose) upper bound.
+  Status Conditional(bool /*condition*/) override { return true; }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT /*extensions*/) override {
+    // Skip - extensions are not included in "MaxBits" because their length
+    // is potentially unbounded.
+    return true;
+  }
+
+  Status EndExtensions() override { return true; }  // Likewise a no-op.
+
+  size_t MaxBits() const { return max_bits_; }
+
+ private:
+  size_t max_bits_ = 0;  // Running total, in bits.
+};
+
+class CanEncodeVisitor : public VisitorBase {  // Checks fields, counts bits.
+ public:
+  explicit CanEncodeVisitor() : VisitorBase() {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= BitsCoder::CanEncode(bits, *value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= U32Coder::CanEncode(enc, *value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= U64Coder::CanEncode(*value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= F16Coder::CanEncode(*value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status AllDefault(const Fields& fields,
+                    bool* JXL_RESTRICT all_default) override {
+    *all_default = Bundle::AllDefault(fields);
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return *all_default;
+  }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    extensions_ = *extensions;
+    if (*extensions != 0) {
+      JXL_ASSERT(pos_after_ext_ == 0);
+      pos_after_ext_ = encoded_bits_;
+      JXL_ASSERT(pos_after_ext_ != 0);  // visited "extensions"
+    }
+    return true;
+  }
+  // EndExtensions = default.
+
+  Status GetSizes(size_t* JXL_RESTRICT extension_bits,
+                  size_t* JXL_RESTRICT total_bits) {
+    JXL_RETURN_IF_ERROR(ok_);  // Some visited field was not encodable.
+    *extension_bits = 0;
+    *total_bits = encoded_bits_;
+    // Only if extension field was nonzero will we encode their sizes.
+    if (pos_after_ext_ != 0) {
+      JXL_ASSERT(encoded_bits_ >= pos_after_ext_);
+      *extension_bits = encoded_bits_ - pos_after_ext_;
+      // Also need to encode *extension_bits and bill it to *total_bits.
+      size_t encoded_bits = 0;
+      ok_ &= U64Coder::CanEncode(*extension_bits, &encoded_bits);
+      *total_bits += encoded_bits;
+
+      // TODO(janwas): support encoding individual extension sizes. We
+      // currently ascribe all bits to the first and send zeros for the
+      // others.
+      for (size_t i = 1; i < hwy::PopCount(extensions_); ++i) {
+        encoded_bits = 0;
+        ok_ &= U64Coder::CanEncode(0, &encoded_bits);
+        *total_bits += encoded_bits;
+      }
+    }
+    return true;
+  }
+
+ private:
+  bool ok_ = true;  // Cleared when any field's CanEncode fails.
+  size_t encoded_bits_ = 0;  // Sum of per-field encoded sizes.
+  uint64_t extensions_ = 0;  // Copy of the visited "extensions" field.
+  // Snapshot of encoded_bits_ after visiting the extension field, but NOT
+  // including the hidden extension sizes.
+  uint64_t pos_after_ext_ = 0;
+};
+}  // namespace
+
+void Bundle::Init(Fields* fields) {
+  InitVisitor visitor;  // Non-recursive: nested bundles Init themselves.
+  if (!visitor.Visit(fields)) {
+    JXL_ABORT("Init should never fail");
+  }
+}
+void Bundle::SetDefault(Fields* fields) {
+  SetDefaultVisitor visitor;  // Unlike Init, recurses into nested bundles.
+  if (!visitor.Visit(fields)) {
+    JXL_ABORT("SetDefault should never fail");
+  }
+}
+bool Bundle::AllDefault(const Fields& fields) {
+  AllDefaultVisitor visitor;
+  if (!visitor.VisitConst(fields)) {
+    JXL_ABORT("AllDefault should never fail");
+  }
+  return visitor.AllDefault();  // True iff every field equals its default.
+}
+size_t Bundle::MaxBits(const Fields& fields) {
+  MaxBitsVisitor visitor;
+#if JXL_ENABLE_ASSERT
+  Status ret =  // Only declared when JXL_ASSERT below consumes it.
+#else
+  (void)  // Asserts disabled: explicitly discard the returned Status.
+#endif  // JXL_ENABLE_ASSERT
+      visitor.VisitConst(fields);
+  JXL_ASSERT(ret);
+  return visitor.MaxBits();
+}
+Status Bundle::CanEncode(const Fields& fields, size_t* extension_bits,
+                         size_t* total_bits) {
+  CanEncodeVisitor visitor;  // Latches per-field failures; see GetSizes.
+  JXL_QUIET_RETURN_IF_ERROR(visitor.VisitConst(fields));
+  JXL_QUIET_RETURN_IF_ERROR(visitor.GetSizes(extension_bits, total_bits));
+  return true;
+}
+Status Bundle::Read(BitReader* reader, Fields* fields) {
+  ReadVisitor visitor(reader);
+  JXL_RETURN_IF_ERROR(visitor.Visit(fields));
+  return visitor.OK();  // Reports errors other than running out of bytes.
+}
+bool Bundle::CanRead(BitReader* reader, Fields* fields) {
+  ReadVisitor visitor(reader);  // NOTE: the visit advances `reader`.
+  Status status = visitor.Visit(fields);
+  // We are only checking here whether there are enough bytes. We still return
+  // true for other errors because it means there are enough bytes to determine
+  // there's an error. Use Read() to determine which error it is.
+  return status.code() != StatusCode::kNotEnoughBytes;
+}
+
+size_t BitsCoder::MaxEncodedBits(const size_t bits) { return bits; }  // Raw.
+
+Status BitsCoder::CanEncode(const size_t bits, const uint32_t value,
+                            size_t* JXL_RESTRICT encoded_bits) {
+  *encoded_bits = bits;  // Set even when the value does not fit.
+  if (value >= (1ULL << bits)) {  // 1ULL: shift is defined for bits == 32.
+    return JXL_FAILURE("Value %u too large for %" PRIu64 " bits", value,
+                       static_cast<uint64_t>(bits));
+  }
+  return true;
+}
+
+uint32_t BitsCoder::Read(const size_t bits, BitReader* JXL_RESTRICT reader) {
+  return reader->ReadBits(bits);  // No selector or offset: just the raw bits.
+}
+
+size_t U32Coder::MaxEncodedBits(const U32Enc enc) {
+  // Only non-direct distributions carry extra bits; find the widest one.
+  size_t widest = 0;
+  for (uint32_t s = 0; s < 4; ++s) {
+    const U32Distr distr = enc.GetDistr(s);
+    if (!distr.IsDirect()) {
+      widest = std::max<size_t>(widest, distr.ExtraBits());
+    }
+  }
+  // Every encoding starts with the 2-bit selector.
+  return 2 + widest;
+}
+
+Status U32Coder::CanEncode(const U32Enc enc, const uint32_t value,
+                           size_t* JXL_RESTRICT encoded_bits) {
+  uint32_t selector;  // Required by ChooseSelector; not used afterwards.
+  size_t total_bits;
+  const Status ok = ChooseSelector(enc, value, &selector, &total_bits);
+  *encoded_bits = ok ? total_bits : 0;  // Reports 0 bits if not encodable.
+  return ok;
+}
+
+uint32_t U32Coder::Read(const U32Enc enc, BitReader* JXL_RESTRICT reader) {
+  // The 2-bit selector picks one of the four distributions of `enc`.
+  const U32Distr distr = enc.GetDistr(reader->ReadFixedBits<2>());
+  // Direct distributions encode their value in the selector alone.
+  if (distr.IsDirect()) return distr.Direct();
+  // Otherwise the value is the extra bits plus the distribution's offset.
+  const auto extra = reader->ReadBits(distr.ExtraBits());
+  return extra + distr.Offset();
+}
+
+Status U32Coder::ChooseSelector(const U32Enc enc, const uint32_t value,
+                                uint32_t* JXL_RESTRICT selector,
+                                size_t* JXL_RESTRICT total_bits) {
+#if JXL_ENABLE_ASSERT
+  const size_t bits_required = 32 - Num0BitsAboveMS1Bit(value);
+#endif  // JXL_ENABLE_ASSERT
+  JXL_ASSERT(bits_required <= 32);
+
+  // It is difficult to verify whether the distributions are sorted, so check
+  // all selectors and keep the one with the fewest total_bits. Start from a
+  // sentinel that is more than any valid encoding; it signals "none found".
+  constexpr size_t kNoEncoding = 64;
+  *selector = 0;
+  *total_bits = kNoEncoding;
+  for (uint32_t s = 0; s < 4; ++s) {
+    const U32Distr d = enc.GetDistr(s);
+    if (d.IsDirect()) {
+      if (d.Direct() == value) {
+        *selector = s;
+        *total_bits = 2;
+        return true;  // Done, direct is always the best possible.
+      }
+      continue;
+    }
+    const size_t extra_bits = d.ExtraBits();
+    const uint32_t offset = d.Offset();
+    if (value < offset || value >= offset + (1ULL << extra_bits)) continue;
+
+    // Better than prior encoding, remember it:
+    if (2 + extra_bits < *total_bits) {
+      *selector = s;
+      *total_bits = 2 + extra_bits;
+    }
+  }
+
+  if (*total_bits == kNoEncoding) {  // No distribution can represent `value`.
+    return JXL_FAILURE("No feasible selector for %u", value);
+  }
+
+  return true;
+}
+
+uint64_t U64Coder::Read(BitReader* JXL_RESTRICT reader) {
+  // A 2-bit selector picks one of three short forms or the varint form.
+  switch (reader->ReadFixedBits<2>()) {
+    case 0:  // No payload.
+      return 0;
+    case 1:  // 4 payload bits: 1..16.
+      return 1 + reader->ReadFixedBits<4>();
+    case 2:  // 8 payload bits: 17..272.
+      return 17 + reader->ReadFixedBits<8>();
+    default:  // Selector 3: varint, handled below.
+      break;
+  }
+
+  // Varint: 12 bits first, then groups of 8 bits, and a final group of 4
+  // bits; each group after the first is announced by a 1 continuation bit.
+  uint64_t value = reader->ReadFixedBits<12>();
+  for (uint64_t pos = 12; reader->ReadFixedBits<1>(); pos += 8) {
+    if (pos == 60) {
+      // Only 4 bits remain in a uint64_t, and no further continuation bit.
+      value |= static_cast<uint64_t>(reader->ReadFixedBits<4>()) << pos;
+      break;
+    }
+    value |= static_cast<uint64_t>(reader->ReadFixedBits<8>()) << pos;
+  }
+
+  return value;
+}
+
+// Can always encode, but useful because it also returns bit size.
+Status U64Coder::CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits) {
+  if (value == 0) {
+    *encoded_bits = 2;  // 2 selector bits
+  } else if (value <= 16) {
+    *encoded_bits = 2 + 4;  // 2 selector bits + 4 payload bits
+  } else if (value <= 272) {
+    *encoded_bits = 2 + 8;  // 2 selector bits + 8 payload bits
+  } else {
+    *encoded_bits = 2 + 12;  // 2 selector bits + 12 payload bits
+    value >>= 12;
+    int shift = 12;
+    while (value > 0 && shift < 60) {
+      *encoded_bits += 1 + 8;  // 1 continuation bit + 8 payload bits
+      value >>= 8;
+      shift += 8;
+    }
+    if (value > 0) {
+      // Only reachable once shift == 60: the last group holds just 4 bits.
+      *encoded_bits += 1 + 4;  // 1 continuation bit + 4 payload bits
+    } else {
+      *encoded_bits += 1;  // 1 stop bit
+    }
+  }
+
+  return true;
+}
+
+Status F16Coder::Read(BitReader* JXL_RESTRICT reader,
+                      float* JXL_RESTRICT value) {
+  const uint32_t bits16 = reader->ReadFixedBits<16>();
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+
+  if (JXL_UNLIKELY(biased_exp == 31)) {
+    return JXL_FAILURE("F16 infinity or NaN are not supported");
+  }
+
+  // Subnormal or zero: mantissa * 2^-10 * 2^-14, without an implicit 1.
+  if (JXL_UNLIKELY(biased_exp == 0)) {
+    *value = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    if (sign) *value = -*value;
+    return true;
+  }
+
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);  // Rebias 15 -> 127.
+  const uint32_t mantissa32 = mantissa << (23 - 10);  // Widen 10 -> 23 bits.
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+  memcpy(value, &bits32, sizeof(bits32));
+  return true;
+}
+
+Status F16Coder::CanEncode(float value, size_t* JXL_RESTRICT encoded_bits) {
+  *encoded_bits = MaxEncodedBits();  // Set even when the value is rejected.
+  if (std::isnan(value) || std::isinf(value)) {
+    return JXL_FAILURE("Should not attempt to store NaN and infinity");
+  }
+  return std::abs(value) <= 65504.0f;  // 65504 = largest finite binary16.
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/fields.h b/third_party/jpeg-xl/lib/jxl/fields.h
new file mode 100644
index 0000000000..10d0b7aa30
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fields.h
@@ -0,0 +1,377 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FIELDS_H_
+#define LIB_JXL_FIELDS_H_
+
+// Forward/backward-compatible 'bundles' with auto-serialized 'fields'.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cinttypes>
+#include <cmath>  // abs
+#include <cstdarg>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct BitWriter;
+
+// Integer coders: BitsCoder (raw), U32Coder (table), U64Coder (varint).
+
+// Reads/writes a given (fixed) number of bits <= 32.
+namespace BitsCoder {
+size_t MaxEncodedBits(size_t bits);  // Always `bits`.
+// Sets *encoded_bits = bits; fails if `value` needs more than `bits` bits.
+Status CanEncode(size_t bits, uint32_t value,
+                 size_t* JXL_RESTRICT encoded_bits);
+// Returns the result of reading `bits` bits from `reader`.
+uint32_t Read(size_t bits, BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(size_t bits, uint32_t value, BitWriter* JXL_RESTRICT writer);
+}  // namespace BitsCoder
+
+// Encodes u32 using a lookup table and/or extra bits, governed by a per-field
+// encoding `enc` which consists of four distributions `d` chosen via a 2-bit
+// selector (least significant = 0). Each d may have two modes:
+// - direct: if d.IsDirect(), the value is d.Direct();
+// - offset: the value is derived from d.ExtraBits() extra bits plus d.Offset();
+// This encoding is denser than Exp-Golomb or Gamma codes when both small and
+// large values occur.
+//
+// Examples:
+// Direct: U32Enc(Val(8), Val(16), Val(32), Bits(6)), value 32 => 10b.
+// Offset: U32Enc(Val(0), BitsOffset(1, 1), BitsOffset(2, 3), BitsOffset(8, 8))
+//   defines the following prefix code:
+//   00 -> 0
+//   01x -> 1..2
+//   10xx -> 3..6
+//   11xxxxxxxx -> 8..263
+namespace U32Coder {
+size_t MaxEncodedBits(U32Enc enc);  // 2 + widest ExtraBits.
+Status CanEncode(U32Enc enc, uint32_t value, size_t* JXL_RESTRICT encoded_bits);
+uint32_t Read(U32Enc enc, BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(U32Enc enc, uint32_t value, BitWriter* JXL_RESTRICT writer);
+
+// "private": finds the selector that encodes `value` in the fewest bits.
+Status ChooseSelector(U32Enc enc, uint32_t value,
+                      uint32_t* JXL_RESTRICT selector,
+                      size_t* JXL_RESTRICT total_bits);
+}  // namespace U32Coder
+
+// Encodes 64-bit unsigned integers with a fixed distribution, taking 2 bits
+// to encode 0, 6 bits to encode 1 to 16, 10 bits to encode 17 to 272, 15 bits
+// to encode up to 4095, and on the order of log2(value) * 1.125 bits for
+// larger values.
+namespace U64Coder {
+constexpr size_t MaxEncodedBits() { return 2 + 12 + 6 * (8 + 1) + (4 + 1); }
+// Reads a 2-bit selector followed by 0, 4 or 8 payload bits, or a varint.
+uint64_t Read(BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(uint64_t value, BitWriter* JXL_RESTRICT writer);
+
+// Can always encode, but useful because it also returns bit size.
+Status CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits);
+}  // namespace U64Coder
+
+// IEEE 754 half-precision (binary16). Refuses to read/write NaN/Inf.
+namespace F16Coder {
+constexpr size_t MaxEncodedBits() { return 16; }  // Always 16 bits.
+
+// Returns false if the bit representation is NaN or infinity.
+Status Read(BitReader* JXL_RESTRICT reader, float* JXL_RESTRICT value);
+
+// Returns false if the value is too large to encode.
+Status Write(float value, BitWriter* JXL_RESTRICT writer);
+Status CanEncode(float value, size_t* JXL_RESTRICT encoded_bits);
+}  // namespace F16Coder
+
+// A "bundle" is a forward- and backward compatible collection of fields.
+// They are used for SizeHeader/FrameHeader/GroupHeader. Bundles can be
+// extended by appending(!) fields. Optional fields may be omitted from the
+// bitstream by conditionally visiting them. When reading new bitstreams with
+// old code, we skip unknown fields at the end of the bundle. This requires
+// storing the amount of extra appended bits, and that fields are visited in
+// chronological order of being added to the format, because old decoders
+// cannot skip some future fields and resume reading old fields. Similarly,
+// new readers query bits in an "extensions" field to skip (groups of) fields
+// not present in old bitstreams. Note that each bundle must include an
+// "extensions" field prior to freezing the format, otherwise it cannot be
+// extended.
+//
+// To ensure interoperability, there will be no opaque fields.
+//
+// HOWTO:
+// - basic usage: define a struct with member variables ("fields") and a
+//   VisitFields(v) member function that calls v->U32/Bool etc. for each
+//   field, specifying their default values. The ctor must call
+//   Bundle::Init(this).
+//
+// - print a trace of visitors: ensure each bundle has a static Name() member
+//   function, and change Bundle::Print* to return true.
+//
+// - optional fields: in VisitFields, add if (v->Conditional(your_condition))
+//   { v->Bool(default, &field); }. This prevents reading/writing field
+//   if !your_condition, which is typically computed from a prior field.
+//   WARNING: to ensure all fields are initialized, do not add an else branch;
+//   instead add another if (v->Conditional(!your_condition)).
+//
+// - repeated fields: for dynamic sizes, use e.g. std::vector and in
+//   VisitFields, if (v->IsReading()) field.resize(size) before accessing field.
+//   For static or bounded sizes, use an array or std::array. In all cases,
+//   simply visit each array element as if it were a normal field.
+//
+// - nested bundles: add a bundle as a normal field and in VisitFields call
+//   JXL_RETURN_IF_ERROR(v->VisitNested(&nested));
+//
+// - allow future extensions: define a "uint64_t extensions" field and call
+//   v->BeginExtensions(&extensions) after visiting all non-extension fields,
+//   and `return v->EndExtensions();` after the last extension field.
+//
+// - encode an entire bundle in one bit if ALL its fields equal their default
+//   values: add a "mutable bool all_default" field and as the first visitor:
+//   if (v->AllDefault(*this, &all_default)) {
+//     // Overwrite all serialized fields, but not any nonserialized_*.
+//     v->SetDefault(this);
+//     return true;
+//   }
+//   Note: if extensions are present, AllDefault() == false.
+
+namespace Bundle {
+constexpr size_t kMaxExtensions = 64;  // bits in u64
+
+// Initializes fields to the default values. It is not recursive to nested
+// fields, this function is intended to be called in the constructors so
+// each nested field will already Init itself.
+void Init(Fields* JXL_RESTRICT fields);
+
+// Similar to Init, but recursive to nested fields.
+void SetDefault(Fields* JXL_RESTRICT fields);
+
+// Returns whether ALL fields (including `extensions`, if present) are equal
+// to their default value.
+bool AllDefault(const Fields& fields);
+
+// Returns max number of bits required to encode `fields`.
+size_t MaxBits(const Fields& fields);
+
+// Returns whether a header's fields can all be encoded, i.e. they have a
+// valid representation. If so, "*total_bits" is the exact number of bits
+// required. Called by Write.
+Status CanEncode(const Fields& fields, size_t* JXL_RESTRICT extension_bits,
+                 size_t* JXL_RESTRICT total_bits);
+
+Status Read(BitReader* reader, Fields* JXL_RESTRICT fields);
+
+// Returns whether enough bits are available to fully read this bundle using
+// Read. Also returns true in case of a codestream error (other than not being
+// large enough): that means enough bits are available to determine there's an
+// error, use Read to get such error status.
+// NOTE: this advances the BitReader, a different one pointing back at the
+// original bit position in the codestream must be created to use Read after
+// this.
+bool CanRead(BitReader* reader, Fields* JXL_RESTRICT fields);
+
+Status Write(const Fields& fields, BitWriter* JXL_RESTRICT writer, size_t layer,
+             AuxOut* aux_out);
+}  // namespace Bundle
+
+// Different subclasses of Visitor are passed to implementations of Fields
+// throughout their lifetime. Templates used to be used for this but dynamic
+// polymorphism produces more compact executables than template reification did.
+class Visitor {
+ public:
+  virtual ~Visitor() = default;
+  virtual Status Visit(Fields* fields) = 0;
+
+  virtual Status Bool(bool default_value, bool* JXL_RESTRICT value) = 0;
+  virtual Status U32(U32Enc, uint32_t, uint32_t*) = 0;  // enc, default, value
+
+  // Helper to construct U32Enc from U32Distr.
+  Status U32(const U32Distr d0, const U32Distr d1, const U32Distr d2,
+             const U32Distr d3, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) {
+    return U32(U32Enc(d0, d1, d2, d3), default_value, value);
+  }
+
+  template <typename EnumT>
+  Status Enum(const EnumT default_value, EnumT* JXL_RESTRICT value) {
+    uint32_t u32 = static_cast<uint32_t>(*value);
+    // 00 -> 0
+    // 01 -> 1
+    // 10xxxx -> 2..17
+    // 11yyyyyy -> 18..81
+    JXL_RETURN_IF_ERROR(U32(Val(0), Val(1), BitsOffset(4, 2), BitsOffset(6, 18),
+                            static_cast<uint32_t>(default_value), &u32));
+    *value = static_cast<EnumT>(u32);
+    return EnumValid(*value);
+  }
+
+  virtual Status Bits(size_t bits, uint32_t default_value,
+                      uint32_t* JXL_RESTRICT value) = 0;
+  virtual Status U64(uint64_t default_value, uint64_t* JXL_RESTRICT value) = 0;
+  virtual Status F16(float default_value, float* JXL_RESTRICT value) = 0;
+
+  // Returns whether VisitFields should visit some subsequent fields.
+  // "condition" is typically from prior fields, e.g. flags.
+  // Overridden by InitVisitor and MaxBitsVisitor.
+  virtual Status Conditional(bool condition) { return condition; }
+
+  // Overridden by InitVisitor, AllDefaultVisitor and CanEncodeVisitor.
+  virtual Status AllDefault(const Fields& /*fields*/,
+                            bool* JXL_RESTRICT all_default) {
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return *all_default;
+  }
+
+  virtual void SetDefault(Fields* /*fields*/) {
+    // Do nothing by default, this is overridden by ReadVisitor.
+  }
+
+  // Returns the result of visiting a nested Bundle.
+  // Overridden by InitVisitor.
+  virtual Status VisitNested(Fields* fields) { return Visit(fields); }
+
+  // Overridden by ReadVisitor. Enables dynamically-sized fields.
+  virtual bool IsReading() const { return false; }
+
+  virtual Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) = 0;
+  virtual Status EndExtensions() = 0;  // Called after BeginExtensions.
+};
+
+namespace fields_internal {
+// A bundle can be in one of three states concerning extensions: not-begun,
+// active, ended. Bundles may be nested, so we need a stack of states.
+class ExtensionStates {
+ public:
+  void Push() {
+    // Initial state = not-begun.
+    begun_ <<= 1;
+    ended_ <<= 1;
+  }
+
+  // Clears current state; caller must check IsEnded beforehand.
+  void Pop() {
+    begun_ >>= 1;
+    ended_ >>= 1;
+  }
+
+  // Returns true if state == active || state == ended.
+  Status IsBegun() const { return (begun_ & 1) != 0; }
+  // Returns true if state != not-begun && state != active.
+  Status IsEnded() const { return (ended_ & 1) != 0; }
+
+  void Begin() {
+    JXL_ASSERT(!IsBegun());
+    JXL_ASSERT(!IsEnded());
+    begun_ += 1;  // Sets the lowest bit, asserted clear above.
+  }
+
+  void End() {
+    JXL_ASSERT(IsBegun());
+    JXL_ASSERT(!IsEnded());
+    ended_ += 1;  // Sets the lowest bit, asserted clear above.
+  }
+
+ private:
+  // Current state := LSBs of begun_/ended_; higher bits = enclosing bundles.
+  uint64_t begun_ = 0;
+  uint64_t ended_ = 0;
+};
+
+// Visitors generate Init/AllDefault/Read/Write logic for all fields. Each
+// bundle's VisitFields member function calls visitor->U32 etc. We do not
+// overload operator() because a function name is easier to search for.
+
+class VisitorBase : public Visitor {
+ public:
+  explicit VisitorBase() {}
+  ~VisitorBase() override { JXL_ASSERT(depth_ == 0); }
+
+  // This is the only call site of Fields::VisitFields.
+  // Ensures EndExtensions was called.
+  Status Visit(Fields* fields) override {
+    depth_ += 1;
+    JXL_ASSERT(depth_ <= Bundle::kMaxExtensions);
+    extension_states_.Push();
+
+    const Status ok = fields->VisitFields(this);
+
+    if (ok) {
+      // If VisitFields called BeginExtensions, must also call
+      // EndExtensions.
+      JXL_ASSERT(!extension_states_.IsBegun() || extension_states_.IsEnded());
+    } else {
+      // Failed, undefined state: don't care whether EndExtensions was
+      // called.
+    }
+
+    extension_states_.Pop();
+    JXL_ASSERT(depth_ != 0);
+    depth_ -= 1;
+
+    return ok;
+  }
+
+  // For visitors accepting const Fields, need to const-cast so we can call
+  // the non-const Fields::VisitFields. NOTE: `t` is not modified except the
+  // `all_default` field by CanEncodeVisitor.
+  Status VisitConst(const Fields& t) { return Visit(const_cast<Fields*>(&t)); }
+
+  // Derived types (overridden by InitVisitor because it is unsafe to read
+  // from *value there)
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    uint32_t bits = *value ? 1 : 0;
+    JXL_RETURN_IF_ERROR(Bits(1, static_cast<uint32_t>(default_value), &bits));
+    JXL_DASSERT(bits <= 1);
+    *value = bits == 1;
+    return true;
+  }
+
+  // Visits the "extensions" field and marks the extension state as begun.
+  // Called before any conditional visit based on "extensions".
+  // Overridden by ReadVisitor, CanEncodeVisitor and WriteVisitor.
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_RETURN_IF_ERROR(U64(0, extensions));
+
+    extension_states_.Begin();
+    return true;
+  }
+
+  // Called after all extension fields (if any). Although non-extension
+  // fields could be visited afterward, we prefer the convention that
+  // extension fields are always the last to be visited. Overridden by
+  // ReadVisitor.
+  Status EndExtensions() override {
+    extension_states_.End();
+    return true;
+  }
+
+ private:
+  size_t depth_ = 0;  // to check nesting
+  ExtensionStates extension_states_;
+};
+}  // namespace fields_internal
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FIELDS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fields_test.cc b/third_party/jpeg-xl/lib/jxl/fields_test.cc
new file mode 100644
index 0000000000..cf54c780ea
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fields_test.cc
@@ -0,0 +1,429 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/fields.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <utility>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+// Ensures `value` round-trips, using exactly `expected_bits_written` bits.
+void TestU32Coder(const uint32_t value, const size_t expected_bits_written) {
+  const U32Enc enc(Val(0), Bits(4), Val(0x7FFFFFFF), Bits(32));
+
+  BitWriter writer;
+  BitWriter::Allotment allotment(
+      &writer, RoundUpBitsToByteMultiple(U32Coder::MaxEncodedBits(enc)));
+
+  size_t precheck_pos;
+  EXPECT_TRUE(U32Coder::CanEncode(enc, value, &precheck_pos));
+  EXPECT_EQ(expected_bits_written, precheck_pos);
+
+  EXPECT_TRUE(U32Coder::Write(enc, value, &writer));
+  EXPECT_EQ(expected_bits_written, writer.BitsWritten());
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  const uint32_t decoded_value = U32Coder::Read(enc, &reader);
+  EXPECT_EQ(value, decoded_value);
+  EXPECT_TRUE(reader.Close());
+}
+
+TEST(FieldsTest, U32CoderTest) {
+  TestU32Coder(0, 2);
+  TestU32Coder(1, 6);
+  TestU32Coder(15, 6);
+  TestU32Coder(0x7FFFFFFF, 2);
+  TestU32Coder(128, 34);
+  TestU32Coder(0x7FFFFFFEu, 34);
+  TestU32Coder(0x80000000u, 34);
+  TestU32Coder(0xFFFFFFFFu, 34);
+}
+
+void TestU64Coder(const uint64_t value, const size_t expected_bits_written) {
+  BitWriter writer;
+  BitWriter::Allotment allotment(
+      &writer, RoundUpBitsToByteMultiple(U64Coder::MaxEncodedBits()));
+
+  size_t precheck_pos;
+  EXPECT_TRUE(U64Coder::CanEncode(value, &precheck_pos));
+  EXPECT_EQ(expected_bits_written, precheck_pos);
+
+  EXPECT_TRUE(U64Coder::Write(value, &writer));
+  EXPECT_EQ(expected_bits_written, writer.BitsWritten());
+
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  const uint64_t decoded_value = U64Coder::Read(&reader);
+  EXPECT_EQ(value, decoded_value);
+  EXPECT_TRUE(reader.Close());
+}
+
+TEST(FieldsTest, U64CoderTest) {
+  // Values that should take 2 bits (selector 00): 0
+  TestU64Coder(0, 2);
+
+  // Values that should take 6 bits (2 for selector, 4 for value): 1..16
+  TestU64Coder(1, 6);
+  TestU64Coder(2, 6);
+  TestU64Coder(8, 6);
+  TestU64Coder(15, 6);
+  TestU64Coder(16, 6);
+
+  // Values that should take 10 bits (2 for selector, 8 for value): 17..272
+  TestU64Coder(17, 10);
+  TestU64Coder(18, 10);
+  TestU64Coder(100, 10);
+  TestU64Coder(271, 10);
+  TestU64Coder(272, 10);
+
+  // Values that should take 15 bits (2 for selector, 12 for value, 1 for varint
+  // end): (0)..273..4095
+  TestU64Coder(273, 15);
+  TestU64Coder(274, 15);
+  TestU64Coder(1000, 15);
+  TestU64Coder(4094, 15);
+  TestU64Coder(4095, 15);
+
+  // Take 24 bits (of which 20 actual value): (0)..4096..1048575
+  TestU64Coder(4096, 24);
+  TestU64Coder(4097, 24);
+  TestU64Coder(10000, 24);
+  TestU64Coder(1048574, 24);
+  TestU64Coder(1048575, 24);
+
+  // Take 33 bits (of which 28 actual value): (0)..1048576..268435455
+  TestU64Coder(1048576, 33);
+  TestU64Coder(1048577, 33);
+  TestU64Coder(10000000, 33);
+  TestU64Coder(268435454, 33);
+  TestU64Coder(268435455, 33);
+
+  // Take 42 bits (of which 36 actual value): (0)..268435456..68719476735
+  TestU64Coder(268435456ull, 42);
+  TestU64Coder(268435457ull, 42);
+  TestU64Coder(1000000000ull, 42);
+  TestU64Coder(68719476734ull, 42);
+  TestU64Coder(68719476735ull, 42);
+
+  // Take 51 bits (of which 44 actual value): (0)..68719476736..17592186044415
+  TestU64Coder(68719476736ull, 51);
+  TestU64Coder(68719476737ull, 51);
+  TestU64Coder(1000000000000ull, 51);
+  TestU64Coder(17592186044414ull, 51);
+  TestU64Coder(17592186044415ull, 51);
+
+  // Take 60 bits (of which 52 actual value):
+  // (0)..17592186044416..4503599627370495
+  TestU64Coder(17592186044416ull, 60);
+  TestU64Coder(17592186044417ull, 60);
+  TestU64Coder(100000000000000ull, 60);
+  TestU64Coder(4503599627370494ull, 60);
+  TestU64Coder(4503599627370495ull, 60);
+
+  // Take 69 bits (of which 60 actual value):
+  // (0)..4503599627370496..1152921504606846975
+  TestU64Coder(4503599627370496ull, 69);
+  TestU64Coder(4503599627370497ull, 69);
+  TestU64Coder(10000000000000000ull, 69);
+  TestU64Coder(1152921504606846974ull, 69);
+  TestU64Coder(1152921504606846975ull, 69);
+
+  // Take 73 bits (of which 64 actual value):
+  // (0)..1152921504606846976..18446744073709551615
+  TestU64Coder(1152921504606846976ull, 73);
+  TestU64Coder(1152921504606846977ull, 73);
+  TestU64Coder(10000000000000000000ull, 73);
+  TestU64Coder(18446744073709551614ull, 73);
+  TestU64Coder(18446744073709551615ull, 73);
+}
+
+Status TestF16Coder(const float value) {
+  size_t max_encoded_bits;
+  // It is not a fatal error if it can't be encoded.
+  if (!F16Coder::CanEncode(value, &max_encoded_bits)) return false;
+  EXPECT_EQ(F16Coder::MaxEncodedBits(), max_encoded_bits);
+
+  BitWriter writer;
+  BitWriter::Allotment allotment(&writer,
+                                 RoundUpBitsToByteMultiple(max_encoded_bits));
+
+  EXPECT_TRUE(F16Coder::Write(value, &writer));
+  EXPECT_EQ(F16Coder::MaxEncodedBits(), writer.BitsWritten());
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  float decoded_value;
+  EXPECT_TRUE(F16Coder::Read(&reader, &decoded_value));
+  // All values we test can be represented exactly.
+  EXPECT_EQ(value, decoded_value);
+  EXPECT_TRUE(reader.Close());
+  return true;
+}
+
+TEST(FieldsTest, F16CoderTest) {
+  for (float sign : {-1.0f, 1.0f}) {
+    // (magnitudes below 2^-14 = 1/16384 would be binary16 subnormals)
+    for (float mag : {0.0f, 0.5f, 1.0f, 2.0f, 2.5f, 16.015625f, 1.0f / 4096,
+                      1.0f / 16384, 65504.0f}) {
+      EXPECT_TRUE(TestF16Coder(sign * mag));
+    }
+  }
+
+  // Out of range
+  EXPECT_FALSE(TestF16Coder(65504.01f));
+  EXPECT_FALSE(TestF16Coder(-65505.0f));
+}
+
+// Ensures Read(Write()) returns the same fields.
+TEST(FieldsTest, TestRoundtripSize) {
+  for (int i = 0; i < 8; i++) {
+    SizeHeader size;
+    ASSERT_TRUE(size.Set(123 + 77 * i, 7 + i));
+
+    size_t extension_bits = 999, total_bits = 999;  // Initialize as garbage.
+    ASSERT_TRUE(Bundle::CanEncode(size, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+
+    BitWriter writer;
+    ASSERT_TRUE(WriteSizeHeader(size, &writer, 0, nullptr));
+    EXPECT_EQ(total_bits, writer.BitsWritten());
+    writer.ZeroPadToByte();
+
+    SizeHeader size2;
+    BitReader reader(writer.GetSpan());
+    ASSERT_TRUE(ReadSizeHeader(&reader, &size2));
+    EXPECT_EQ(total_bits, reader.TotalBitsConsumed());
+    EXPECT_TRUE(reader.Close());
+
+    EXPECT_EQ(size.xsize(), size2.xsize());
+    EXPECT_EQ(size.ysize(), size2.ysize());
+  }
+}
+
+// Ensure all values can be reached by the encoding.
+TEST(FieldsTest, TestCropRect) {
+  CodecMetadata metadata;
+  for (int32_t i = -999; i < 19000; ++i) {
+    FrameHeader f(&metadata);
+    f.custom_size_or_origin = true;
+    f.frame_origin.x0 = i;
+    f.frame_origin.y0 = i;
+    f.frame_size.xsize = 1000 + i;
+    f.frame_size.ysize = 1000 + i;
+    size_t extension_bits = 0, total_bits = 0;
+    ASSERT_TRUE(Bundle::CanEncode(f, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+    EXPECT_GE(total_bits, 9u);
+  }
+}
+TEST(FieldsTest, TestPreview) {
+  // (div8 cannot represent 4360, but !div8 can go a little higher)
+  for (uint32_t i = 1; i < 4360; ++i) {
+    PreviewHeader p;
+    ASSERT_TRUE(p.Set(i, i));
+    size_t extension_bits = 0, total_bits = 0;
+    ASSERT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+    EXPECT_GE(total_bits, 6u);
+  }
+}
+
+// Ensures Read(Write()) returns the same fields.
+TEST(FieldsTest, TestRoundtripFrame) {
+  CodecMetadata metadata;
+  FrameHeader h(&metadata);
+  h.extensions = 0x800;
+
+  size_t extension_bits = 999, total_bits = 999;  // Initialize as garbage.
+  ASSERT_TRUE(Bundle::CanEncode(h, &extension_bits, &total_bits));
+  EXPECT_EQ(0u, extension_bits);
+  BitWriter writer;
+  ASSERT_TRUE(WriteFrameHeader(h, &writer, nullptr));
+  EXPECT_EQ(total_bits, writer.BitsWritten());
+  writer.ZeroPadToByte();
+
+  FrameHeader h2(&metadata);
+  BitReader reader(writer.GetSpan());
+  ASSERT_TRUE(ReadFrameHeader(&reader, &h2));
+  EXPECT_EQ(total_bits, reader.TotalBitsConsumed());
+  EXPECT_TRUE(reader.Close());
+
+  EXPECT_EQ(h.extensions, h2.extensions);
+  EXPECT_EQ(h.flags, h2.flags);
+}
+
+#ifndef JXL_CRASH_ON_ERROR
+// Ensure out-of-bounds values cause an error.
+TEST(FieldsTest, TestOutOfRange) {
+  SizeHeader h;
+  ASSERT_TRUE(h.Set(0xFFFFFFFFull, 0xFFFFFFFFull));
+  size_t extension_bits = 999, total_bits = 999;  // Initialize as garbage.
+  ASSERT_FALSE(Bundle::CanEncode(h, &extension_bits, &total_bits));
+}
+#endif
+
+struct OldBundle : public Fields {
+  OldBundle() { Bundle::Init(this); }
+  JXL_FIELDS_NAME(OldBundle)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large));
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+    return visitor->EndExtensions();
+  }
+
+  uint32_t old_small;
+  float old_f;
+  uint32_t old_large;
+  uint64_t extensions;
+};
+
+struct NewBundle : public Fields {
+  NewBundle() { Bundle::Init(this); }
+  JXL_FIELDS_NAME(NewBundle)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large));
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+    if (visitor->Conditional(extensions & 1)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(2), Bits(2), Bits(3), Bits(4), 2, &new_small));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(-2.0f, &new_f));
+    }
+    if (visitor->Conditional(extensions & 2)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Bits(9), Bits(12), Bits(16), Bits(32), 0, &new_large));
+    }
+    return visitor->EndExtensions();
+  }
+
+  uint32_t old_small;
+  float old_f;
+  uint32_t old_large;
+  uint64_t extensions;
+
+  // If extensions & 1
+  uint32_t new_small = 2;
+  float new_f = -2.0f;
+  // If extensions & 2
+  uint32_t new_large = 0;
+};
+
+TEST(FieldsTest, TestNewDecoderOldData) {
+  OldBundle old_bundle;
+  old_bundle.old_large = 123;
+  old_bundle.old_f = 3.75f;
+  old_bundle.extensions = 0;
+
+  // Write to bit stream
+  const size_t kMaxOutBytes = 999;
+  BitWriter writer;
+  // Make sure values are initialized by code under test.
+  size_t extension_bits = 12345, total_bits = 12345;
+  ASSERT_TRUE(Bundle::CanEncode(old_bundle, &extension_bits, &total_bits));
+  ASSERT_LE(total_bits, kMaxOutBytes * kBitsPerByte);
+  EXPECT_EQ(0u, extension_bits);
+  AuxOut aux_out;
+  ASSERT_TRUE(Bundle::Write(old_bundle, &writer, kLayerHeader, &aux_out));
+
+  BitWriter::Allotment allotment(&writer,
+                                 kMaxOutBytes * kBitsPerByte - total_bits);
+  writer.Write(20, 0xA55A);  // sentinel
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr);
+
+  ASSERT_LE(writer.GetSpan().size(), kMaxOutBytes);
+  BitReader reader(writer.GetSpan());
+  NewBundle new_bundle;
+  ASSERT_TRUE(Bundle::Read(&reader, &new_bundle));
+  EXPECT_EQ(reader.TotalBitsConsumed(),
+            aux_out.layers[kLayerHeader].total_bits);
+  EXPECT_EQ(reader.ReadBits(20), 0xA55Au);
+  EXPECT_TRUE(reader.Close());
+
+  // Old fields are the same in both
+  EXPECT_EQ(old_bundle.extensions, new_bundle.extensions);
+  EXPECT_EQ(old_bundle.old_small, new_bundle.old_small);
+  EXPECT_EQ(old_bundle.old_f, new_bundle.old_f);
+  EXPECT_EQ(old_bundle.old_large, new_bundle.old_large);
+  // New fields match their defaults
+  EXPECT_EQ(2u, new_bundle.new_small);
+  EXPECT_EQ(-2.0f, new_bundle.new_f);
+  EXPECT_EQ(0u, new_bundle.new_large);
+}
+
+TEST(FieldsTest, TestOldDecoderNewData) {
+  NewBundle new_bundle;
+  new_bundle.old_large = 123;
+  new_bundle.extensions = 3;
+  new_bundle.new_f = 999.0f;
+  new_bundle.new_large = 456;
+
+  // Write to bit stream
+  constexpr size_t kMaxOutBytes = 999;
+  BitWriter writer;
+  // Make sure values are initialized by code under test.
+  size_t extension_bits = 12345, total_bits = 12345;
+  ASSERT_TRUE(Bundle::CanEncode(new_bundle, &extension_bits, &total_bits));
+  EXPECT_NE(0u, extension_bits);
+  AuxOut aux_out;
+  ASSERT_TRUE(Bundle::Write(new_bundle, &writer, kLayerHeader, &aux_out));
+  ASSERT_LE(aux_out.layers[kLayerHeader].total_bits,
+            kMaxOutBytes * kBitsPerByte);
+
+  BitWriter::Allotment allotment(
+      &writer,
+      kMaxOutBytes * kBitsPerByte - aux_out.layers[kLayerHeader].total_bits);
+  // Ensure Read skips the additional fields
+  writer.Write(20, 0xA55A);  // sentinel
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  OldBundle old_bundle;
+  ASSERT_TRUE(Bundle::Read(&reader, &old_bundle));
+  EXPECT_EQ(reader.TotalBitsConsumed(),
+            aux_out.layers[kLayerHeader].total_bits);
+  EXPECT_EQ(reader.ReadBits(20), 0xA55Au);
+  EXPECT_TRUE(reader.Close());
+
+  // Old fields are the same in both
+  EXPECT_EQ(new_bundle.extensions, old_bundle.extensions);
+  EXPECT_EQ(new_bundle.old_small, old_bundle.old_small);
+  EXPECT_EQ(new_bundle.old_f, old_bundle.old_f);
+  EXPECT_EQ(new_bundle.old_large, old_bundle.old_large);
+  // (Can't check new fields because old decoder doesn't know about them)
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/frame_header.cc b/third_party/jpeg-xl/lib/jxl/frame_header.cc
new file mode 100644
index 0000000000..475ce8e05e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/frame_header.cc
@@ -0,0 +1,494 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/frame_header.h"
+
+#include <sstream>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+constexpr uint8_t YCbCrChromaSubsampling::kHShift[] = {0, 1, 1, 0};
+constexpr uint8_t YCbCrChromaSubsampling::kVShift[] = {0, 1, 0, 1};
+
+static Status VisitBlendMode(Visitor* JXL_RESTRICT visitor,
+                             BlendMode default_value, BlendMode* blend_mode) {
+  uint32_t encoded = static_cast<uint32_t>(*blend_mode);
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+      Val(static_cast<uint32_t>(BlendMode::kReplace)),
+      Val(static_cast<uint32_t>(BlendMode::kAdd)),
+      Val(static_cast<uint32_t>(BlendMode::kBlend)), BitsOffset(2, 3),
+      static_cast<uint32_t>(default_value), &encoded));
+  if (encoded > 4) {
+    return JXL_FAILURE("Invalid blend_mode");
+  }
+  *blend_mode = static_cast<BlendMode>(encoded);
+  return true;
+}
+
+static Status VisitFrameType(Visitor* JXL_RESTRICT visitor,
+                             FrameType default_value, FrameType* frame_type) {
+  uint32_t encoded = static_cast<uint32_t>(*frame_type);
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(static_cast<uint32_t>(FrameType::kRegularFrame)),
+                   Val(static_cast<uint32_t>(FrameType::kDCFrame)),
+                   Val(static_cast<uint32_t>(FrameType::kReferenceOnly)),
+                   Val(static_cast<uint32_t>(FrameType::kSkipProgressive)),
+                   static_cast<uint32_t>(default_value), &encoded));
+  *frame_type = static_cast<FrameType>(encoded);
+  return true;
+}
+
+BlendingInfo::BlendingInfo() { Bundle::Init(this); }
+
+Status BlendingInfo::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(
+      VisitBlendMode(visitor, BlendMode::kReplace, &mode));
+  if (visitor->Conditional(nonserialized_num_extra_channels > 0 &&
+                           (mode == BlendMode::kBlend ||
+                            mode == BlendMode::kAlphaWeightedAdd))) {
+    // Up to 11 alpha channels for blending.
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(0), Val(1), Val(2), BitsOffset(3, 3), 0, &alpha_channel));
+    if (visitor->IsReading() &&
+        alpha_channel >= nonserialized_num_extra_channels) {
+      return JXL_FAILURE("Invalid alpha channel for blending");
+    }
+  }
+  if (visitor->Conditional((nonserialized_num_extra_channels > 0 &&
+                            (mode == BlendMode::kBlend ||
+                             mode == BlendMode::kAlphaWeightedAdd)) ||
+                           mode == BlendMode::kMul)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &clamp));
+  }
+  // 'old' frame for blending. Only necessary if this is not a full frame, or
+  // blending is not kReplace.
+  if (visitor->Conditional(mode != BlendMode::kReplace ||
+                           nonserialized_is_partial_frame)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &source));
+  }
+  return true;
+}
+
+std::string BlendingInfo::DebugString() const {
+  std::ostringstream os;
+  os << (mode == BlendMode::kReplace            ? "Replace"
+         : mode == BlendMode::kAdd              ? "Add"
+         : mode == BlendMode::kBlend            ? "Blend"
+         : mode == BlendMode::kAlphaWeightedAdd ? "AlphaWeightedAdd"
+                                                : "Mul");
+  if (nonserialized_num_extra_channels > 0 &&
+      (mode == BlendMode::kBlend || mode == BlendMode::kAlphaWeightedAdd)) {
+    os << ",alpha=" << alpha_channel << ",clamp=" << clamp;
+  } else if (mode == BlendMode::kMul) {
+    os << ",clamp=" << clamp;
+  }
+  if (mode != BlendMode::kReplace || nonserialized_is_partial_frame) {
+    os << ",source=" << source;
+  }
+  return os.str();
+}
+
+AnimationFrame::AnimationFrame(const CodecMetadata* metadata)
+    : nonserialized_metadata(metadata) {
+  Bundle::Init(this);
+}
+Status AnimationFrame::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->Conditional(nonserialized_metadata != nullptr &&
+                           nonserialized_metadata->m.have_animation)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), Bits(8), Bits(32), 0, &duration));
+  }
+
+  if (visitor->Conditional(
+          nonserialized_metadata != nullptr &&
+          nonserialized_metadata->m.animation.have_timecodes)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(32, 0, &timecode));
+  }
+  return true;
+}
+
+YCbCrChromaSubsampling::YCbCrChromaSubsampling() { Bundle::Init(this); }
+Passes::Passes() { Bundle::Init(this); }
+Status Passes::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(1), Val(2), Val(3), BitsOffset(3, 4), 1, &num_passes));
+  JXL_ASSERT(num_passes <= kMaxNumPasses);  // Cannot happen when reading
+
+  if (visitor->Conditional(num_passes != 1)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(0), Val(1), Val(2), BitsOffset(1, 3), 0, &num_downsample));
+    JXL_ASSERT(num_downsample <= 4);  // 1,2,4,8
+    if (num_downsample > num_passes) {
+      return JXL_FAILURE("num_downsample %u > num_passes %u", num_downsample,
+                         num_passes);
+    }
+
+    for (uint32_t i = 0; i < num_passes - 1; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &shift[i]));
+    }
+    shift[num_passes - 1] = 0;
+
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &downsample[i]));
+      if (i > 0 && downsample[i] >= downsample[i - 1]) {
+        return JXL_FAILURE("downsample sequence should be decreasing");
+      }
+    }
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(0), Val(1), Val(2), Bits(3), 0, &last_pass[i]));
+      if (i > 0 && last_pass[i] <= last_pass[i - 1]) {
+        return JXL_FAILURE("last_pass sequence should be increasing");
+      }
+      if (last_pass[i] >= num_passes) {
+        return JXL_FAILURE("last_pass %u >= num_passes %u", last_pass[i],
+                           num_passes);
+      }
+    }
+  }
+
+  return true;
+}
+
+std::string Passes::DebugString() const {
+  std::ostringstream os;
+  os << "p=" << num_passes;
+  if (num_downsample) {
+    os << ",ds=";
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      os << last_pass[i] << ":" << downsample[i];
+      if (i + 1 < num_downsample) os << ";";
+    }
+  }
+  bool have_shifts = false;
+  for (uint32_t i = 0; i < num_passes; ++i) {
+    if (shift[i]) have_shifts = true;
+  }
+  if (have_shifts) {
+    os << ",shifts=";
+    for (uint32_t i = 0; i < num_passes; ++i) {
+      os << shift[i];
+      if (i + 1 < num_passes) os << ";";
+    }
+  }
+  return os.str();
+}
+
+FrameHeader::FrameHeader(const CodecMetadata* metadata)
+    : animation_frame(metadata), nonserialized_metadata(metadata) {
+  Bundle::Init(this);
+}
+
+Status ReadFrameHeader(BitReader* JXL_RESTRICT reader,
+                       FrameHeader* JXL_RESTRICT frame) {
+  return Bundle::Read(reader, frame);
+}
+
+Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      VisitFrameType(visitor, FrameType::kRegularFrame, &frame_type));
+  if (visitor->IsReading() && nonserialized_is_preview &&
+      frame_type != kRegularFrame) {
+    return JXL_FAILURE("Only regular frame could be a preview");
+  }
+
+  // FrameEncoding.
+  bool is_modular = (encoding == FrameEncoding::kModular);
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &is_modular));
+  encoding = (is_modular ? FrameEncoding::kModular : FrameEncoding::kVarDCT);
+
+  // Flags
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U64(0, &flags));
+
+  // Color transform
+  bool xyb_encoded = nonserialized_metadata == nullptr ||
+                     nonserialized_metadata->m.xyb_encoded;
+
+  if (xyb_encoded) {
+    color_transform = ColorTransform::kXYB;
+  } else {
+    // Alternate if kYCbCr.
+    bool alternate = color_transform == ColorTransform::kYCbCr;
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alternate));
+    color_transform =
+        (alternate ? ColorTransform::kYCbCr : ColorTransform::kNone);
+  }
+
+  // Chroma subsampling for YCbCr, if no DC frame is used.
+  if (visitor->Conditional(color_transform == ColorTransform::kYCbCr &&
+                           ((flags & kUseDcFrame) == 0))) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&chroma_subsampling));
+  }
+
+  size_t num_extra_channels =
+      nonserialized_metadata != nullptr
+          ? nonserialized_metadata->m.extra_channel_info.size()
+          : 0;
+
+  // Upsampling
+  if (visitor->Conditional((flags & kUseDcFrame) == 0)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &upsampling));
+    if (nonserialized_metadata != nullptr &&
+        visitor->Conditional(num_extra_channels != 0)) {
+      const std::vector<ExtraChannelInfo>& extra_channels =
+          nonserialized_metadata->m.extra_channel_info;
+      extra_channel_upsampling.resize(extra_channels.size(), 1);
+      for (size_t i = 0; i < extra_channels.size(); ++i) {
+        uint32_t dim_shift =
+            nonserialized_metadata->m.extra_channel_info[i].dim_shift;
+        uint32_t& ec_upsampling = extra_channel_upsampling[i];
+        ec_upsampling >>= dim_shift;
+        JXL_QUIET_RETURN_IF_ERROR(
+            visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &ec_upsampling));
+        ec_upsampling <<= dim_shift;
+        if (ec_upsampling < upsampling) {
+          return JXL_FAILURE(
+              "EC upsampling (%u) < color upsampling (%u), which is invalid.",
+              ec_upsampling, upsampling);
+        }
+        if (ec_upsampling > 8) {
+          return JXL_FAILURE("EC upsampling too large (%u)", ec_upsampling);
+        }
+      }
+    } else {
+      extra_channel_upsampling.clear();
+    }
+  }
+
+  // Modular- or VarDCT-specific data.
+  if (visitor->Conditional(encoding == FrameEncoding::kModular)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 1, &group_size_shift));
+  }
+  if (visitor->Conditional(encoding == FrameEncoding::kVarDCT &&
+                           color_transform == ColorTransform::kXYB)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 3, &x_qm_scale));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 2, &b_qm_scale));
+  } else {
+    x_qm_scale = b_qm_scale = 2;  // noop
+  }
+
+  // Not useful for kReferenceOnly frames.
+  if (visitor->Conditional(frame_type != FrameType::kReferenceOnly)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&passes));
+  }
+
+  if (visitor->Conditional(frame_type == FrameType::kDCFrame)) {
+    // Up to 4 pyramid levels - for up to 16384x downsampling.
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &dc_level));
+  }
+  if (frame_type != FrameType::kDCFrame) {
+    dc_level = 0;
+  }
+
+  bool is_partial_frame = false;
+  if (visitor->Conditional(frame_type != FrameType::kDCFrame)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &custom_size_or_origin));
+    if (visitor->Conditional(custom_size_or_origin)) {
+      const U32Enc enc(Bits(8), BitsOffset(11, 256), BitsOffset(14, 2304),
+                       BitsOffset(30, 18688));
+      // Frame offset, only if kRegularFrame or kSkipProgressive.
+      if (visitor->Conditional(frame_type == FrameType::kRegularFrame ||
+                               frame_type == FrameType::kSkipProgressive)) {
+        uint32_t ux0 = PackSigned(frame_origin.x0);
+        uint32_t uy0 = PackSigned(frame_origin.y0);
+        JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &ux0));
+        JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &uy0));
+        frame_origin.x0 = UnpackSigned(ux0);
+        frame_origin.y0 = UnpackSigned(uy0);
+      }
+      // Frame size
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.xsize));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.ysize));
+      if (custom_size_or_origin &&
+          (frame_size.xsize == 0 || frame_size.ysize == 0)) {
+        return JXL_FAILURE(
+            "Invalid crop dimensions for frame: zero width or height");
+      }
+      int32_t image_xsize = default_xsize();
+      int32_t image_ysize = default_ysize();
+      if (frame_type == FrameType::kRegularFrame ||
+          frame_type == FrameType::kSkipProgressive) {
+        is_partial_frame |= frame_origin.x0 > 0;
+        is_partial_frame |= frame_origin.y0 > 0;
+        is_partial_frame |= (static_cast<int32_t>(frame_size.xsize) +
+                             frame_origin.x0) < image_xsize;
+        is_partial_frame |= (static_cast<int32_t>(frame_size.ysize) +
+                             frame_origin.y0) < image_ysize;
+      }
+    }
+  }
+
+  // Blending info, animation info and whether this is the last frame or not.
+  if (visitor->Conditional(frame_type == FrameType::kRegularFrame ||
+                           frame_type == FrameType::kSkipProgressive)) {
+    blending_info.nonserialized_num_extra_channels = num_extra_channels;
+    blending_info.nonserialized_is_partial_frame = is_partial_frame;
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blending_info));
+    bool replace_all = (blending_info.mode == BlendMode::kReplace);
+    extra_channel_blending_info.resize(num_extra_channels);
+    for (size_t i = 0; i < num_extra_channels; i++) {
+      auto& ec_blending_info = extra_channel_blending_info[i];
+      ec_blending_info.nonserialized_is_partial_frame = is_partial_frame;
+      ec_blending_info.nonserialized_num_extra_channels = num_extra_channels;
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&ec_blending_info));
+      replace_all &= (ec_blending_info.mode == BlendMode::kReplace);
+    }
+    if (visitor->IsReading() && nonserialized_is_preview) {
+      if (!replace_all || custom_size_or_origin) {
+        return JXL_FAILURE("Preview is not compatible with blending");
+      }
+    }
+    if (visitor->Conditional(nonserialized_metadata != nullptr &&
+                             nonserialized_metadata->m.have_animation)) {
+      animation_frame.nonserialized_metadata = nonserialized_metadata;
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation_frame));
+    }
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &is_last));
+  }
+  if (frame_type != FrameType::kRegularFrame) {
+    is_last = false;
+  }
+
+  // ID that can be used to refer to this frame. 0 for a non-zero-duration
+  // frame means that it will not be referenced. Not necessary for the last
+  // frame.
+  if (visitor->Conditional(frame_type != kDCFrame && !is_last)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &save_as_reference));
+  }
+
+  // If this frame is not blended on another frame post-color-transform, it may
+  // be stored for being referenced either before or after the color transform.
+  // If it is blended post-color-transform, it must be blended after. It must
+  // also be blended after if this is a kRegular frame that does not cover the
+  // full frame, as samples outside the partial region are from a
+  // post-color-transform frame.
+  if (frame_type != FrameType::kDCFrame) {
+    if (visitor->Conditional(CanBeReferenced() &&
+                             blending_info.mode == BlendMode::kReplace &&
+                             !is_partial_frame &&
+                             (frame_type == FrameType::kRegularFrame ||
+                              frame_type == FrameType::kSkipProgressive))) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Bool(false, &save_before_color_transform));
+    } else if (visitor->Conditional(frame_type == FrameType::kReferenceOnly)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Bool(true, &save_before_color_transform));
+      if (!save_before_color_transform &&
+          (frame_size.xsize < nonserialized_metadata->xsize() ||
+           frame_size.ysize < nonserialized_metadata->ysize() ||
+           frame_origin.x0 != 0 || frame_origin.y0 != 0)) {
+        return JXL_FAILURE(
+            "non-patch reference frame with invalid crop: %" PRIuS "x%" PRIuS
+            "%+d%+d",
+            static_cast<size_t>(frame_size.xsize),
+            static_cast<size_t>(frame_size.ysize),
+            static_cast<int>(frame_origin.x0),
+            static_cast<int>(frame_origin.y0));
+      }
+    }
+  } else {
+    save_before_color_transform = true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name));
+
+  loop_filter.nonserialized_is_modular = is_modular;
+  JXL_RETURN_IF_ERROR(visitor->VisitNested(&loop_filter));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+  // Extensions: in chronological order of being added to the format.
+  return visitor->EndExtensions();
+}
+
+std::string FrameHeader::DebugString() const {
+  std::ostringstream os;
+  os << (encoding == FrameEncoding::kVarDCT ? "VarDCT" : "Modular");
+  os << ",";
+  os << (frame_type == FrameType::kRegularFrame    ? "Regular"
+         : frame_type == FrameType::kDCFrame       ? "DC"
+         : frame_type == FrameType::kReferenceOnly ? "Reference"
+                                                   : "SkipProgressive");
+  if (frame_type == FrameType::kDCFrame) {
+    os << "(lv" << dc_level << ")";
+  }
+
+  if (flags) {
+    os << ",";
+    uint64_t remaining = flags;  // same width as `flags` (visited as U64)
+    // TEST_FLAG prints a set flag, then "|" while other set bits remain.
+#define TEST_FLAG(name)           \
+  if (flags & Flags::k##name) {   \
+    remaining &= ~Flags::k##name; \
+    os << #name;                  \
+    if (remaining) os << "|";     \
+  }
+    TEST_FLAG(Noise);
+    TEST_FLAG(Patches);
+    TEST_FLAG(Splines);
+    TEST_FLAG(UseDcFrame);
+    TEST_FLAG(SkipAdaptiveDCSmoothing);
+#undef TEST_FLAG
+  }
+
+  os << ",";
+  os << (color_transform == ColorTransform::kXYB     ? "XYB"
+         : color_transform == ColorTransform::kYCbCr ? "YCbCr"
+                                                     : "None");
+
+  if (encoding == FrameEncoding::kModular) {
+    os << ",shift=" << group_size_shift;
+  } else if (color_transform == ColorTransform::kXYB) {
+    os << ",qm=" << x_qm_scale << ";" << b_qm_scale;
+  }
+  if (frame_type != FrameType::kReferenceOnly) {
+    os << "," << passes.DebugString();
+  }
+  if (custom_size_or_origin) {
+    os << ",xs=" << frame_size.xsize;
+    os << ",ys=" << frame_size.ysize;
+    if (frame_type == FrameType::kRegularFrame ||
+        frame_type == FrameType::kSkipProgressive) {
+      os << ",x0=" << frame_origin.x0;
+      os << ",y0=" << frame_origin.y0;
+    }
+  }
+  if (upsampling > 1) os << ",up=" << upsampling;
+  if (loop_filter.gab) os << ",Gaborish";
+  if (loop_filter.epf_iters > 0) os << ",epf=" << loop_filter.epf_iters;
+  if (animation_frame.duration > 0) os << ",dur=" << animation_frame.duration;
+  if (frame_type == FrameType::kRegularFrame ||
+      frame_type == FrameType::kSkipProgressive) {
+    os << ",";
+    os << blending_info.DebugString();
+    for (size_t i = 0; i < extra_channel_blending_info.size(); ++i) {
+      os << (i == 0 ? "[" : ";");
+      os << extra_channel_blending_info[i].DebugString();
+      if (i + 1 == extra_channel_blending_info.size()) os << "]";
+    }
+  }
+  if (save_as_reference > 0) os << ",ref=" << save_as_reference;
+  os << "," << (save_before_color_transform ? "before" : "after") << "_ct";
+  if (is_last) os << ",last";
+  return os.str();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/frame_header.h b/third_party/jpeg-xl/lib/jxl/frame_header.h
new file mode 100644
index 0000000000..5580bcd6fe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/frame_header.h
@@ -0,0 +1,506 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FRAME_HEADER_H_
+#define LIB_JXL_FRAME_HEADER_H_
+
+// Frame header with backward and forward-compatible extension capability and
+// compressed integer fields.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/loop_filter.h"
+
+namespace jxl {
+
+// TODO(eustas): move to proper place?
+// Also used by extra channel names.
+static inline Status VisitNameString(Visitor* JXL_RESTRICT visitor,
+                                     std::string* name) {
+  uint32_t name_length = static_cast<uint32_t>(name->length());
+  // Allows layer name lengths up to 1071 bytes
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Bits(4), BitsOffset(5, 16),
+                                         BitsOffset(10, 48), 0, &name_length));
+  if (visitor->IsReading()) {
+    name->resize(name_length);
+  }
+  for (size_t i = 0; i < name_length; i++) {
+    uint32_t c = static_cast<uint8_t>((*name)[i]);
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(8, 0, &c));
+    (*name)[i] = static_cast<char>(c);
+  }
+  return true;
+}
+
+enum class FrameEncoding : uint32_t {
+  kVarDCT,
+  kModular,
+};
+
+enum class ColorTransform : uint32_t {
+  kXYB,    // Values are encoded with XYB. May only be used if
+           // ImageBundle::xyb_encoded.
+  kNone,   // Values are encoded according to the attached color profile. May
+           // only be used if !ImageBundle::xyb_encoded.
+  kYCbCr,  // Values are encoded according to the attached color profile, but
+           // transformed to YCbCr. May only be used if
+           // !ImageBundle::xyb_encoded.
+};
+
+inline std::array<int, 3> JpegOrder(ColorTransform ct, bool is_gray) {
+  if (is_gray) {
+    return {{0, 0, 0}};
+  }
+  JXL_ASSERT(ct != ColorTransform::kXYB);
+  if (ct == ColorTransform::kYCbCr) {
+    return {{1, 0, 2}};
+  } else {
+    return {{0, 1, 2}};
+  }
+}
+
+struct YCbCrChromaSubsampling : public Fields {
+  YCbCrChromaSubsampling();
+  JXL_FIELDS_NAME(YCbCrChromaSubsampling)
+  size_t HShift(size_t c) const { return maxhs_ - kHShift[channel_mode_[c]]; }
+  size_t VShift(size_t c) const { return maxvs_ - kVShift[channel_mode_[c]]; }
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override {
+    // TODO(veluca): consider allowing 4x downsamples
+    for (size_t i = 0; i < 3; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &channel_mode_[i]));
+    }
+    Recompute();
+    return true;
+  }
+
+  uint8_t MaxHShift() const { return maxhs_; }
+  uint8_t MaxVShift() const { return maxvs_; }
+
+  uint8_t RawHShift(size_t c) const { return kHShift[channel_mode_[c]]; }
+  uint8_t RawVShift(size_t c) const { return kVShift[channel_mode_[c]]; }
+
+  // Uses JPEG channel order (Y, Cb, Cr).
+  Status Set(const uint8_t* hsample, const uint8_t* vsample) {
+    for (size_t c = 0; c < 3; c++) {
+      size_t cjpeg = c < 2 ? c ^ 1 : c;
+      size_t i = 0;
+      for (; i < 4; i++) {
+        if (1 << kHShift[i] == hsample[cjpeg] &&
+            1 << kVShift[i] == vsample[cjpeg]) {
+          channel_mode_[c] = i;
+          break;
+        }
+      }
+      if (i == 4) {
+        return JXL_FAILURE("Invalid subsample mode");
+      }
+    }
+    Recompute();
+    return true;
+  }
+
+  bool Is444() const {
+    return HShift(0) == 0 && VShift(0) == 0 &&  // Cb
+           HShift(2) == 0 && VShift(2) == 0 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  bool Is420() const {
+    return HShift(0) == 1 && VShift(0) == 1 &&  // Cb
+           HShift(2) == 1 && VShift(2) == 1 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  bool Is422() const {
+    return HShift(0) == 1 && VShift(0) == 0 &&  // Cb
+           HShift(2) == 1 && VShift(2) == 0 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  bool Is440() const {
+    return HShift(0) == 0 && VShift(0) == 1 &&  // Cb
+           HShift(2) == 0 && VShift(2) == 1 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  std::string DebugString() const {
+    if (Is444()) return "444";
+    if (Is420()) return "420";
+    if (Is422()) return "422";
+    if (Is440()) return "440";
+    return "cs" + std::to_string(channel_mode_[0]) +
+           std::to_string(channel_mode_[1]) + std::to_string(channel_mode_[2]);
+  }
+
+ private:
+  void Recompute() {
+    maxhs_ = 0;
+    maxvs_ = 0;
+    for (size_t i = 0; i < 3; i++) {
+      maxhs_ = std::max(maxhs_, kHShift[channel_mode_[i]]);
+      maxvs_ = std::max(maxvs_, kVShift[channel_mode_[i]]);
+    }
+  }
+  static const uint8_t kHShift[4];
+  static const uint8_t kVShift[4];
+  uint32_t channel_mode_[3];
+  uint8_t maxhs_;
+  uint8_t maxvs_;
+};
+
+// Indicates how to combine the current frame with a previously-saved one. Can
+// be independently controlled for color and extra channels. Formulas are
+// indicative and treat alpha as if it is in range 0.0-1.0. In descriptions
+// below, alpha channel is the extra channel of type alpha used for blending
+// according to the blend_channel, or fully opaque if there is no alpha channel.
+// The blending specified here is used for performing blending *after* color
+// transforms - in linear sRGB if blending a XYB-encoded frame on another
+// XYB-encoded frame, in sRGB if blending a frame with kColorSpace == kSRGB, or
+// in the original colorspace otherwise. Blending in XYB or YCbCr is done by
+// using patches.
+enum class BlendMode {
+  // The new values (in the crop) replace the old ones: sample = new
+  kReplace = 0,
+  // The new values (in the crop) get added to the old ones: sample = old + new
+  kAdd = 1,
+  // The new values (in the crop) replace the old ones if alpha>0:
+  // For the alpha channel that is used as source:
+  // alpha = old + new * (1 - old)
+  // For other channels if !alpha_associated:
+  // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha
+  // For other channels if alpha_associated:
+  // sample = (1 - new_alpha) * old + new
+  // The alpha formula applies to the alpha used for the division in the other
+  // channels formula, and applies to the alpha channel itself if its
+  // blend_channel value matches itself.
+  kBlend = 2,
+  // The new values (in the crop) are added to the old ones if alpha>0:
+  // For the alpha channel that is used as source:
+  // sample = old + new * (1 - old)
+  // For other channels: sample = old + alpha * new
+  kAlphaWeightedAdd = 3,
+  // The new values (in the crop) get multiplied by the old ones:
+  // sample = old * new
+  // The range of the new value matters for multiplication purposes, and its
+  // nominal range of 0..1 is computed the same way as this is done for the
+  // alpha values in kBlend and kAlphaWeightedAdd.
+  // If using kMul as a blend mode for color channels, no color transform is
+  // performed on the current frame.
+  kMul = 4,
+};
+
+struct BlendingInfo : public Fields {
+  BlendingInfo();
+  JXL_FIELDS_NAME(BlendingInfo)
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+  BlendMode mode;
+  // Which extra channel to use as alpha channel for blending, only encoded
+  // for blend modes that involve alpha and if there are more than 1 extra
+  // channels.
+  uint32_t alpha_channel;
+  // Clamp alpha or channel values to 0-1 range.
+  bool clamp;
+  // Frame ID to copy from (0-3). Only encoded if blend_mode is not kReplace.
+  uint32_t source;
+
+  std::string DebugString() const;
+
+  size_t nonserialized_num_extra_channels = 0;
+  bool nonserialized_is_partial_frame = false;
+};
+
+// Origin of the current frame. NOTE(review): FrameType defines no
+// kOnlyPatches; presumably only kRegularFrame/kSkipProgressive carry it.
+struct FrameOrigin {
+  int32_t x0, y0;  // can be negative.
+};
+
+// Size of the current frame.
+struct FrameSize {
+  uint32_t xsize, ysize;
+};
+
+// AnimationFrame defines duration of animation frames.
+struct AnimationFrame : public Fields {
+  explicit AnimationFrame(const CodecMetadata* metadata);
+  JXL_FIELDS_NAME(AnimationFrame)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // How long to wait [in ticks, see Animation{}] after rendering.
+  // May be 0 if the current frame serves as a foundation for another frame.
+  uint32_t duration;
+
+  uint32_t timecode;  // 0xHHMMSSFF
+
+  // Must be set to the one ImageMetadata acting as the full codestream header,
+  // with correct xyb_encoded, list of extra channels, etc...
+  const CodecMetadata* nonserialized_metadata = nullptr;
+};
+
+// For decoding to lower resolutions. Only used for kRegular frames.
+struct Passes : public Fields {
+  Passes();
+  JXL_FIELDS_NAME(Passes)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  void GetDownsamplingBracket(size_t pass, int& minShift, int& maxShift) const {
+    maxShift = 2;
+    minShift = 3;
+    for (size_t i = 0;; i++) {
+      for (uint32_t j = 0; j < num_downsample; ++j) {
+        if (i == last_pass[j]) {
+          if (downsample[j] == 8) minShift = 3;
+          if (downsample[j] == 4) minShift = 2;
+          if (downsample[j] == 2) minShift = 1;
+          if (downsample[j] == 1) minShift = 0;
+        }
+      }
+      if (i == num_passes - 1) minShift = 0;
+      if (i == pass) return;
+      maxShift = minShift - 1;
+    }
+  }
+
+  uint32_t GetDownsamplingTargetForCompletedPasses(uint32_t num_p) const {
+    if (num_p >= num_passes) return 1;
+    uint32_t retval = 8;
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      if (num_p > last_pass[i]) {
+        retval = std::min(retval, downsample[i]);
+      }
+    }
+    return retval;
+  }
+
+  std::string DebugString() const;
+
+  uint32_t num_passes;      // <= kMaxNumPasses
+  uint32_t num_downsample;  // <= num_passes
+
+  // Array of num_downsample pairs. downsample=1/last_pass=num_passes-1 and
+  // downsample=8/last_pass=0 need not be specified; they are implicit.
+  uint32_t downsample[kMaxNumPasses];
+  uint32_t last_pass[kMaxNumPasses];
+  // Array of shift values for each pass. It is implicitly assumed to be 0 for
+  // the last pass.
+  uint32_t shift[kMaxNumPasses];
+};
+
+enum FrameType {
+  // A "regular" frame: might be a crop, and will be blended on a previous
+  // frame, if any, and displayed or blended in future frames.
+  kRegularFrame = 0,
+  // A DC frame: this frame is downsampled and will be *only* used as the DC of
+  // a future frame and, possibly, for previews. Cannot be cropped, blended, or
+  // referenced by patches or blending modes. Frames that *use* a DC frame
+  // cannot have non-default sizes either.
+  kDCFrame = 1,
+  // A PatchesSource frame: this frame will be only used as a source frame for
+  // taking patches. Can be cropped, but cannot have non-(0, 0) x0 and y0.
+  kReferenceOnly = 2,
+  // Same as kRegularFrame, but not used for progressive rendering. This also
+  // implies no early display of DC.
+  kSkipProgressive = 3,
+};
+
+// Image/frame := one or more of these, where the last has is_last = true.
+// Starts at a byte-aligned address "a"; the next pass starts at "a + size".
+struct FrameHeader : public Fields {
+  // Optional postprocessing steps. These flags are the source of truth;
+  // Override must set/clear them rather than change their meaning. Values
+  // chosen such that typical flags == 0 (encoded in only two bits).
+  enum Flags {
+    // Often but not always off => low bit value:
+
+    // Inject noise into decoded output.
+    kNoise = 1,
+
+    // Overlay patches.
+    kPatches = 2,
+
+    // 4, 8 = reserved for future sometimes-off
+
+    // Overlay splines.
+    kSplines = 16,
+
+    kUseDcFrame = 32,  // Implies kSkipAdaptiveDCSmoothing.
+
+    // 64 = reserved for future often-off
+
+    // Almost always on => negated:
+
+    kSkipAdaptiveDCSmoothing = 128,
+  };
+
+  explicit FrameHeader(const CodecMetadata* metadata);
+  JXL_FIELDS_NAME(FrameHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Sets/clears `flag` based upon `condition`.
+  void UpdateFlag(const bool condition, const uint64_t flag) {
+    if (condition) {
+      flags |= flag;
+    } else {
+      flags &= ~flag;
+    }
+  }
+
+  // Returns true if this frame is supposed to be saved for future usage by
+  // other frames.
+  bool CanBeReferenced() const {
+    // DC frames cannot be referenced. The last frame cannot be referenced. A
+    // duration 0 frame makes little sense if it is not referenced. A
+    // non-duration 0 frame may or may not be referenced.
+    return !is_last && frame_type != FrameType::kDCFrame &&
+           (animation_frame.duration == 0 || save_as_reference != 0);
+  }
+
+  mutable bool all_default;
+
+  // Always present
+  FrameEncoding encoding;
+  // Some versions of UBSAN complain in VisitFrameType if not initialized.
+  FrameType frame_type = FrameType::kRegularFrame;
+
+  uint64_t flags;
+
+  ColorTransform color_transform;
+  YCbCrChromaSubsampling chroma_subsampling;
+
+  uint32_t group_size_shift;  // only if encoding == kModular;
+
+  uint32_t x_qm_scale;  // only if VarDCT and color_transform == kXYB
+  uint32_t b_qm_scale;  // only if VarDCT and color_transform == kXYB
+
+  std::string name;
+
+  // Skipped for kReferenceOnly.
+  Passes passes;
+
+  // Skipped for kDCFrame
+  bool custom_size_or_origin;
+  FrameSize frame_size;
+
+  // upsampling factors for color and extra channels.
+  // Upsampling is always performed before applying any inverse color transform.
+  // Skipped (1) if kUseDCFrame
+  uint32_t upsampling;
+  std::vector<uint32_t> extra_channel_upsampling;
+
+  // Only for kRegular frames.
+  FrameOrigin frame_origin;
+
+  BlendingInfo blending_info;
+  std::vector<BlendingInfo> extra_channel_blending_info;
+
+  // Animation info for this frame.
+  AnimationFrame animation_frame;
+
+  // This is the last frame.
+  bool is_last;
+
+  // ID to refer to this frame with. 0-3, not present if kDCFrame.
+  // 0 has a special meaning for kRegular frames of nonzero duration: it defines
+  // a frame that will not be referenced in the future.
+  uint32_t save_as_reference;
+
+  // Whether to save this frame before or after the color transform. A frame
+  // that is saved before the color transform can only be used for blending
+  // through patches. On the contrary, a frame that is saved after the color
+  // transform can only be used for blending through blending modes.
+  // Irrelevant for extra channel blending. Can only be true if
+  // blending_info.mode == kReplace and this is not a partial kRegularFrame; if
+  // this is a DC frame, it is always true.
+  bool save_before_color_transform;
+
+  uint32_t dc_level;  // 1-4 if kDCFrame (0 otherwise).
+
+  // Must be set to the one ImageMetadata acting as the full codestream header,
+  // with correct xyb_encoded, list of extra channels, etc...
+  const CodecMetadata* nonserialized_metadata = nullptr;
+
+  // NOTE: This is ignored by AllDefault.
+  LoopFilter loop_filter;
+
+  bool nonserialized_is_preview = false;
+
+  size_t default_xsize() const {
+    if (!nonserialized_metadata) return 0;
+    if (nonserialized_is_preview) {
+      return nonserialized_metadata->m.preview_size.xsize();
+    }
+    return nonserialized_metadata->xsize();
+  }
+
+  size_t default_ysize() const {
+    if (!nonserialized_metadata) return 0;
+    if (nonserialized_is_preview) {
+      return nonserialized_metadata->m.preview_size.ysize();
+    }
+    return nonserialized_metadata->ysize();
+  }
+
+  FrameDimensions ToFrameDimensions() const {
+    size_t xsize = default_xsize();
+    size_t ysize = default_ysize();
+
+    xsize = frame_size.xsize ? frame_size.xsize : xsize;
+    ysize = frame_size.ysize ? frame_size.ysize : ysize;
+
+    if (dc_level != 0) {
+      xsize = DivCeil(xsize, 1 << (3 * dc_level));
+      ysize = DivCeil(ysize, 1 << (3 * dc_level));
+    }
+
+    FrameDimensions frame_dim;
+    frame_dim.Set(xsize, ysize, group_size_shift,
+                  chroma_subsampling.MaxHShift(),
+                  chroma_subsampling.MaxVShift(),
+                  encoding == FrameEncoding::kModular, upsampling);
+    return frame_dim;
+  }
+
+  // True if a color transform should be applied to this frame.
+  bool needs_color_transform() const {
+    return !save_before_color_transform ||
+           frame_type == FrameType::kRegularFrame ||
+           frame_type == FrameType::kSkipProgressive;
+  }
+
+  std::string DebugString() const;
+
+  uint64_t extensions;
+};
+
+Status ReadFrameHeader(BitReader* JXL_RESTRICT reader,
+                       FrameHeader* JXL_RESTRICT frame);
+
+// Shared by enc/dec. 5F and 13 are by far the most common for d1/2/4/8, 0
+// ensures low overhead for small images.
+static constexpr U32Enc kOrderEnc =
+    U32Enc(Val(0x5F), Val(0x13), Val(0), Bits(kNumOrders));
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FRAME_HEADER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc b/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc
new file mode 100644
index 0000000000..131ec4fa83
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdlib.h>
+
+#include <algorithm>
+
+#include "lib/jxl/enc_gamma_correct.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(GammaCorrectTest, TestLinearToSrgbEdgeCases) {
+  EXPECT_EQ(0, LinearToSrgb8Direct(0.0));
+  EXPECT_NEAR(0, LinearToSrgb8Direct(1E-6f), 2E-5);
+  EXPECT_EQ(0, LinearToSrgb8Direct(-1E-6f));
+  EXPECT_EQ(0, LinearToSrgb8Direct(-1E6));
+  EXPECT_NEAR(1, LinearToSrgb8Direct(1 - 1E-6f), 1E-5);
+  EXPECT_EQ(1, LinearToSrgb8Direct(1 + 1E-6f));
+  EXPECT_EQ(1, LinearToSrgb8Direct(1E6));
+}
+
+TEST(GammaCorrectTest, TestRoundTrip) {
+  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
+  for (double linear = 0.0; linear <= 1.0; linear += 1E-7) {
+    const double srgb = LinearToSrgb8Direct(linear);
+    const double linear2 = Srgb8ToLinearDirect(srgb);
+    ASSERT_LT(std::abs(linear - linear2), 2E-13)
+        << "linear = " << linear << ", linear2 = " << linear2;
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc
new file mode 100644
index 0000000000..82384e4c64
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc
@@ -0,0 +1,623 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/gauss_blur.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <cmath>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/gauss_blur.cc"
+#include <hwy/cache_control.h>
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/matrix_ops.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Broadcast;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulSub;
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::ShiftLeftLanes;
+#endif
+using hwy::HWY_NAMESPACE::Vec;
+
+void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                    const float* JXL_RESTRICT in, intptr_t width,
+                    float* JXL_RESTRICT out) {
+  // Although the current output depends on the previous output, we can unroll
+  // up to 4x by precomputing up to fourth powers of the constants. Beyond that,
+  // numerical precision might become a problem. Macro because this is tested
+  // in #if alongside HWY_TARGET.
+#define JXL_GAUSS_MAX_LANES 4
+  using D = HWY_CAPPED(float, JXL_GAUSS_MAX_LANES);
+  using V = Vec<D>;
+  const D d;
+  const V mul_in_1 = Load(d, rg->mul_in + 0 * 4);
+  const V mul_in_3 = Load(d, rg->mul_in + 1 * 4);
+  const V mul_in_5 = Load(d, rg->mul_in + 2 * 4);
+  const V mul_prev_1 = Load(d, rg->mul_prev + 0 * 4);
+  const V mul_prev_3 = Load(d, rg->mul_prev + 1 * 4);
+  const V mul_prev_5 = Load(d, rg->mul_prev + 2 * 4);
+  const V mul_prev2_1 = Load(d, rg->mul_prev2 + 0 * 4);
+  const V mul_prev2_3 = Load(d, rg->mul_prev2 + 1 * 4);
+  const V mul_prev2_5 = Load(d, rg->mul_prev2 + 2 * 4);
+  V prev_1 = Zero(d);
+  V prev_3 = Zero(d);
+  V prev_5 = Zero(d);
+  V prev2_1 = Zero(d);
+  V prev2_3 = Zero(d);
+  V prev2_5 = Zero(d);
+
+  const intptr_t N = rg->radius;
+
+  intptr_t n = -N + 1;
+  // Left side with bounds checks and only write output after n >= 0.
+  const intptr_t first_aligned = RoundUpTo(N + 1, Lanes(d));
+  for (; n < std::min(first_aligned, width); ++n) {
+    const intptr_t left = n - N - 1;
+    const intptr_t right = n + N - 1;
+    const float left_val = left >= 0 ? in[left] : 0.0f;
+    const float right_val = right < width ? in[right] : 0.0f;
+    const V sum = Set(d, left_val + right_val);
+
+    // (Only processing a single lane here, no need to broadcast)
+    V out_1 = Mul(sum, mul_in_1);
+    V out_3 = Mul(sum, mul_in_3);
+    V out_5 = Mul(sum, mul_in_5);
+
+    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
+    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
+    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
+    prev2_1 = prev_1;
+    prev2_3 = prev_3;
+    prev2_5 = prev_5;
+
+    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
+    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
+    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
+    prev_1 = out_1;
+    prev_3 = out_3;
+    prev_5 = out_5;
+
+    if (n >= 0) {
+      out[n] = GetLane(Add(out_1, Add(out_3, out_5)));
+    }
+  }
+
+  // The above loop is effectively scalar but it is convenient to use the same
+  // prev/prev2 variables, so broadcast to each lane before the unrolled loop.
+#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES > 1
+  prev2_1 = Broadcast<0>(prev2_1);
+  prev2_3 = Broadcast<0>(prev2_3);
+  prev2_5 = Broadcast<0>(prev2_5);
+  prev_1 = Broadcast<0>(prev_1);
+  prev_3 = Broadcast<0>(prev_3);
+  prev_5 = Broadcast<0>(prev_5);
+#endif
+
+  // Unrolled, no bounds checking needed.
+  for (; n < width - N + 1 - (JXL_GAUSS_MAX_LANES - 1); n += Lanes(d)) {
+    const V sum = Add(LoadU(d, in + n - N - 1), LoadU(d, in + n + N - 1));
+
+    // To get a vector of output(s), we multiply broadcasted vectors (of each
+    // input plus the two previous outputs) and add them all together.
+    // Incremental broadcasting and shifting is expected to be cheaper than
+    // horizontal adds or transposing 4x4 values because they run on a different
+    // port, concurrently with the FMA.
+    const V in0 = Broadcast<0>(sum);
+    V out_1 = Mul(in0, mul_in_1);
+    V out_3 = Mul(in0, mul_in_3);
+    V out_5 = Mul(in0, mul_in_5);
+
+#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES >= 2
+    const V in1 = Broadcast<1>(sum);
+    out_1 = MulAdd(ShiftLeftLanes<1>(mul_in_1), in1, out_1);
+    out_3 = MulAdd(ShiftLeftLanes<1>(mul_in_3), in1, out_3);
+    out_5 = MulAdd(ShiftLeftLanes<1>(mul_in_5), in1, out_5);
+
+#if JXL_GAUSS_MAX_LANES >= 4
+    const V in2 = Broadcast<2>(sum);
+    out_1 = MulAdd(ShiftLeftLanes<2>(mul_in_1), in2, out_1);
+    out_3 = MulAdd(ShiftLeftLanes<2>(mul_in_3), in2, out_3);
+    out_5 = MulAdd(ShiftLeftLanes<2>(mul_in_5), in2, out_5);
+
+    const V in3 = Broadcast<3>(sum);
+    out_1 = MulAdd(ShiftLeftLanes<3>(mul_in_1), in3, out_1);
+    out_3 = MulAdd(ShiftLeftLanes<3>(mul_in_3), in3, out_3);
+    out_5 = MulAdd(ShiftLeftLanes<3>(mul_in_5), in3, out_5);
+#endif
+#endif
+
+    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
+    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
+    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
+
+    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
+    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
+    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
+#if HWY_TARGET == HWY_SCALAR || JXL_GAUSS_MAX_LANES == 1
+    prev2_1 = prev_1;
+    prev2_3 = prev_3;
+    prev2_5 = prev_5;
+    prev_1 = out_1;
+    prev_3 = out_3;
+    prev_5 = out_5;
+#else
+    prev2_1 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_1);
+    prev2_3 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_3);
+    prev2_5 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_5);
+    prev_1 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_1);
+    prev_3 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_3);
+    prev_5 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_5);
+#endif
+
+    Store(Add(out_1, Add(out_3, out_5)), d, out + n);
+  }
+
+  // Remainder handling with bounds checks
+  for (; n < width; ++n) {
+    const intptr_t left = n - N - 1;
+    const intptr_t right = n + N - 1;
+    const float left_val = left >= 0 ? in[left] : 0.0f;
+    const float right_val = right < width ? in[right] : 0.0f;
+    const V sum = Set(d, left_val + right_val);
+
+    // (Only processing a single lane here, no need to broadcast)
+    V out_1 = Mul(sum, mul_in_1);
+    V out_3 = Mul(sum, mul_in_3);
+    V out_5 = Mul(sum, mul_in_5);
+
+    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
+    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
+    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
+    prev2_1 = prev_1;
+    prev2_3 = prev_3;
+    prev2_5 = prev_5;
+
+    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
+    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
+    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
+    prev_1 = out_1;
+    prev_3 = out_3;
+    prev_5 = out_5;
+
+    out[n] = GetLane(Add(out_1, Add(out_3, out_5)));
+  }
+}
+
+// Ring buffer is for n, n-1, n-2; round up to 4 for faster modulo.
+constexpr size_t kMod = 4;
+
+// Avoids an unnecessary store during warmup.
+struct OutputNone {
+  template <class V>
+  void operator()(const V& /*unused*/, float* JXL_RESTRICT /*pos*/,
+                  ptrdiff_t /*offset*/) const {}
+};
+
+// Common case: write output vectors in all VerticalBlock except warmup.
+struct OutputStore {
+  template <class V>
+  void operator()(const V& out, float* JXL_RESTRICT pos,
+                  ptrdiff_t offset) const {
+    // Stream helps for large images but is slower for images that fit in cache.
+    Store(out, HWY_FULL(float)(), pos + offset);
+  }
+};
+
+// At top/bottom borders, we don't have two inputs to load, so avoid addition.
+// pos may even point to all zeros if the row is outside the input image.
+class SingleInput {
+ public:
+  explicit SingleInput(const float* pos) : pos_(pos) {}
+  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
+    return Load(HWY_FULL(float)(), pos_ + offset);
+  }
+  const float* pos_;
+};
+
+// In the middle of the image, we need to load from a row above and below, and
+// return the sum.
+class TwoInputs {
+ public:
+  TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {}
+  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
+    const auto in1 = Load(HWY_FULL(float)(), pos1_ + offset);
+    const auto in2 = Load(HWY_FULL(float)(), pos2_ + offset);
+    return Add(in1, in2);
+  }
+
+ private:
+  const float* pos1_;
+  const float* pos2_;
+};
+
+// Block := kVectors consecutive full vectors (one cache line except on the
+// right boundary, where we can only rely on having one vector). Unrolling to
+// the cache line size improves cache utilization.
+template <size_t kVectors, class V, class Input, class Output>
+void VerticalBlock(const V& d1_1, const V& d1_3, const V& d1_5, const V& n2_1,
+                   const V& n2_3, const V& n2_5, const Input& input,
+                   size_t& ctr, float* ring_buffer, const Output output,
+                   float* JXL_RESTRICT out_pos) {
+  const HWY_FULL(float) d;
+  constexpr size_t kVN = MaxLanes(d);
+  // More cache-friendly to process an entire cache line at a time
+  constexpr size_t kLanes = kVectors * kVN;
+
+  float* JXL_RESTRICT y_1 = ring_buffer + 0 * kLanes * kMod;
+  float* JXL_RESTRICT y_3 = ring_buffer + 1 * kLanes * kMod;
+  float* JXL_RESTRICT y_5 = ring_buffer + 2 * kLanes * kMod;
+
+  const size_t n_0 = (++ctr) % kMod;
+  const size_t n_1 = (ctr - 1) % kMod;
+  const size_t n_2 = (ctr - 2) % kMod;
+
+  for (size_t idx_vec = 0; idx_vec < kVectors; ++idx_vec) {
+    const V sum = input(idx_vec * kVN);
+
+    const V y_n1_1 = Load(d, y_1 + kLanes * n_1 + idx_vec * kVN);
+    const V y_n1_3 = Load(d, y_3 + kLanes * n_1 + idx_vec * kVN);
+    const V y_n1_5 = Load(d, y_5 + kLanes * n_1 + idx_vec * kVN);
+    const V y_n2_1 = Load(d, y_1 + kLanes * n_2 + idx_vec * kVN);
+    const V y_n2_3 = Load(d, y_3 + kLanes * n_2 + idx_vec * kVN);
+    const V y_n2_5 = Load(d, y_5 + kLanes * n_2 + idx_vec * kVN);
+    // (35)
+    const V y1 = MulAdd(n2_1, sum, NegMulSub(d1_1, y_n1_1, y_n2_1));
+    const V y3 = MulAdd(n2_3, sum, NegMulSub(d1_3, y_n1_3, y_n2_3));
+    const V y5 = MulAdd(n2_5, sum, NegMulSub(d1_5, y_n1_5, y_n2_5));
+    Store(y1, d, y_1 + kLanes * n_0 + idx_vec * kVN);
+    Store(y3, d, y_3 + kLanes * n_0 + idx_vec * kVN);
+    Store(y5, d, y_5 + kLanes * n_0 + idx_vec * kVN);
+    output(Add(y1, Add(y3, y5)), out_pos, idx_vec * kVN);
+  }
+  // NOTE: flushing cache line out_pos hurts performance - less so with
+  // clflushopt than clflush but still a significant slowdown.
+}
+
+// Reads/writes one block (kVectors full vectors) in each row.
+template <size_t kVectors>
+void VerticalStrip(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                   const ImageF& in, const size_t x, ImageF* JXL_RESTRICT out) {
+  // We're iterating vertically, so use multiple full-length vectors (each lane
+  // is one column of row n).
+  using D = HWY_FULL(float);
+  using V = Vec<D>;
+  const D d;
+  constexpr size_t kVN = MaxLanes(d);
+  // More cache-friendly to process an entire cache line at a time
+  constexpr size_t kLanes = kVectors * kVN;
+#if HWY_TARGET == HWY_SCALAR
+  const V d1_1 = Set(d, rg->d1[0 * 4]);
+  const V d1_3 = Set(d, rg->d1[1 * 4]);
+  const V d1_5 = Set(d, rg->d1[2 * 4]);
+  const V n2_1 = Set(d, rg->n2[0 * 4]);
+  const V n2_3 = Set(d, rg->n2[1 * 4]);
+  const V n2_5 = Set(d, rg->n2[2 * 4]);
+#else
+  const V d1_1 = LoadDup128(d, rg->d1 + 0 * 4);
+  const V d1_3 = LoadDup128(d, rg->d1 + 1 * 4);
+  const V d1_5 = LoadDup128(d, rg->d1 + 2 * 4);
+  const V n2_1 = LoadDup128(d, rg->n2 + 0 * 4);
+  const V n2_3 = LoadDup128(d, rg->n2 + 1 * 4);
+  const V n2_5 = LoadDup128(d, rg->n2 + 2 * 4);
+#endif
+
+  const size_t N = rg->radius;
+  const size_t ysize = in.ysize();
+
+  size_t ctr = 0;
+  HWY_ALIGN float ring_buffer[3 * kLanes * kMod] = {0};
+  HWY_ALIGN static constexpr float zero[kLanes] = {0};
+
+  // Warmup: top is out of bounds (zero padded), bottom is usually in-bounds.
+  ssize_t n = -static_cast<ssize_t>(N) + 1;
+  for (; n < 0; ++n) {
+    // bottom is always non-negative since n is initialized to -N + 1.
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
+        ring_buffer, OutputNone(), nullptr);
+  }
+  JXL_DASSERT(n >= 0);
+
+  // Start producing output; top is still out of bounds.
+  for (; static_cast<size_t>(n) < std::min(N + 1, ysize); ++n) {
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
+        ring_buffer, OutputStore(), out->Row(n) + x);
+  }
+
+  // Interior outputs with prefetching and without bounds checks.
+  constexpr size_t kPrefetchRows = 8;
+  for (; n < static_cast<ssize_t>(ysize - N + 1 - kPrefetchRows); ++n) {
+    const size_t top = n - N - 1;
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        TwoInputs(in.ConstRow(top) + x, in.ConstRow(bottom) + x), ctr,
+        ring_buffer, OutputStore(), out->Row(n) + x);
+    hwy::Prefetch(in.ConstRow(top + kPrefetchRows) + x);
+    hwy::Prefetch(in.ConstRow(bottom + kPrefetchRows) + x);
+  }
+
+  // Bottom border without prefetching and with bounds checks.
+  for (; static_cast<size_t>(n) < ysize; ++n) {
+    const size_t top = n - N - 1;
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        TwoInputs(in.ConstRow(top) + x,
+                  bottom < ysize ? in.ConstRow(bottom) + x : zero),
+        ctr, ring_buffer, OutputStore(), out->Row(n) + x);
+  }
+}
+
+// Apply 1D vertical scan to multiple columns (one per vector lane).
+// Not yet parallelized: the pool argument is ignored.
+void FastGaussianVertical(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                          const ImageF& in, ThreadPool* /*pool*/,
+                          ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+  JXL_CHECK(SameSize(in, *out));
+
+  constexpr size_t kCacheLineLanes = 64 / sizeof(float);  // floats per 64 bytes
+  constexpr size_t kVN = MaxLanes(HWY_FULL(float)());
+  constexpr size_t kCacheLineVectors =
+      (kVN < kCacheLineLanes) ? (kCacheLineLanes / kVN) : 4;
+  constexpr size_t kFastPace = kCacheLineVectors * kVN;  // columns per strip
+
+  size_t x = 0;
+  for (; x + kFastPace <= in.xsize(); x += kFastPace) {  // full-width strips
+    VerticalStrip<kCacheLineVectors>(rg, in, x, out);
+  }
+  for (; x < in.xsize(); x += kVN) {  // NOTE(review): tail may pass xsize
+    VerticalStrip<1>(rg, in, x, out);
+  }
+}
+
+// TODO(veluca): consider replacing with FastGaussian.
+ImageF ConvolveXSampleAndTranspose(const ImageF& in,
+                                   const std::vector<float>& kernel,
+                                   const size_t res) {
+  JXL_ASSERT(kernel.size() % 2 == 1);
+  JXL_ASSERT(in.xsize() % res == 0);
+  const size_t offset = res / 2;  // first sampled column
+  const size_t out_xsize = in.xsize() / res;
+  ImageF out(in.ysize(), out_xsize);  // transposed: one row per sampled column
+  const int r = kernel.size() / 2;  // kernel radius
+  HWY_FULL(float) df;
+  std::vector<float> row_tmp(in.xsize() + 2 * r + Lanes(df));  // padded row
+  float* const JXL_RESTRICT rowp = &row_tmp[r];
+  std::vector<float> padded_k = kernel;
+  padded_k.resize(padded_k.size() + Lanes(df));  // zero tail for LoadU
+  const float* const kernelp = &padded_k[r];
+  for (size_t y = 0; y < in.ysize(); ++y) {
+    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
+    size_t x = offset, ox = 0;
+    for (; x < static_cast<uint32_t>(r) && x < in.xsize(); x += res, ++ox) {
+      float sum = 0.0f;  // left border; NOTE(review): index clamps to [0, xsize]
+      for (int i = -r; i <= r; ++i) {
+        sum += rowp[std::max<int>(
+                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
+               kernelp[i];
+      }
+      out.Row(ox)[y] = sum;
+    }
+    for (; x + r < in.xsize(); x += res, ++ox) {
+      auto sum = Zero(df);  // interior: vectorized dot product
+      for (int i = -r; i <= r; i += Lanes(df)) {
+        sum = MulAdd(LoadU(df, rowp + x + i), LoadU(df, kernelp + i), sum);
+      }
+      out.Row(ox)[y] = GetLane(SumOfLanes(df, sum));
+    }
+    for (; x < in.xsize(); x += res, ++ox) {
+      float sum = 0.0f;  // right border: scalar, same clamp as the left border
+      for (int i = -r; i <= r; ++i) {
+        sum += rowp[std::max<int>(
+                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
+               kernelp[i];
+      }
+      out.Row(ox)[y] = sum;
+    }
+  }
+  return out;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(FastGaussian1D);
+HWY_EXPORT(ConvolveXSampleAndTranspose);
+void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                    const float* JXL_RESTRICT in, intptr_t width,
+                    float* JXL_RESTRICT out) {  // runtime-dispatched wrapper
+  return HWY_DYNAMIC_DISPATCH(FastGaussian1D)(rg, in, width, out);
+}
+
+HWY_EXPORT(FastGaussianVertical);  // Local function.
+
+void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
+                        float* const JXL_RESTRICT row_out, const int xsize,
+                        const int radius) {
+  const int lastcol = xsize - 1;
+  for (int x = 1; x <= radius; ++x) {  // left pad: mirror around column 0
+    row_out[-x] = row_in[std::min(x, xsize - 1)];
+  }
+  memcpy(row_out, row_in, xsize * sizeof(row_out[0]));  // unchanged interior
+  for (int x = 1; x <= radius; ++x) {  // right pad: mirror around lastcol
+    row_out[lastcol + x] = row_in[std::max(0, lastcol - x)];
+  }
+}
+
+ImageF ConvolveXSampleAndTranspose(const ImageF& in,
+                                   const std::vector<float>& kernel,
+                                   const size_t res) {  // runtime dispatch
+  return HWY_DYNAMIC_DISPATCH(ConvolveXSampleAndTranspose)(in, kernel, res);
+}
+
+Image3F ConvolveXSampleAndTranspose(const Image3F& in,
+                                    const std::vector<float>& kernel,
+                                    const size_t res) {  // plane by plane
+  return Image3F(ConvolveXSampleAndTranspose(in.Plane(0), kernel, res),
+                 ConvolveXSampleAndTranspose(in.Plane(1), kernel, res),
+                 ConvolveXSampleAndTranspose(in.Plane(2), kernel, res));
+}
+
+ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
+                         const size_t res) {
+  ImageF tmp = ConvolveXSampleAndTranspose(in, kernel, res);  // rows of in
+  return ConvolveXSampleAndTranspose(tmp, kernel, res);  // columns of in
+}
+
+// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
+// Cosine Functions" by Charalampidis [2016].
+hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
+  PROFILER_FUNC;
+  auto rg = hwy::MakeUniqueAligned<RecursiveGaussian>();
+  constexpr double kPi = 3.141592653589793238;
+
+  const double radius = roundf(3.2795 * sigma + 0.2546);  // (57), "N"
+
+  // Table I, first row
+  const double pi_div_2r = kPi / (2.0 * radius);
+  const double omega[3] = {pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r};
+
+  // (37), k={1,3,5}
+  const double p_1 = +1.0 / std::tan(0.5 * omega[0]);
+  const double p_3 = -1.0 / std::tan(0.5 * omega[1]);
+  const double p_5 = +1.0 / std::tan(0.5 * omega[2]);
+
+  // (44), k={1,3,5}
+  const double r_1 = +p_1 * p_1 / std::sin(omega[0]);
+  const double r_3 = -p_3 * p_3 / std::sin(omega[1]);
+  const double r_5 = +p_5 * p_5 / std::sin(omega[2]);
+
+  // (50), k={1,3,5}
+  const double neg_half_sigma2 = -0.5 * sigma * sigma;
+  const double recip_radius = 1.0 / radius;
+  double rho[3];
+  for (size_t i = 0; i < 3; ++i) {
+    rho[i] = std::exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
+  }
+
+  // second part of (52), k1,k2 = 1,3; 3,5; 5,1
+  const double D_13 = p_1 * r_3 - r_1 * p_3;
+  const double D_35 = p_3 * r_5 - r_3 * p_5;
+  const double D_51 = p_5 * r_1 - r_5 * p_1;
+
+  // (52), k=5
+  const double recip_d13 = 1.0 / D_13;
+  const double zeta_15 = D_35 * recip_d13;
+  const double zeta_35 = D_51 * recip_d13;
+
+  double A[9] = {p_1,     p_3,     p_5,  //
+                 r_1,     r_3,     r_5,  //  (56)
+                 zeta_15, zeta_35, 1};
+  JXL_CHECK(Inv3x3Matrix(A));
+  const double gamma[3] = {1, radius * radius - sigma * sigma,  // (55)
+                           zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]};
+  double beta[3];
+  Mul3x3Vector(A, gamma, beta);  // (53)
+
+  // Sanity check: correctly solved for beta (IIR filter weights are normalized)
+  const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5;  // (39)
+  JXL_ASSERT(std::abs(sum - 1) < 1E-12);
+  (void)sum;
+
+  rg->radius = static_cast<int>(radius);
+
+  double n2[3];
+  double d1[3];
+  for (size_t i = 0; i < 3; ++i) {
+    n2[i] = -beta[i] * std::cos(omega[i] * (radius + 1.0));  // (33)
+    d1[i] = -2.0 * std::cos(omega[i]);                       // (33)
+
+    for (size_t lane = 0; lane < 4; ++lane) {  // 4 copies for LoadDup128
+      rg->n2[4 * i + lane] = static_cast<float>(n2[i]);
+      rg->d1[4 * i + lane] = static_cast<float>(d1[i]);
+    }
+
+    const double d_2 = d1[i] * d1[i];
+
+    // Obtained by expanding (35) for four consecutive outputs via sympy:
+    // n, d, p, pp = symbols('n d p pp')
+    // i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
+    // o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
+    // o0 = n*i0 - d*p - pp
+    // o1 = n*i1 - d*o0 - p
+    // o2 = n*i2 - d*o1 - o0
+    // o3 = n*i3 - d*o2 - o1
+    // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
+    rg->mul_prev[4 * i + 0] = -d1[i];
+    rg->mul_prev[4 * i + 1] = d_2 - 1.0;
+    rg->mul_prev[4 * i + 2] = -d_2 * d1[i] + 2.0 * d1[i];
+    rg->mul_prev[4 * i + 3] = d_2 * d_2 - 3.0 * d_2 + 1.0;
+    rg->mul_prev2[4 * i + 0] = -1.0;
+    rg->mul_prev2[4 * i + 1] = d1[i];
+    rg->mul_prev2[4 * i + 2] = -d_2 + 1.0;
+    rg->mul_prev2[4 * i + 3] = d_2 * d1[i] - 2.0 * d1[i];
+    rg->mul_in[4 * i + 0] = n2[i];
+    rg->mul_in[4 * i + 1] = -d1[i] * n2[i];
+    rg->mul_in[4 * i + 2] = d_2 * n2[i] - n2[i];
+    rg->mul_in[4 * i + 3] = -d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i];
+  }
+  return rg;
+}
+
+namespace {
+
+// Apply 1D horizontal scan to each row, one pool task per row.
+void FastGaussianHorizontal(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                            const ImageF& in, ThreadPool* pool,
+                            ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+  JXL_CHECK(SameSize(in, *out));
+
+  const intptr_t xsize = in.xsize();
+  JXL_CHECK(RunOnPool(
+      pool, 0, in.ysize(), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = task;  // task index is the row index
+        const float* row_in = in.ConstRow(y);
+        float* JXL_RESTRICT row_out = out->Row(y);
+        FastGaussian1D(rg, row_in, xsize, row_out);
+      },
+      "FastGaussianHorizontal"));
+}
+
+}  // namespace
+
+void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
+                  ImageF* JXL_RESTRICT out) {
+  FastGaussianHorizontal(rg, in, pool, temp);  // rows: in -> temp
+  HWY_DYNAMIC_DISPATCH(FastGaussianVertical)(rg, *temp, pool, out);  // columns
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur.h b/third_party/jpeg-xl/lib/jxl/gauss_blur.h
new file mode 100644
index 0000000000..fb4741f03a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.h
@@ -0,0 +1,94 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_GAUSS_BLUR_H_
+#define LIB_JXL_GAUSS_BLUR_H_
+
+#include <stddef.h>
+
+#include <cmath>
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+std::vector<T> GaussianKernel(int radius, T sigma) {
+  JXL_ASSERT(sigma > 0.0);
+  std::vector<T> kernel(2 * radius + 1);  // taps for offsets -radius..radius
+  const T scaler = -1.0 / (2 * sigma * sigma);  // exponent factor
+  double sum = 0.0;
+  for (int i = -radius; i <= radius; ++i) {
+    const T val = std::exp(scaler * i * i);
+    kernel[i + radius] = val;
+    sum += val;
+  }
+  for (size_t i = 0; i < kernel.size(); ++i) {  // normalize: taps sum to 1
+    kernel[i] /= sum;
+  }
+  return kernel;
+}
+
+// All convolution functions below apply mirroring of the input on the borders
+// in the following way:
+//
+//     input: [a0 a1 a2 ...  aN]
+//     mirrored input: [aR ... a1 | a0 a1 a2 .... aN | aN-1 ... aN-R]
+//
+// where R is the radius of the kernel (i.e. kernel size is 2*R+1).
+
+// REQUIRES: in.xsize() and in.ysize() are integer multiples of res.
+ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
+                         const size_t res);
+
+// Private, used by test.
+void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
+                        float* const JXL_RESTRICT row_out, const int xsize,
+                        const int radius);
+
+// Only for use by CreateRecursiveGaussian and FastGaussian*.
+#pragma pack(push, 1)
+struct RecursiveGaussian {
+  // For k={1,3,5} in that order, each broadcasted 4x for LoadDup128. Used only
+  // for vertical passes.
+  float n2[3 * 4];
+  float d1[3 * 4];
+
+  // We unroll horizontal passes 4x - one output per lane. These are each lane's
+  // multiplier for the previous output (relative to the first of the four
+  // outputs). Indexing: 4 * 0..2 (for {1,3,5}) + 0..3 for the lane index.
+  float mul_prev[3 * 4];
+  // Ditto for the second to last output.
+  float mul_prev2[3 * 4];
+
+  // We multiply a vector of inputs 0..3 by a vector shifted from this array.
+  // in=0 uses all 4 (nonzero) terms; for in=3, the lower three lanes are 0.
+  float mul_in[3 * 4];
+
+  size_t radius;  // "N": 3.2795 * sigma + 0.2546, rounded
+};
+#pragma pack(pop)
+
+// Precomputation for FastGaussian*; users may use the same pointer/storage in
+// subsequent calls to FastGaussian* with the same sigma.
+hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma);
+
+// 1D Gaussian with zero-pad boundary handling and runtime independent of sigma.
+void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                    const float* JXL_RESTRICT in, intptr_t width,
+                    float* JXL_RESTRICT out);
+
+// 2D Gaussian with zero-pad boundary handling and runtime independent of sigma.
+void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
+                  ImageF* JXL_RESTRICT out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_GAUSS_BLUR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
new file mode 100644
index 0000000000..b1bb64abc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
@@ -0,0 +1,126 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <hwy/targets.h>
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+JXL_MAYBE_UNUSED ImageF Convolve(const ImageF& in,
+                                 const std::vector<float>& kernel) {
+  return ConvolveAndSample(in, kernel, 1);  // res = 1: no subsampling
+}
+
+void BM_GaussBlur1d(benchmark::State& state) {
+  // Uncomment to disable SIMD and force a scalar implementation
+  // hwy::DisableTargets(~HWY_SCALAR);
+  // Uncomment to run AVX2
+  // hwy::DisableTargets(HWY_AVX3);
+
+  const size_t length = state.range();
+  const double sigma = 7.0;  // (from Butteraugli application)
+  ImageF in(length, 1);
+  const float expected = length;  // constant fill value
+  FillImage(expected, &in);
+
+  ImageF temp(length, 1);  // NOTE(review): unused in this benchmark
+  ImageF out(length, 1);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  for (auto _ : state) {
+    FastGaussian1D(rg, in.Row(0), length, out.Row(0));
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(out.ConstRow(0)[length / 2] - expected) / expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(length * state.iterations());
+}
+
+void BM_GaussBlur2d(benchmark::State& state) {
+  // See GaussBlur1d for SIMD changes.
+
+  const size_t xsize = state.range();
+  const size_t ysize = xsize;  // square image
+  const double sigma = 7.0;  // (from Butteraugli application)
+  ImageF in(xsize, ysize);
+  const float expected = xsize + ysize;  // constant fill value
+  FillImage(expected, &in);
+
+  ImageF temp(xsize, ysize);  // receives the horizontal pass
+  ImageF out(xsize, ysize);
+  ThreadPool* null_pool = nullptr;
+  const auto rg = CreateRecursiveGaussian(sigma);
+  for (auto _ : state) {
+    FastGaussian(rg, in, null_pool, &temp, &out);
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) /
+                   expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(xsize * ysize * state.iterations());
+}
+
+void BM_GaussBlurFir(benchmark::State& state) {
+  // See GaussBlur1d for SIMD changes.
+
+  const size_t xsize = state.range();
+  const size_t ysize = xsize;  // square image
+  const double sigma = 7.0;  // (from Butteraugli application)
+  ImageF in(xsize, ysize);
+  const float expected = xsize + ysize;  // constant fill value
+  FillImage(expected, &in);
+
+  ImageF temp(xsize, ysize);  // NOTE(review): unused in this benchmark
+  ImageF out(xsize, ysize);  // NOTE(review): unused in this benchmark
+  const std::vector<float> kernel =
+      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+  for (auto _ : state) {
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(Convolve(in, kernel).ConstRow(ysize / 2)[xsize / 2] -
+                        expected) /
+                   expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(xsize * ysize * state.iterations());
+}
+
+void BM_GaussBlurSep7(benchmark::State& state) {
+  // See GaussBlur1d for SIMD changes.
+
+  const size_t xsize = state.range();
+  const size_t ysize = xsize;  // square image
+  ImageF in(xsize, ysize);
+  const float expected = xsize + ysize;  // constant fill value
+  FillImage(expected, &in);
+
+  ImageF temp(xsize, ysize);  // NOTE(review): unused in this benchmark
+  ImageF out(xsize, ysize);
+  ThreadPool* null_pool = nullptr;
+  // Gaussian with sigma 1
+  const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)},
+                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};
+  for (auto _ : state) {
+    Separable7(in, Rect(in), weights, null_pool, &out);
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) /
+                   expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(xsize * ysize * state.iterations());
+}
+
+BENCHMARK(BM_GaussBlur1d)->Range(1 << 8, 1 << 14);
+BENCHMARK(BM_GaussBlur2d)->Range(1 << 7, 1 << 10);
+BENCHMARK(BM_GaussBlurFir)->Range(1 << 7, 1 << 10);
+BENCHMARK(BM_GaussBlurSep7)->Range(1 << 7, 1 << 10);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
new file mode 100644
index 0000000000..097c1aa8df
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
@@ -0,0 +1,453 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/gauss_blur.h"
+
+#include <cmath>
+#include <hwy/targets.h>
+#include <vector>
+
+#include "lib/extras/time.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+bool NearEdge(const int64_t width, const int64_t peak) {
+  // When around 3*sigma from the edge, there is negligible truncation.
+  return peak < 10 || peak > width - 10;  // closer than 10 samples to an end
+}
+
+// Follow the curve downwards by scanning right from `peak` and verifying
+// identical values at the same offset to the left.
+void VerifySymmetric(const int64_t width, const int64_t peak,
+                     const float* out) {
+  const double tolerance = NearEdge(width, peak) ? 0.015 : 6E-7;
+  for (int64_t i = 1;; ++i) {
+    // Stop once either offset would fall outside [0, width)
+    if (peak - i < 0 || peak + i >= width) break;
+    EXPECT_GT(out[peak + i - 1] + tolerance, out[peak + i]);  // descending
+    EXPECT_NEAR(out[peak - i], out[peak + i], tolerance);     // symmetric
+  }
+}
+
+void TestImpulseResponse(size_t width, size_t peak) {
+  const auto rg3 = CreateRecursiveGaussian(3.0);
+  const auto rg4 = CreateRecursiveGaussian(4.0);
+  const auto rg5 = CreateRecursiveGaussian(5.0);
+
+  // Extra padding for 4x unrolling
+  auto in = hwy::AllocateAligned<float>(width + 3);
+  memset(in.get(), 0, sizeof(float) * (width + 3));  // NOTE(review): <cstring>?
+  in[peak] = 1.0f;  // unit impulse
+
+  auto out3 = hwy::AllocateAligned<float>(width + 3);
+  auto out4 = hwy::AllocateAligned<float>(width + 3);
+  auto out5 = hwy::AllocateAligned<float>(width + 3);
+  FastGaussian1D(rg3, in.get(), width, out3.get());
+  FastGaussian1D(rg4, out3.get(), width, out4.get());  // sigma 4 on top of 3
+  FastGaussian1D(rg5, in.get(), width, out5.get());
+
+  VerifySymmetric(width, peak, out3.get());
+  VerifySymmetric(width, peak, out4.get());
+  VerifySymmetric(width, peak, out5.get());
+
+  // Wider kernel has flatter peak
+  EXPECT_LT(out5[peak] + 0.05, out3[peak]);
+
+  // Gauss3 o Gauss4 ~= Gauss5
+  const double tolerance = NearEdge(width, peak) ? 0.04 : 0.01;
+  for (size_t i = 0; i < width; ++i) {
+    EXPECT_NEAR(out4[i], out5[i], tolerance);
+  }
+}
+
+void TestImpulseResponseForWidth(size_t width) {
+  for (size_t i = 0; i < width; ++i) {  // every possible peak position
+    TestImpulseResponse(width, i);
+  }
+}
+
+TEST(GaussBlurTest, ImpulseResponse) {  // each width: all peak positions
+  TestImpulseResponseForWidth(10);  // tiny even
+  TestImpulseResponseForWidth(15);  // small odd
+  TestImpulseResponseForWidth(32);  // power of two
+  TestImpulseResponseForWidth(31);  // power of two - 1
+  TestImpulseResponseForWidth(33);  // power of two + 1
+}
+
+ImageF Convolve(const ImageF& in, const std::vector<float>& kernel) {
+  return ConvolveAndSample(in, kernel, 1);  // res = 1: no subsampling
+}
+// Higher-precision version for accuracy test.
+ImageF ConvolveAndTransposeF64(const ImageF& in,
+                               const std::vector<double>& kernel) {
+  JXL_ASSERT(kernel.size() % 2 == 1);
+  ImageF out(in.ysize(), in.xsize());  // transposed dimensions
+  const int r = kernel.size() / 2;  // kernel radius
+  std::vector<float> row_tmp(in.xsize() + 2 * r);  // row plus r on each side
+  float* const JXL_RESTRICT rowp = &row_tmp[r];
+  const double* const kernelp = &kernel[r];
+  for (size_t y = 0; y < in.ysize(); ++y) {
+    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
+    for (size_t x = 0, ox = 0; x < in.xsize(); ++x, ++ox) {
+      double sum = 0.0;  // NOTE(review): index below clamps to [0, xsize]
+      for (int i = -r; i <= r; ++i) {
+        sum += rowp[std::max<int>(
+                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
+               kernelp[i];
+      }
+      out.Row(ox)[y] = static_cast<float>(sum);
+    }
+  }
+  return out;
+}
+
+ImageF ConvolveF64(const ImageF& in, const std::vector<double>& kernel) {
+  ImageF tmp = ConvolveAndTransposeF64(in, kernel);  // rows of in
+  return ConvolveAndTransposeF64(tmp, kernel);  // columns of in
+}
+
+void TestDirac2D(size_t xsize, size_t ysize, double sigma) {
+  ImageF in(xsize, ysize);
+  ZeroFillImage(&in);
+  // We anyway ignore the border below, so might as well choose the middle.
+  in.Row(ysize / 2)[xsize / 2] = 1.0f;
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  ThreadPool* null_pool = nullptr;
+  FastGaussian(rg, in, null_pool, &temp, &out);  // recursive (IIR) result
+
+  const std::vector<float> kernel =
+      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+  const ImageF expected = Convolve(in, kernel);  // FIR reference
+
+  const double max_l1 = sigma < 1.5 ? 5E-3 : 6E-4;
+  const size_t border = 2 * sigma;  // truncated toward zero
+
+  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, 1E-8, _, border));
+}
+
+TEST(GaussBlurTest, Test2D) {
+  const std::vector<int> dimensions{6, 15, 17, 64, 50, 49};
+  for (int xsize : dimensions) {  // every width/height pair
+    for (int ysize : dimensions) {
+      for (double sigma : {1.0, 2.5, 3.6, 7.0}) {
+        TestDirac2D(static_cast<size_t>(xsize), static_cast<size_t>(ysize),
+                    sigma);
+      }
+    }
+  }
+}
+
+// Slow (44 sec). To run, remove the disabled prefix.
+TEST(GaussBlurTest, DISABLED_SlowTestDirac1D) {
+  const double sigma = 7.0;
+  const auto rg = CreateRecursiveGaussian(sigma);
+
+  // IPOL accuracy test uses 10^-15 tolerance, this is 2*10^-11.
+  const size_t radius = static_cast<size_t>(7 * sigma);
+  const std::vector<double> kernel = GaussianKernel(radius, sigma);
+
+  const size_t length = 16384;
+  ImageF inputs(length, 1);
+  ZeroFillImage(&inputs);
+
+  auto outputs = hwy::AllocateAligned<float>(length);
+
+  // One per output index, accumulated over all center positions
+  auto sum_abs_err = hwy::AllocateAligned<double>(length);
+  std::fill(sum_abs_err.get(), sum_abs_err.get() + length, 0.0);
+
+  for (size_t center = radius; center < length - radius; ++center) {
+    inputs.Row(0)[center - 1] = 0.0f;  // reset last peak, entire array now 0
+    inputs.Row(0)[center] = 1.0f;
+    FastGaussian1D(rg, inputs.Row(0), length, outputs.get());
+
+    const ImageF outputs_fir = ConvolveF64(inputs, kernel);  // FIR reference
+
+    for (size_t i = 0; i < length; ++i) {
+      const float abs_err = std::abs(outputs[i] - outputs_fir.Row(0)[i]);
+      sum_abs_err[i] += static_cast<double>(abs_err);
+    }
+  }
+
+  const double max_abs_err =  // NOTE(review): max_element needs <algorithm>
+      *std::max_element(sum_abs_err.get(), sum_abs_err.get() + length);
+  printf("Max abs err: %.8e\n", max_abs_err);
+}
+
+void TestRandom(size_t xsize, size_t ysize, float min, float max, double sigma,
+                double max_l1, double max_rel) {
+  printf("%4" PRIuS " x %4" PRIuS " %4.1f %4.1f sigma %.1f\n", xsize, ysize,
+         min, max, sigma);
+  ImageF in(xsize, ysize);
+  RandomFillImage(&in, min, max, 65537 + xsize * 129 + ysize);
+  // FastGaussian/Convolve handle borders differently, so keep those pixels 0.
+  const size_t border = 4 * sigma;  // truncated toward zero
+  SetBorder(border, 0.0f, &in);
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  ThreadPool* null_pool = nullptr;
+  FastGaussian(rg, in, null_pool, &temp, &out);  // recursive (IIR) result
+
+  const std::vector<float> kernel =
+      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+  const ImageF expected = Convolve(in, kernel);  // FIR reference
+
+  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, max_rel, _, border));
+}
+
+void TestRandomForSizes(float min, float max, double sigma) {
+  double max_l1 = 6E-3;  // base tolerances; scaled up for larger images below
+  double max_rel = 3E-3;
+  TestRandom(128, 1, min, max, sigma, max_l1, max_rel);
+  TestRandom(1, 128, min, max, sigma, max_l1, max_rel);
+  TestRandom(30, 201, min, max, sigma, max_l1 * 1.6, max_rel * 1.2);
+  TestRandom(201, 30, min, max, sigma, max_l1 * 1.6, max_rel * 1.2);
+  TestRandom(201, 201, min, max, sigma, max_l1 * 2.0, max_rel * 1.2);
+}
+
+TEST(GaussBlurTest, TestRandom) {  // arguments are (min, max, sigma)
+  // small non-negative
+  TestRandomForSizes(0.0f, 10.0f, 3.0f);
+  TestRandomForSizes(0.0f, 10.0f, 7.0f);
+
+  // small negative
+  TestRandomForSizes(-4.0f, -1.0f, 3.0f);
+  TestRandomForSizes(-4.0f, -1.0f, 7.0f);
+
+  // mixed positive/negative
+  TestRandomForSizes(-6.0f, 6.0f, 3.0f);
+  TestRandomForSizes(-6.0f, 6.0f, 7.0f);
+}
+
+TEST(GaussBlurTest, TestSign) {
+  const size_t xsize = 500;
+  const size_t ysize = 606;
+  ImageF in(xsize, ysize);
+
+  ZeroFillImage(&in);
+  const float center[33 * 33] = {
+      -0.128445f, -0.098473f, -0.121883f, -0.093601f, 0.095665f,  -0.271332f,
+      -0.705475f, -1.324005f, -2.020741f, -1.329464f, 1.834064f,  4.787300f,
+      5.834560f,  5.272720f,  3.967960f,  3.547935f,  3.432732f,  3.383015f,
+      3.239326f,  3.290806f,  3.298954f,  3.397808f,  3.359730f,  3.533844f,
+      3.511856f,  3.436787f,  3.428310f,  3.460209f,  3.550011f,  3.590942f,
+      3.593109f,  3.560005f,  3.443165f,  0.089741f,  0.179230f,  -0.032997f,
+      -0.182610f, 0.005669f,  -0.244759f, -0.395123f, -0.514961f, -1.003529f,
+      -1.798656f, -2.377975f, 0.222191f,  3.957664f,  5.946804f,  5.543129f,
+      4.290096f,  3.621010f,  3.407257f,  3.392494f,  3.345367f,  3.391903f,
+      3.441605f,  3.429260f,  3.444969f,  3.507130f,  3.518612f,  3.443111f,
+      3.475948f,  3.536148f,  3.470333f,  3.628311f,  3.600243f,  3.292892f,
+      -0.226730f, -0.573616f, -0.762165f, -0.398739f, -0.189842f, -0.275921f,
+      -0.446739f, -0.550037f, -0.461033f, -0.724792f, -1.448349f, -1.814064f,
+      -0.491032f, 2.817703f,  5.213242f,  5.675629f,  4.864548f,  3.876324f,
+      3.535587f,  3.530312f,  3.413765f,  3.386261f,  3.404854f,  3.383472f,
+      3.420830f,  3.326496f,  3.257877f,  3.362152f,  3.489609f,  3.619587f,
+      3.555805f,  3.423164f,  3.309708f,  -0.483940f, -0.502926f, -0.592983f,
+      -0.492527f, -0.413616f, -0.482555f, -0.475506f, -0.447990f, -0.338120f,
+      -0.189072f, -0.376427f, -0.910828f, -1.878044f, -1.937927f, 1.423218f,
+      4.871609f,  5.767548f,  5.103741f,  3.983868f,  3.633003f,  3.458263f,
+      3.507309f,  3.247021f,  3.220612f,  3.326061f,  3.352814f,  3.291061f,
+      3.322739f,  3.444302f,  3.506207f,  3.556839f,  3.529575f,  3.457024f,
+      -0.408161f, -0.431343f, -0.454369f, -0.356419f, -0.380924f, -0.399452f,
+      -0.439476f, -0.412189f, -0.306816f, -0.008213f, -0.325813f, -0.537842f,
+      -0.984100f, -1.805332f, -2.028198f, 0.773205f,  4.423046f,  5.604839f,
+      5.231617f,  4.080299f,  3.603008f,  3.498741f,  3.517010f,  3.333897f,
+      3.381336f,  3.342617f,  3.369686f,  3.434155f,  3.490452f,  3.607029f,
+      3.555298f,  3.702297f,  3.618679f,  -0.503609f, -0.578564f, -0.419014f,
+      -0.239883f, 0.269836f,  0.022984f,  -0.455067f, -0.621777f, -0.304176f,
+      -0.163792f, -0.490250f, -0.466637f, -0.391792f, -0.657940f, -1.498035f,
+      -1.895836f, 0.036537f,  3.462456f,  5.586445f,  5.658791f,  4.434784f,
+      3.423435f,  3.318848f,  3.202328f,  3.532764f,  3.436687f,  3.354881f,
+      3.356941f,  3.382645f,  3.503902f,  3.512867f,  3.632366f,  3.537312f,
+      -0.274734f, -0.658829f, -0.726532f, -0.281254f, 0.053196f,  -0.064991f,
+      -0.608517f, -0.720966f, -0.070602f, -0.111320f, -0.440956f, -0.492180f,
+      -0.488762f, -0.569283f, -1.012741f, -1.582779f, -2.101479f, -1.392380f,
+      2.451153f,  5.555855f,  6.096313f,  5.230045f,  4.068172f,  3.404274f,
+      3.392586f,  3.326065f,  3.156670f,  3.284828f,  3.347012f,  3.319252f,
+      3.352310f,  3.610790f,  3.499847f,  -0.150600f, -0.314445f, -0.093575f,
+      -0.057384f, 0.053688f,  -0.189255f, -0.263515f, -0.318653f, 0.053246f,
+      0.080627f,  -0.119553f, -0.152454f, -0.305420f, -0.404869f, -0.385944f,
+      -0.689949f, -1.204914f, -1.985748f, -1.711361f, 1.260658f,  4.626896f,
+      5.888351f,  5.450989f,  4.070587f,  3.539200f,  3.383492f,  3.296318f,
+      3.267334f,  3.436028f,  3.463005f,  3.502625f,  3.522282f,  3.403763f,
+      -0.348049f, -0.302303f, -0.137016f, -0.041737f, -0.164001f, -0.358849f,
+      -0.469627f, -0.428291f, -0.375797f, -0.246346f, -0.118950f, -0.084229f,
+      -0.205681f, -0.241199f, -0.391796f, -0.323151f, -0.241211f, -0.834137f,
+      -1.684219f, -1.972137f, 0.448399f,  4.019985f,  5.648144f,  5.647846f,
+      4.295094f,  3.641884f,  3.374790f,  3.197342f,  3.425545f,  3.507481f,
+      3.478065f,  3.430889f,  3.341900f,  -1.016304f, -0.959221f, -0.909466f,
+      -0.810715f, -0.590729f, -0.594467f, -0.646721f, -0.629364f, -0.528561f,
+      -0.551819f, -0.301086f, -0.149101f, -0.060146f, -0.162220f, -0.326210f,
+      -0.156548f, -0.036293f, -0.426098f, -1.145470f, -1.628998f, -2.003052f,
+      -1.142891f, 2.885162f,  5.652863f,  5.718426f,  4.911140f,  3.234222f,
+      3.473373f,  3.577183f,  3.271603f,  3.410435f,  3.505489f,  3.434032f,
+      -0.508911f, -0.438797f, -0.437450f, -0.627426f, -0.511745f, -0.304874f,
+      -0.274246f, -0.261841f, -0.228466f, -0.342491f, -0.528206f, -0.490082f,
+      -0.516350f, -0.361694f, -0.398514f, -0.276020f, -0.210369f, -0.355938f,
+      -0.402622f, -0.538864f, -1.249573f, -2.100105f, -0.996178f, 1.886410f,
+      4.929745f,  5.630871f,  5.444199f,  4.042740f,  3.739189f,  3.691399f,
+      3.391956f,  3.469696f,  3.431232f,  0.204849f,  0.205433f,  -0.131927f,
+      -0.367908f, -0.374378f, -0.126820f, -0.186951f, -0.228565f, -0.081776f,
+      -0.143143f, -0.379230f, -0.598701f, -0.458019f, -0.295586f, -0.407730f,
+      -0.245853f, -0.043140f, 0.024242f,  -0.038998f, -0.044151f, -0.425991f,
+      -1.240753f, -1.943146f, -2.174755f, 0.523415f,  4.376751f,  5.956558f,
+      5.850082f,  4.403152f,  3.517399f,  3.560753f,  3.554836f,  3.471985f,
+      -0.508503f, -0.109783f, 0.057747f,  0.190079f,  -0.257153f, -0.591980f,
+      -0.666771f, -0.525391f, -0.293060f, -0.489731f, -0.304855f, -0.259644f,
+      -0.367825f, -0.346977f, -0.292889f, -0.215652f, -0.120705f, -0.176010f,
+      -0.422905f, -0.114647f, -0.289749f, -0.374203f, -0.606754f, -1.127949f,
+      -1.994583f, -0.588058f, 3.415840f,  5.603470f,  5.811581f,  4.959423f,
+      3.721760f,  3.710499f,  3.785461f,  -0.554588f, -0.565517f, -0.434578f,
+      -0.012482f, -0.284660f, -0.699795f, -0.957535f, -0.755135f, -0.382034f,
+      -0.321552f, -0.287571f, -0.279537f, -0.314972f, -0.256287f, -0.372818f,
+      -0.316017f, -0.287975f, -0.365639f, -0.512589f, -0.420692f, -0.436485f,
+      -0.295353f, -0.451958f, -0.755459f, -1.272358f, -2.301353f, -1.776161f,
+      1.572483f,  4.826286f,  5.741898f,  5.162853f,  4.028049f,  3.686325f,
+      -0.495590f, -0.664413f, -0.760044f, -0.152634f, -0.286480f, -0.340462f,
+      0.076477f,  0.187706f,  -0.068787f, -0.293491f, -0.361145f, -0.292515f,
+      -0.140671f, -0.190723f, -0.333302f, -0.368168f, -0.192581f, -0.154499f,
+      -0.236544f, -0.124405f, -0.208321f, -0.465607f, -0.883080f, -1.104813f,
+      -1.210567f, -1.415665f, -1.924683f, -1.634758f, 0.601017f,  4.276672f,
+      5.501350f,  5.331257f,  3.809288f,  -0.727722f, -0.533619f, -0.511524f,
+      -0.470688f, -0.610710f, -0.575130f, -0.311115f, -0.090420f, -0.297676f,
+      -0.646118f, -0.742805f, -0.485050f, -0.330910f, -0.275417f, -0.357037f,
+      -0.425598f, -0.481876f, -0.488941f, -0.393551f, -0.051105f, -0.090755f,
+      -0.328674f, -0.536369f, -0.533684f, -0.336960f, -0.689194f, -1.187195f,
+      -1.860954f, -2.290253f, -0.424774f, 3.050060f,  5.083332f,  5.291920f,
+      -0.343605f, -0.190975f, -0.303692f, -0.456512f, -0.681820f, -0.690693f,
+      -0.416729f, -0.286446f, -0.442055f, -0.709148f, -0.569160f, -0.382423f,
+      -0.402321f, -0.383362f, -0.366413f, -0.290718f, -0.110069f, -0.220280f,
+      -0.279018f, -0.255424f, -0.262081f, -0.487556f, -0.444492f, -0.250500f,
+      -0.119583f, -0.291557f, -0.537781f, -1.104073f, -1.737091f, -1.697441f,
+      -0.323456f, 2.042049f,  4.605103f,  -0.310631f, -0.279568f, -0.012695f,
+      -0.160130f, -0.358746f, -0.421101f, -0.559677f, -0.474136f, -0.416565f,
+      -0.561817f, -0.534672f, -0.519157f, -0.767197f, -0.605831f, -0.186523f,
+      0.219872f,  0.264984f,  -0.193432f, -0.363182f, -0.467472f, -0.462009f,
+      -0.571053f, -0.522476f, -0.315903f, -0.237427f, -0.147320f, -0.100201f,
+      -0.237568f, -0.763435f, -1.242043f, -2.135159f, -1.409485f, 1.236370f,
+      -0.474247f, -0.517906f, -0.410217f, -0.542244f, -0.795986f, -0.590004f,
+      -0.388863f, -0.462921f, -0.810627f, -0.778637f, -0.512486f, -0.718025f,
+      -0.710854f, -0.482513f, -0.318233f, -0.194962f, -0.220116f, -0.421673f,
+      -0.534233f, -0.403339f, -0.389332f, -0.407303f, -0.437355f, -0.469730f,
+      -0.359600f, -0.352745f, -0.466755f, -0.414585f, -0.430756f, -0.656822f,
+      -1.237038f, -2.046097f, -1.574898f, -0.593815f, -0.582165f, -0.336098f,
+      -0.372612f, -0.554386f, -0.410603f, -0.428276f, -0.647644f, -0.640720f,
+      -0.582207f, -0.414112f, -0.435547f, -0.435505f, -0.332561f, -0.248116f,
+      -0.340221f, -0.277855f, -0.352699f, -0.377319f, -0.230850f, -0.313267f,
+      -0.446270f, -0.346237f, -0.420422f, -0.530781f, -0.400341f, -0.463661f,
+      -0.209091f, -0.056705f, -0.011772f, -0.169388f, -0.736275f, -1.463017f,
+      -0.752701f, -0.668865f, -0.329765f, -0.299347f, -0.245667f, -0.286999f,
+      -0.520420f, -0.675438f, -0.255753f, 0.141357f,  -0.079639f, -0.419476f,
+      -0.374069f, -0.046253f, 0.116116f,  -0.145847f, -0.380371f, -0.563412f,
+      -0.638634f, -0.310116f, -0.260914f, -0.508404f, -0.465508f, -0.527824f,
+      -0.370979f, -0.305595f, -0.244694f, -0.254490f, 0.009968f,  -0.050201f,
+      -0.331219f, -0.614960f, -0.788208f, -0.483242f, -0.367516f, -0.186951f,
+      -0.180031f, 0.129711f,  -0.127811f, -0.384750f, -0.499542f, -0.418613f,
+      -0.121635f, 0.203197f,  -0.167290f, -0.397270f, -0.355461f, -0.218746f,
+      -0.376785f, -0.521698f, -0.721581f, -0.845741f, -0.535439f, -0.220882f,
+      -0.309067f, -0.555248f, -0.690342f, -0.664948f, -0.390102f, 0.020355f,
+      -0.130447f, -0.173252f, -0.170059f, -0.633663f, -0.956001f, -0.621696f,
+      -0.388302f, -0.342262f, -0.244370f, -0.386948f, -0.401421f, -0.172979f,
+      -0.206163f, -0.450058f, -0.525789f, -0.549274f, -0.349251f, -0.474613f,
+      -0.667976f, -0.435600f, -0.175369f, -0.196877f, -0.202976f, -0.242481f,
+      -0.258369f, -0.189133f, -0.395397f, -0.765499f, -0.944016f, -0.850967f,
+      -0.631561f, -0.152493f, -0.046432f, -0.262066f, -0.195919f, 0.048218f,
+      0.084972f,  0.039902f,  0.000618f,  -0.404430f, -0.447456f, -0.418076f,
+      -0.631935f, -0.717415f, -0.502888f, -0.530514f, -0.747826f, -0.704041f,
+      -0.674969f, -0.516853f, -0.418446f, -0.327740f, -0.308815f, -0.481636f,
+      -0.440083f, -0.481720f, -0.341053f, -0.283897f, -0.324368f, -0.352829f,
+      -0.434349f, -0.545589f, -0.533104f, -0.472755f, -0.570496f, -0.557735f,
+      -0.708176f, -0.493332f, -0.194416f, -0.186249f, -0.256710f, -0.271835f,
+      -0.304752f, -0.431267f, -0.422398f, -0.646725f, -0.680801f, -0.249031f,
+      -0.058567f, -0.213890f, -0.383949f, -0.540291f, -0.549877f, -0.225567f,
+      -0.037174f, -0.499874f, -0.641010f, -0.628044f, -0.390549f, -0.311497f,
+      -0.542313f, -0.569565f, -0.473408f, -0.331245f, -0.357197f, -0.285599f,
+      -0.200157f, -0.201866f, -0.124428f, -0.346016f, -0.392311f, -0.264496f,
+      -0.285370f, -0.436974f, -0.523483f, -0.410461f, -0.267925f, -0.055016f,
+      -0.382458f, -0.319771f, -0.049927f, 0.124329f,  0.266102f,  -0.106606f,
+      -0.773647f, -0.973053f, -0.708206f, -0.486137f, -0.319923f, -0.493900f,
+      -0.490860f, -0.324986f, -0.147346f, -0.146088f, -0.161758f, -0.084396f,
+      -0.379494f, 0.041626f,  -0.113361f, -0.277767f, 0.083366f,  0.126476f,
+      0.139057f,  0.038040f,  0.038162f,  -0.242126f, -0.411736f, -0.370049f,
+      -0.455357f, -0.039257f, 0.264442f,  -0.271492f, -0.425346f, -0.514847f,
+      -0.448650f, -0.580399f, -0.652603f, -0.774803f, -0.692524f, -0.579578f,
+      -0.465206f, -0.386265f, -0.458012f, -0.446594f, -0.284893f, -0.345448f,
+      -0.350876f, -0.440350f, -0.360378f, -0.270428f, 0.237213f,  -0.063602f,
+      -0.364529f, -0.179867f, 0.078197f,  0.117947f,  -0.093410f, -0.359119f,
+      -0.480961f, -0.540638f, -0.436287f, -0.598576f, -0.253735f, -0.060093f,
+      -0.549145f, -0.808327f, -0.698593f, -0.595764f, -0.582508f, -0.497353f,
+      -0.480892f, -0.584240f, -0.665791f, -0.690903f, -0.743446f, -0.796677f,
+      -0.782391f, -0.649010f, -0.628139f, -0.880848f, -0.829361f, -0.373272f,
+      -0.223667f, 0.174572f,  -0.348743f, -0.798901f, -0.692307f, -0.607609f,
+      -0.401455f, -0.480919f, -0.450798f, -0.435413f, -0.322338f, -0.228382f,
+      -0.450466f, -0.504440f, -0.477402f, -0.662224f, -0.583397f, -0.217445f,
+      -0.157459f, -0.079584f, -0.226168f, -0.488720f, -0.669624f, -0.666878f,
+      -0.565311f, -0.549625f, -0.364601f, -0.497627f, -0.736897f, -0.763023f,
+      -0.741020f, -0.404503f, 0.184814f,  -0.075315f, -0.281513f, -0.532906f,
+      -0.405800f, -0.313438f, -0.536652f, -0.403381f, 0.011967f,  0.103310f,
+      -0.269848f, -0.508656f, -0.445923f, -0.644859f, -0.617870f, -0.500927f,
+      -0.371559f, -0.125580f, 0.028625f,  -0.154713f, -0.442024f, -0.492764f,
+      -0.199371f, 0.236305f,  0.225925f,  0.075577f,  -0.285812f, -0.437145f,
+      -0.374260f, -0.156693f, -0.129635f, -0.243206f, -0.123058f, 0.162148f,
+      -0.313152f, -0.337982f, -0.358421f, 0.040070f,  0.038925f,  -0.333313f,
+      -0.351662f, 0.023014f,  0.091362f,  -0.282890f, -0.373253f, -0.389050f,
+      -0.532707f, -0.423347f, -0.349968f, -0.287045f, -0.202442f, -0.308430f,
+      -0.222801f, -0.106323f, -0.056358f, 0.027222f,  0.390732f,  0.033558f,
+      -0.160088f, -0.382217f, -0.535282f, -0.515900f, -0.022736f, 0.165665f,
+      -0.111408f, -0.233784f, -0.312357f, -0.541885f, -0.480022f, -0.482513f,
+      -0.246254f, 0.132244f,  0.090134f,  0.234634f,  -0.089249f, -0.460854f,
+      -0.515457f, -0.450874f, -0.311031f, -0.387680f, -0.360554f, -0.179241f,
+      -0.283817f, -0.475815f, -0.246399f, -0.388958f, -0.551140f, -0.496239f,
+      -0.559879f, -0.379761f, -0.254288f, -0.395111f, -0.613018f, -0.459427f,
+      -0.263580f, -0.268929f, 0.080826f,  0.115616f,  -0.097324f, -0.325310f,
+      -0.480450f, -0.313286f, -0.310371f, -0.517361f, -0.288288f, -0.112679f,
+      -0.173241f, -0.221664f, -0.039452f, -0.107578f, -0.089630f, -0.483768f,
+      -0.571087f, -0.497108f, -0.321533f, -0.375492f, -0.540363f, -0.406815f,
+      -0.388512f, -0.514561f, -0.540192f, -0.402412f, -0.232246f, -0.304749f,
+      -0.383724f, -0.679596f, -0.685463f, -0.694538f, -0.642937f, -0.425789f,
+      0.103271f,  -0.194862f, -0.487999f, -0.717281f, -0.681850f, -0.709286f,
+      -0.615398f, -0.554245f, -0.254681f, -0.049950f, -0.002914f, -0.095383f,
+      -0.370911f, -0.564224f, -0.242714f};
+  const size_t xtest = xsize / 2;
+  const size_t ytest = ysize / 2;
+
+  for (intptr_t dy = -16; dy <= 16; ++dy) {
+    float* row = in.Row(ytest + dy);
+    for (intptr_t dx = -16; dx <= 16; ++dx)
+      row[xtest + dx] = center[(dy + 16) * 33 + (dx + 16)];
+  }
+
+  const double sigma = 7.155933;
+
+  ImageF temp(xsize, ysize);
+  ImageF out_rg(xsize, ysize);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  ThreadPool* null_pool = nullptr;
+  FastGaussian(rg, in, null_pool, &temp, &out_rg);
+
+  ImageF out_old;
+  {
+    const std::vector<float> kernel =
+        GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+    printf("old kernel size %" PRIuS "\n", kernel.size());
+    out_old = Convolve(in, kernel);
+  }
+
+  printf("rg %.4f old %.4f\n", out_rg.Row(ytest)[xtest],
+         out_old.Row(ytest)[xtest]);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/gradient_test.cc b/third_party/jpeg-xl/lib/jxl/gradient_test.cc
new file mode 100644
index 0000000000..282fe89f0a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gradient_test.cc
@@ -0,0 +1,207 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+
+// Returns the signed, non-normalized distance of point (x, y) to the line
+// through (x0, y0) and (x1, y1), i.e. not divided by the length of p1 - p0.
+double PointLineDist(double x0, double y0, double x1, double y1, double x,
+                     double y) {
+  return (y1 - y0) * x - (x1 - x0) * y + x1 * y0 - y1 * x0;
+}
+
+// Generates a test image with a gradient from color0 to color1, both given in
+// hex as 0xRRGGBB. Angle is in degrees and orients a line through the image
+// center; pixel values vary with the signed distance to that line.
+Image3F GenerateTestGradient(uint32_t color0, uint32_t color1, double angle,
+                             size_t xsize, size_t ysize) {
+  Image3F image(xsize, ysize);
+
+  double x0 = xsize / 2;
+  double y0 = ysize / 2;
+  double x1 = x0 + std::sin(angle / 360.0 * 2.0 * kPi);
+  double y1 = y0 + std::cos(angle / 360.0 * 2.0 * kPi);
+
+  double maxdist =
+      std::max<double>(fabs(PointLineDist(x0, y0, x1, y1, 0, 0)),
+                       fabs(PointLineDist(x0, y0, x1, y1, xsize, 0)));
+
+  for (size_t c = 0; c < 3; ++c) {
+    float c0 = ((color0 >> (8 * (2 - c))) & 255);
+    float c1 = ((color1 >> (8 * (2 - c))) & 255);
+    for (size_t y = 0; y < ysize; ++y) {
+      float* row = image.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        double dist = PointLineDist(x0, y0, x1, y1, x, y);
+        double v = ((dist / maxdist) + 1.0) / 2.0;
+        float color = c0 * (1.0 - v) + c1 * v;
+        row[x] = color;
+      }
+    }
+  }
+
+  return image;
+}
+
+// Computes the max of the horizontal and vertical second derivative for each
+// pixel, where second derivative means absolute value of difference of left
+// delta and right delta (top/bottom for vertical direction).
+// The radius over which the derivative is computed is only 1 pixel and it only
+// checks two angles (hor and ver), but this approximation works well enough.
+static ImageF Gradient2(const ImageF& image) {
+  size_t xsize = image.xsize();
+  size_t ysize = image.ysize();
+  ImageF image2(image.xsize(), image.ysize());
+  for (size_t y = 1; y + 1 < ysize; y++) {
+    const auto* JXL_RESTRICT row0 = image.Row(y - 1);
+    const auto* JXL_RESTRICT row1 = image.Row(y);
+    const auto* JXL_RESTRICT row2 = image.Row(y + 1);
+    auto* row_out = image2.Row(y);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      float ddx = (row1[x] - row1[x - 1]) - (row1[x + 1] - row1[x]);
+      float ddy = (row1[x] - row0[x]) - (row2[x] - row1[x]);
+      row_out[x] = std::max(fabsf(ddx), fabsf(ddy));
+    }
+  }
+  // Copy to the borders
+  if (ysize > 2) {
+    auto* JXL_RESTRICT row0 = image2.Row(0);
+    const auto* JXL_RESTRICT row1 = image2.Row(1);
+    const auto* JXL_RESTRICT row2 = image2.Row(ysize - 2);
+    auto* JXL_RESTRICT row3 = image2.Row(ysize - 1);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      row0[x] = row1[x];
+      row3[x] = row2[x];
+    }
+  } else {
+    const auto* row0_in = image.Row(0);
+    const auto* row1_in = image.Row(ysize - 1);
+    auto* row0_out = image2.Row(0);
+    auto* row1_out = image2.Row(ysize - 1);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      // Image too short, take first derivative instead
+      row0_out[x] = row1_out[x] = fabsf(row0_in[x] - row1_in[x]);
+    }
+  }
+  if (xsize > 2) {
+    for (size_t y = 0; y < ysize; y++) {
+      auto* row = image2.Row(y);
+      row[0] = row[1];
+      row[xsize - 1] = row[xsize - 2];
+    }
+  } else {
+    for (size_t y = 0; y < ysize; y++) {
+      const auto* JXL_RESTRICT row_in = image.Row(y);
+      auto* row_out = image2.Row(y);
+      // Image too narrow, take first derivative instead
+      row_out[0] = row_out[xsize - 1] = fabsf(row_in[0] - row_in[xsize - 1]);
+    }
+  }
+  return image2;
+}
+
+static Image3F Gradient2(const Image3F& image) {
+  return Image3F(Gradient2(image.Plane(0)), Gradient2(image.Plane(1)),
+                 Gradient2(image.Plane(2)));
+}
+
+/*
+Tests if roundtrip with jxl on a gradient image doesn't cause banding.
+Only tests if use_gradient is true. Set to false for debugging to see the
+distance values.
+Angle in degrees, colors can be given in hex as 0xRRGGBB.
+*/
+void TestGradient(ThreadPool* pool, uint32_t color0, uint32_t color1,
+                  size_t xsize, size_t ysize, float angle, bool fast_mode,
+                  float butteraugli_distance, bool use_gradient = true) {
+  CompressParams cparams;
+  cparams.butteraugli_distance = butteraugli_distance;
+  if (fast_mode) {
+    cparams.speed_tier = SpeedTier::kSquirrel;
+  }
+  Image3F gradient = GenerateTestGradient(color0, color1, angle, xsize, ysize);
+
+  CodecInOut io;
+  io.metadata.m.SetUintSamples(8);
+  io.metadata.m.color_encoding = ColorEncoding::SRGB();
+  io.SetFromImage(std::move(gradient), io.metadata.m.color_encoding);
+
+  CodecInOut io2;
+
+  PaddedBytes compressed;
+  AuxOut* aux_out = nullptr;
+  PassesEncoderState enc_state;
+  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         aux_out, pool));
+  EXPECT_TRUE(
+      test::DecodeFile({}, Span<const uint8_t>(compressed), &io2, pool));
+  EXPECT_TRUE(
+      io2.Main().TransformTo(io2.metadata.m.color_encoding, GetJxlCms(), pool));
+
+  if (use_gradient) {
+    // Test that the gradient map worked. For that, we take a second derivative
+    // of the image with Gradient2 to measure how linear the change is in x and
+    // y direction. For a well handled gradient, we expect max values around
+    // 0.1, while if there is noticeable banding, which means the gradient map
+    // failed, the values are around 0.5-1.0 (regardless of
+    // butteraugli_distance).
+    Image3F gradient2 = Gradient2(*io2.Main().color());
+
+    std::array<float, 3> image_max;
+    Image3Max(gradient2, &image_max);
+
+    // TODO(jyrki): These values used to work with 0.2, 0.2, 0.2.
+    EXPECT_LE(image_max[0], 3.15);
+    EXPECT_LE(image_max[1], 1.72);
+    EXPECT_LE(image_max[2], 5.05);
+  }
+}
+
+static constexpr bool fast_mode = true;
+
+TEST(GradientTest, SteepGradient) {
+  test::ThreadPoolForTests pool(8);
+  // Relatively steep gradients, colors from the sky of stp.png
+  TestGradient(&pool, 0xd99d58, 0x889ab1, 512, 512, 90, fast_mode, 3.0);
+}
+
+TEST(GradientTest, SubtleGradient) {
+  test::ThreadPoolForTests pool(8);
+  // Very subtle gradient
+  TestGradient(&pool, 0xb89b7b, 0xa89b8d, 512, 512, 90, fast_mode, 4.0);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/headers.cc b/third_party/jpeg-xl/lib/jxl/headers.cc
new file mode 100644
index 0000000000..dc53726385
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/headers.cc
@@ -0,0 +1,194 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/headers.h"
+
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace {
+
+struct Rational {
+  constexpr explicit Rational(uint32_t num, uint32_t den)
+      : num(num), den(den) {}
+
+  // Returns floor(multiplicand * rational).
+  constexpr uint32_t MulTruncate(uint32_t multiplicand) const {
+    return uint64_t(multiplicand) * num / den;
+  }
+
+  uint32_t num;
+  uint32_t den;
+};
+
+Rational FixedAspectRatios(uint32_t ratio) {
+  JXL_ASSERT(0 != ratio && ratio < 8);
+  // Candidates not in the table: 7/5, 14/9, 16/10, 5/3, 21/9, 12/5
+  constexpr Rational kRatios[7] = {Rational(1, 1),    // square
+                                   Rational(12, 10),  //
+                                   Rational(4, 3),    // camera
+                                   Rational(3, 2),    // mobile camera
+                                   Rational(16, 9),   // camera/display
+                                   Rational(5, 4),    //
+                                   Rational(2, 1)};   //
+  return kRatios[ratio - 1];
+}
+
+uint32_t FindAspectRatio(uint32_t xsize, uint32_t ysize) {
+  for (uint32_t r = 1; r < 8; ++r) {
+    if (xsize == FixedAspectRatios(r).MulTruncate(ysize)) {
+      return r;
+    }
+  }
+  return 0;  // Must send xsize instead
+}
+
+}  // namespace
+
+size_t SizeHeader::xsize() const {
+  if (ratio_ != 0) {
+    return FixedAspectRatios(ratio_).MulTruncate(
+        static_cast<uint32_t>(ysize()));
+  }
+  return small_ ? ((xsize_div8_minus_1_ + 1) * 8) : xsize_;
+}
+
+Status SizeHeader::Set(size_t xsize64, size_t ysize64) {
+  if (xsize64 > 0xFFFFFFFFull || ysize64 > 0xFFFFFFFFull) {
+    return JXL_FAILURE("Image too large");
+  }
+  const uint32_t xsize32 = static_cast<uint32_t>(xsize64);
+  const uint32_t ysize32 = static_cast<uint32_t>(ysize64);
+  if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty image");
+  ratio_ = FindAspectRatio(xsize32, ysize32);
+  small_ = ysize64 <= 256 && (ysize64 % kBlockDim) == 0 &&
+           (ratio_ != 0 || (xsize64 <= 256 && (xsize64 % kBlockDim) == 0));
+  if (small_) {
+    ysize_div8_minus_1_ = ysize32 / 8 - 1;
+  } else {
+    ysize_ = ysize32;
+  }
+
+  if (ratio_ == 0) {
+    if (small_) {
+      xsize_div8_minus_1_ = xsize32 / 8 - 1;
+    } else {
+      xsize_ = xsize32;
+    }
+  }
+  JXL_ASSERT(xsize() == xsize64);
+  JXL_ASSERT(ysize() == ysize64);
+  return true;
+}
+
+Status PreviewHeader::Set(size_t xsize64, size_t ysize64) {
+  const uint32_t xsize32 = static_cast<uint32_t>(xsize64);
+  const uint32_t ysize32 = static_cast<uint32_t>(ysize64);
+  if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty preview");
+  div8_ = (xsize64 % kBlockDim) == 0 && (ysize64 % kBlockDim) == 0;
+  if (div8_) {
+    ysize_div8_ = ysize32 / 8;
+  } else {
+    ysize_ = ysize32;
+  }
+
+  ratio_ = FindAspectRatio(xsize32, ysize32);
+  if (ratio_ == 0) {
+    if (div8_) {
+      xsize_div8_ = xsize32 / 8;
+    } else {
+      xsize_ = xsize32;
+    }
+  }
+  JXL_ASSERT(xsize() == xsize64);
+  JXL_ASSERT(ysize() == ysize64);
+  return true;
+}
+
+size_t PreviewHeader::xsize() const {
+  if (ratio_ != 0) {
+    return FixedAspectRatios(ratio_).MulTruncate(
+        static_cast<uint32_t>(ysize()));
+  }
+  return div8_ ? (xsize_div8_ * 8) : xsize_;
+}
+
+SizeHeader::SizeHeader() { Bundle::Init(this); }
+Status SizeHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &small_));
+
+  if (visitor->Conditional(small_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &ysize_div8_minus_1_));
+  }
+  if (visitor->Conditional(!small_)) {
+    // (Could still be small, but non-multiple of 8.)
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1),
+                                           BitsOffset(18, 1), BitsOffset(30, 1),
+                                           1, &ysize_));
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_));
+  if (visitor->Conditional(ratio_ == 0 && small_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &xsize_div8_minus_1_));
+  }
+  if (visitor->Conditional(ratio_ == 0 && !small_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1),
+                                           BitsOffset(18, 1), BitsOffset(30, 1),
+                                           1, &xsize_));
+  }
+
+  return true;
+}
+
+PreviewHeader::PreviewHeader() { Bundle::Init(this); }
+Status PreviewHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &div8_));
+
+  if (visitor->Conditional(div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1),
+                                           BitsOffset(9, 33), 1, &ysize_div8_));
+  }
+  if (visitor->Conditional(!div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65),
+                                           BitsOffset(10, 321),
+                                           BitsOffset(12, 1345), 1, &ysize_));
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_));
+  if (visitor->Conditional(ratio_ == 0 && div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1),
+                                           BitsOffset(9, 33), 1, &xsize_div8_));
+  }
+  if (visitor->Conditional(ratio_ == 0 && !div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65),
+                                           BitsOffset(10, 321),
+                                           BitsOffset(12, 1345), 1, &xsize_));
+  }
+
+  return true;
+}
+
+AnimationHeader::AnimationHeader() { Bundle::Init(this); }
+Status AnimationHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(100), Val(1000), BitsOffset(10, 1),
+                                         BitsOffset(30, 1), 1, &tps_numerator));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Val(1001), BitsOffset(8, 1),
+                                         BitsOffset(10, 1), 1,
+                                         &tps_denominator));
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(0), Bits(3), Bits(16), Bits(32), 0, &num_loops));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_timecodes));
+  return true;
+}
+
+Status ReadSizeHeader(BitReader* JXL_RESTRICT reader,
+                      SizeHeader* JXL_RESTRICT size) {
+  return Bundle::Read(reader, size);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/headers.h b/third_party/jpeg-xl/lib/jxl/headers.h
new file mode 100644
index 0000000000..3cce84dabc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/headers.h
@@ -0,0 +1,97 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_HEADERS_H_
+#define LIB_JXL_HEADERS_H_
+
+// Codestream headers.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// Reserved by ISO/IEC 10918-1. LF causes files opened in text mode to be
+// rejected because the marker changes to 0x0D instead. The 0xFF prefix also
+// ensures there were no 7-bit transmission limitations.
+static constexpr uint8_t kCodestreamMarker = 0x0A;
+
+// Compact representation of image dimensions (best case: 9 bits) so decoders
+// can preallocate early.
+class SizeHeader : public Fields {
+ public:
+  SizeHeader();
+  JXL_FIELDS_NAME(SizeHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  Status Set(size_t xsize, size_t ysize);
+
+  size_t xsize() const;
+  size_t ysize() const {
+    return small_ ? ((ysize_div8_minus_1_ + 1) * 8) : ysize_;
+  }
+
+ private:
+  bool small_;  // ysize (and xsize if ratio_ == 0) <= 256 and divisible by 8.
+
+  uint32_t ysize_div8_minus_1_;
+  uint32_t ysize_;
+
+  uint32_t ratio_;
+  uint32_t xsize_div8_minus_1_;
+  uint32_t xsize_;
+};
+
+// (Similar to SizeHeader but different encoding because previews are smaller)
+class PreviewHeader : public Fields {
+ public:
+  PreviewHeader();
+  JXL_FIELDS_NAME(PreviewHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  Status Set(size_t xsize, size_t ysize);
+
+  size_t xsize() const;
+  size_t ysize() const { return div8_ ? (ysize_div8_ * 8) : ysize_; }
+
+ private:
+  bool div8_;  // xsize and ysize divisible by 8.
+
+  uint32_t ysize_div8_;
+  uint32_t ysize_;
+
+  uint32_t ratio_;
+  uint32_t xsize_div8_;
+  uint32_t xsize_;
+};
+
+struct AnimationHeader : public Fields {
+  AnimationHeader();
+  JXL_FIELDS_NAME(AnimationHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Ticks per second (expressed as rational number to support NTSC)
+  uint32_t tps_numerator;
+  uint32_t tps_denominator;
+
+  uint32_t num_loops;  // 0 means to repeat infinitely.
+
+  bool have_timecodes;
+};
+
+Status ReadSizeHeader(BitReader* JXL_RESTRICT reader,
+                      SizeHeader* JXL_RESTRICT size);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_HEADERS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/huffman_table.cc b/third_party/jpeg-xl/lib/jxl/huffman_table.cc
new file mode 100644
index 0000000000..9ae7865af6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/huffman_table.cc
@@ -0,0 +1,161 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/huffman_table.h"
+
+#include <cstring> /* for memcpy */
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/dec_huffman.h"
+
+namespace jxl {
+
+/* Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the
+   bit-wise reversal of the len least significant bits of key. */
+static inline int GetNextKey(int key, int len) {
+  int step = 1u << (len - 1);
+  while (key & step) {
+    step >>= 1;
+  }
+  return (key & (step - 1)) + step;
+}
+
+/* Stores code in table[0], table[step], ..., table[end - step] */
+/* Assumes that end is an integer multiple of step */
+static inline void ReplicateValue(HuffmanCode* table, int step, int end,
+                                  HuffmanCode code) {
+  do {
+    end -= step;
+    table[end] = code;
+  } while (end > 0);
+}
+
+/* Returns the table width of the next 2nd level table. count is the histogram
+   of bit lengths for the remaining symbols, len is the code length of the next
+   processed symbol */
+static inline size_t NextTableBitSize(const uint16_t* const count, size_t len,
+                                      int root_bits) {
+  size_t left = 1u << (len - root_bits);
+  while (len < PREFIX_MAX_BITS) {
+    if (left <= count[len]) break;
+    left -= count[len];
+    ++len;
+    left <<= 1;
+  }
+  return len - root_bits;
+}
+
+uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits,
+                           const uint8_t* const code_lengths,
+                           size_t code_lengths_size, uint16_t* count) {
+  HuffmanCode code;   /* current table entry */
+  HuffmanCode* table; /* next available space in table */
+  size_t len;         /* current code length */
+  size_t symbol;      /* symbol index in original or sorted table */
+  int key;            /* reversed prefix code */
+  int step;           /* step size to replicate values in current table */
+  int low;            /* low bits for current root entry */
+  int mask;           /* mask for low bits */
+  size_t table_bits;  /* key length of current table */
+  int table_size;     /* size of current table */
+  int total_size;     /* sum of root table size and 2nd level table sizes */
+  /* offsets in sorted table for each length */
+  uint16_t offset[PREFIX_MAX_BITS + 1];
+  size_t max_length = 1;
+
+  if (code_lengths_size > 1u << PREFIX_MAX_BITS) return 0;
+
+  /* symbols sorted by code length */
+  std::vector<uint16_t> sorted_storage(code_lengths_size);
+  uint16_t* sorted = sorted_storage.data();
+
+  /* generate offsets into sorted symbol table by code length */
+  {
+    uint16_t sum = 0;
+    for (len = 1; len <= PREFIX_MAX_BITS; len++) {
+      offset[len] = sum;
+      if (count[len]) {
+        sum = static_cast<uint16_t>(sum + count[len]);
+        max_length = len;
+      }
+    }
+  }
+
+  /* sort symbols by length, by symbol order within each length */
+  for (symbol = 0; symbol < code_lengths_size; symbol++) {
+    if (code_lengths[symbol] != 0) {
+      sorted[offset[code_lengths[symbol]]++] = symbol;
+    }
+  }
+
+  table = root_table;
+  table_bits = root_bits;
+  table_size = 1u << table_bits;
+  total_size = table_size;
+
+  /* special case code with only one value */
+  if (offset[PREFIX_MAX_BITS] == 1) {
+    code.bits = 0;
+    code.value = static_cast<uint16_t>(sorted[0]);
+    for (key = 0; key < total_size; ++key) {
+      table[key] = code;
+    }
+    return total_size;
+  }
+
+  /* fill in root table */
+  /* let's reduce the table size to a smaller size if possible, and */
+  /* create the repetitions by memcpy if possible in the coming loop */
+  if (table_bits > max_length) {
+    table_bits = max_length;
+    table_size = 1u << table_bits;
+  }
+  key = 0;
+  symbol = 0;
+  code.bits = 1;
+  step = 2;
+  do {
+    for (; count[code.bits] != 0; --count[code.bits]) {
+      code.value = static_cast<uint16_t>(sorted[symbol++]);
+      ReplicateValue(&table[key], step, table_size, code);
+      key = GetNextKey(key, code.bits);
+    }
+    step <<= 1;
+  } while (++code.bits <= table_bits);
+
+  /* if root_bits != table_bits we only created one fraction of the */
+  /* table, and we need to replicate it now. */
+  while (total_size != table_size) {
+    memcpy(&table[table_size], &table[0], table_size * sizeof(table[0]));
+    table_size <<= 1;
+  }
+
+  /* fill in 2nd level tables and add pointers to root table */
+  mask = total_size - 1;
+  low = -1;
+  for (len = root_bits + 1, step = 2; len <= max_length; ++len, step <<= 1) {
+    for (; count[len] != 0; --count[len]) {
+      if ((key & mask) != low) {
+        table += table_size;
+        table_bits = NextTableBitSize(count, len, root_bits);
+        table_size = 1u << table_bits;
+        total_size += table_size;
+        low = key & mask;
+        root_table[low].bits = static_cast<uint8_t>(table_bits + root_bits);
+        root_table[low].value =
+            static_cast<uint16_t>((table - root_table) - low);
+      }
+      code.bits = static_cast<uint8_t>(len - root_bits);
+      code.value = static_cast<uint16_t>(sorted[symbol++]);
+      ReplicateValue(&table[key >> root_bits], step, table_size, code);
+      key = GetNextKey(key, len);
+    }
+  }
+
+  return total_size;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/huffman_table.h b/third_party/jpeg-xl/lib/jxl/huffman_table.h
new file mode 100644
index 0000000000..11cdb2fc45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/huffman_table.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_HUFFMAN_TABLE_H_
+#define LIB_JXL_HUFFMAN_TABLE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+struct HuffmanCode {
+  uint8_t bits;   /* number of bits used for this symbol */
+  uint16_t value; /* symbol value or table offset */
+};
+
+/* Builds Huffman lookup table assuming code lengths are in symbol order. */
+/* Returns 0 in case of error (invalid tree or memory error), otherwise
+   populated size of table. */
+uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits,
+                           const uint8_t* code_lengths,
+                           size_t code_lengths_size, uint16_t* count);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_HUFFMAN_TABLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/iaca_test.cc b/third_party/jpeg-xl/lib/jxl/iaca_test.cc
new file mode 100644
index 0000000000..e25d9316d5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/iaca_test.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/iaca.h"
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(IacaTest, MarkersDefaultToDisabledAndDoNotCrash) {  // smoke test only
+  BeginIACA();  // no assertions: the test passes if neither call crashes
+  EndIACA();
+}
+
+TEST(IacaTest, ScopeDefaultToDisabledAndDoNotCrash) { ScopeIACA iaca; }  // construct + destroy only
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec.cc b/third_party/jpeg-xl/lib/jxl/icc_codec.cc
new file mode 100644
index 0000000000..f367461c0f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec.cc
@@ -0,0 +1,389 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/icc_codec.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/icc_codec_common.h"
+
+namespace jxl {
+namespace {
+
+// Shuffles or interleaves bytes, for example with width 2, turns "ABCDabcd"
+// into "AaBbCcDd". Transposes a matrix of ceil(size / width) columns and
+// width rows. There are size elements, size may be < width * height, if so the
+// last elements of the rightmost column are missing, the missing spots are
+// transposed along with the filled spots, and the result has the missing
+// elements at the end of the bottom row. The input is the input matrix in
+// scanline order but with missing elements skipped (which may occur in multiple
+// locations), the output is the result matrix in scanline order (with
+// no need to skip missing elements as they are past the end of the data).
+void Shuffle(uint8_t* data, size_t size, size_t width) {  // permutes data in place
+  size_t height = (size + width - 1) / width;  // ceil(size / width): amount of rows of output
+  PaddedBytes result(size);  // scratch buffer; copied back into data below
+  // i = output index, j = input index, s = first input index of the current pass
+  size_t s = 0, j = 0;
+  for (size_t i = 0; i < size; i++) {
+    result[i] = data[j];
+    j += height;  // consecutive outputs are `height` apart in the input
+    if (j >= size) j = ++s;  // ran past the end: restart one element later
+  }
+
+  for (size_t i = 0; i < size; i++) {  // write the permuted bytes back
+    data[i] = result[i];
+  }
+}
+
+// TODO(eustas): should be 20, or even 18, once DecodeVarInt is improved;
+//               currently DecodeVarInt does not signal the errors, and marks
+//               11 bytes as used even if only 10 are used (and 9 is enough for
+//               63-bit values).
+constexpr const size_t kPreambleSize = 22;  // enough for reading 2 VarInts
+
+}  // namespace
+
+// Mimics the beginning of UnpredictICC for quick validity check.
+// At least kPreambleSize bytes of data should be valid at invocation time.
+Status CheckPreamble(const PaddedBytes& data, size_t enc_size,
+                     size_t output_limit) {  // output_limit == 0: no limit
+  const uint8_t* enc = data.data();
+  size_t size = data.size();  // size of the prefix buffer; may be < enc_size
+  size_t pos = 0;
+  uint64_t osize = DecodeVarInt(enc, size, &pos);  // Output size
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(osize));
+  if (pos >= size) return JXL_FAILURE("Out of bounds");
+  uint64_t csize = DecodeVarInt(enc, size, &pos);  // Commands size
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(csize));
+  JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, enc_size));  // bound by the full stream, as UnpredictICC does
+  // We expect that UnpredictICC inflates input, not the other way round.
+  if (osize + 65536 < enc_size) return JXL_FAILURE("Malformed ICC");
+  if (output_limit && osize > output_limit) {
+    return JXL_FAILURE("Decoded ICC is too large");
+  }
+  return true;
+}
+
+// Decodes the result of PredictICC back to a valid ICC profile.
+Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result) {
+  if (!result->empty()) return JXL_FAILURE("result must be empty initially");
+  size_t pos = 0;
+  // TODO(lode): technically speaking we need to check that the entire varint
+  // decoding never goes out of bounds, not just the first byte. This requires
+  // a DecodeVarInt function that returns an error code. It is safe to use
+  // DecodeVarInt with out of bounds values, it silently returns, but the
+  // specification requires an error. Idem for all DecodeVarInt below.
+  if (pos >= size) return JXL_FAILURE("Out of bounds");
+  uint64_t osize = DecodeVarInt(enc, size, &pos);  // Output size
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(osize));
+  if (pos >= size) return JXL_FAILURE("Out of bounds");
+  uint64_t csize = DecodeVarInt(enc, size, &pos);  // Commands size
+  // Every command is translated to at least one byte.
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(csize));
+  size_t cpos = pos;  // pos in commands stream
+  JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size));
+  size_t commands_end = cpos + csize;
+  pos = commands_end;  // pos in data stream
+
+  // Header
+  PaddedBytes header = ICCInitialHeaderPrediction();
+  EncodeUint32(0, osize, &header);  // first 4 predicted bytes hold osize
+  for (size_t i = 0; i <= kICCHeaderSize; i++) {  // <=: end check also runs after the last header byte
+    if (result->size() == osize) {
+      if (cpos != commands_end) return JXL_FAILURE("Not all commands used");
+      if (pos != size) return JXL_FAILURE("Not all data used");
+      return true;  // Valid end
+    }
+    if (i == kICCHeaderSize) break;  // Done
+    ICCPredictHeader(result->data(), result->size(), header.data(), i);
+    if (pos >= size) return JXL_FAILURE("Out of bounds");
+    result->push_back(enc[pos++] + header[i]);  // stored byte + predicted byte
+  }
+  if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+
+  // Tag list
+  uint64_t numtags = DecodeVarInt(enc, size, &cpos);
+
+  if (numtags != 0) {  // 0 means there is no tag list at all
+    numtags--;  // the stored value is the tag count plus one
+    JXL_RETURN_IF_ERROR(CheckIs32Bit(numtags));
+    AppendUint32(numtags, result);
+    uint64_t prevtagstart = kICCHeaderSize + numtags * 12;
+    uint64_t prevtagsize = 0;
+    for (;;) {
+      if (result->size() > osize) return JXL_FAILURE("Invalid result size");
+      if (cpos > commands_end) return JXL_FAILURE("Out of bounds");
+      if (cpos == commands_end) break;  // Valid end
+      uint8_t command = enc[cpos++];
+      uint8_t tagcode = command & 63;  // low 6 bits; the top 2 bits are flags
+      Tag tag;
+      if (tagcode == 0) {
+        break;  // tagcode 0 ends the tag list
+      } else if (tagcode == kCommandTagUnknown) {
+        JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 4, size));
+        tag = DecodeKeyword(enc, size, pos);  // literal 4-byte tag from the data stream
+        pos += 4;
+      } else if (tagcode == kCommandTagTRC) {
+        tag = kRtrcTag;
+      } else if (tagcode == kCommandTagXYZ) {
+        tag = kRxyzTag;
+      } else {
+        if (tagcode - kCommandTagStringFirst >= kNumTagStrings) {
+          return JXL_FAILURE("Unknown tagcode");
+        }
+        tag = *kTagStrings[tagcode - kCommandTagStringFirst];
+      }
+      AppendKeyword(tag, result);
+
+      uint64_t tagstart;
+      uint64_t tagsize = prevtagsize;  // default: same size as the previous tag
+      if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag ||
+          tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag ||
+          tag == kLumiTag) {
+        tagsize = 20;  // default size used for these tags
+      }
+
+      if (command & kFlagBitOffset) {  // explicit offset follows
+        if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+        tagstart = DecodeVarInt(enc, size, &cpos);
+      } else {  // otherwise the tag starts right after the previous one
+        JXL_RETURN_IF_ERROR(CheckIs32Bit(prevtagstart));
+        tagstart = prevtagstart + prevtagsize;
+      }
+      JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart));
+      AppendUint32(tagstart, result);
+      if (command & kFlagBitSize) {  // explicit size follows
+        if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+        tagsize = DecodeVarInt(enc, size, &cpos);
+      }
+      JXL_RETURN_IF_ERROR(CheckIs32Bit(tagsize));
+      AppendUint32(tagsize, result);
+      prevtagstart = tagstart;
+      prevtagsize = tagsize;
+
+      if (tagcode == kCommandTagTRC) {  // also emit gTRC and bTRC with the same start and size
+        AppendKeyword(kGtrcTag, result);
+        AppendUint32(tagstart, result);
+        AppendUint32(tagsize, result);
+        AppendKeyword(kBtrcTag, result);
+        AppendUint32(tagstart, result);
+        AppendUint32(tagsize, result);
+      }
+
+      if (tagcode == kCommandTagXYZ) {  // also emit gXYZ and bXYZ placed back to back
+        JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart + tagsize * 2));
+        AppendKeyword(kGxyzTag, result);
+        AppendUint32(tagstart + tagsize, result);
+        AppendUint32(tagsize, result);
+        AppendKeyword(kBxyzTag, result);
+        AppendUint32(tagstart + tagsize * 2, result);
+        AppendUint32(tagsize, result);
+      }
+    }
+  }
+
+  // Main Content
+  for (;;) {
+    if (result->size() > osize) return JXL_FAILURE("Invalid result size");
+    if (cpos > commands_end) return JXL_FAILURE("Out of bounds");
+    if (cpos == commands_end) break;  // Valid end
+    uint8_t command = enc[cpos++];
+    if (command == kCommandInsert) {  // copy num bytes verbatim from the data stream
+      if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+      uint64_t num = DecodeVarInt(enc, size, &cpos);
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size));
+      for (size_t i = 0; i < num; i++) {
+        result->push_back(enc[pos++]);
+      }
+    } else if (command == kCommandShuffle2 || command == kCommandShuffle4) {
+      if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+      uint64_t num = DecodeVarInt(enc, size, &cpos);
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size));
+      PaddedBytes shuffled(num);  // copy of the input so enc itself is not modified
+      for (size_t i = 0; i < num; i++) {
+        shuffled[i] = enc[pos + i];
+      }
+      if (command == kCommandShuffle2) {
+        Shuffle(shuffled.data(), num, 2);
+      } else if (command == kCommandShuffle4) {
+        Shuffle(shuffled.data(), num, 4);
+      }
+      for (size_t i = 0; i < num; i++) {
+        result->push_back(shuffled[i]);
+        pos++;
+      }
+    } else if (command == kCommandPredict) {
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(cpos, 2, commands_end));
+      uint8_t flags = enc[cpos++];
+
+      size_t width = (flags & 3) + 1;  // bytes per value: 1, 2 or 4
+      if (width == 3) return JXL_FAILURE("Invalid width");
+
+      int order = (flags & 12) >> 2;  // predictor order: 0, 1 or 2
+      if (order == 3) return JXL_FAILURE("Invalid order");
+
+      uint64_t stride = width;  // default stride; bit 16 of flags selects an explicit one
+      if (flags & 16) {
+        if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+        stride = DecodeVarInt(enc, size, &cpos);
+        if (stride < width) {
+          return JXL_FAILURE("Invalid stride");
+        }
+      }
+      // If stride * 4 >= result->size(), return failure. The check
+      // "size == 0 || ((size - 1) >> 2) < stride" corresponds to
+      // "stride * 4 >= size", but does not suffer from integer overflow.
+      // This check is more strict than necessary but follows the specification
+      // and the encoder should ensure this is followed.
+      if (result->empty() || ((result->size() - 1u) >> 2u) < stride) {
+        return JXL_FAILURE("Invalid stride");
+      }
+
+      if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+      uint64_t num = DecodeVarInt(enc, size, &cpos);  // in bytes
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size));
+
+      PaddedBytes shuffled(num);
+      for (size_t i = 0; i < num; i++) {
+        shuffled[i] = enc[pos + i];
+      }
+      if (width > 1) Shuffle(shuffled.data(), num, width);
+
+      size_t start = result->size();  // output offset where the predicted run begins
+      for (size_t i = 0; i < num; i++) {
+        uint8_t predicted = LinearPredictICCValue(result->data(), start, i,
+                                                  stride, width, order);
+        result->push_back(predicted + shuffled[i]);  // prediction + stored residual
+      }
+      pos += num;
+    } else if (command == kCommandXYZ) {  // "XYZ " + 4 zero bytes + 12 data bytes
+      AppendKeyword(kXyz_Tag, result);
+      for (int i = 0; i < 4; i++) result->push_back(0);
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 12, size));
+      for (size_t i = 0; i < 12; i++) {
+        result->push_back(enc[pos++]);
+      }
+    } else if (command >= kCommandTypeStartFirst &&
+               command < kCommandTypeStartFirst + kNumTypeStrings) {
+      AppendKeyword(*kTypeStrings[command - kCommandTypeStartFirst], result);
+      for (size_t i = 0; i < 4; i++) {  // keyword is followed by 4 zero bytes
+        result->push_back(0);
+      }
+    } else {
+      return JXL_FAILURE("Unknown command");
+    }
+  }
+
+  if (pos != size) return JXL_FAILURE("Not all data used");
+  if (result->size() != osize) return JXL_FAILURE("Invalid result size");
+
+  return true;
+}
+
+Status ICCReader::Init(BitReader* reader, size_t output_limit) {
+  JXL_RETURN_IF_ERROR(CheckEOI(reader));
+  used_bits_base_ = reader->TotalBitsConsumed();  // all bit counts below are relative to this
+  if (bits_to_skip_ == 0) {  // no progress recorded yet: start decoding
+    enc_size_ = U64Coder::Read(reader);
+    if (enc_size_ > 268435456) {  // 2^28 bytes
+      // Avoid too large memory allocation for invalid file.
+      return JXL_FAILURE("Too large encoded profile");
+    }
+    JXL_RETURN_IF_ERROR(
+        DecodeHistograms(reader, kNumICCContexts, &code_, &context_map_));
+    ans_reader_ = ANSSymbolReader(&code_, reader);
+    i_ = 0;
+    decompressed_.resize(std::min<size_t>(i_ + 0x400, enc_size_));  // at most 0x400 bytes for now
+    for (; i_ < std::min<size_t>(2, enc_size_); i_++) {  // first two bytes: missing history counts as 0
+      decompressed_[i_] = ans_reader_.ReadHybridUint(
+          ICCANSContext(i_, i_ > 0 ? decompressed_[i_ - 1] : 0,
+                        i_ > 1 ? decompressed_[i_ - 2] : 0),
+          reader, context_map_);
+    }
+    if (enc_size_ > kPreambleSize) {  // shorter streams skip the early preamble check
+      for (; i_ < kPreambleSize; i_++) {
+        decompressed_[i_] = ans_reader_.ReadHybridUint(
+            ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]),
+            reader, context_map_);
+      }
+      JXL_RETURN_IF_ERROR(CheckEOI(reader));
+      JXL_RETURN_IF_ERROR(
+          CheckPreamble(decompressed_, enc_size_, output_limit));
+    }
+    bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_;
+  } else {  // progress was recorded earlier: skip the bits already consumed
+    reader->SkipBits(bits_to_skip_);
+  }
+  return true;
+}
+
+Status ICCReader::Process(BitReader* reader, PaddedBytes* icc) {
+  ANSSymbolReader::Checkpoint checkpoint;
+  size_t saved_i = 0;
+  auto save = [&]() {  // records ANS state, bit position and i_ as the resume point
+    ans_reader_.Save(&checkpoint);
+    bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_;
+    saved_i = i_;
+  };
+  save();
+  auto check_and_restore = [&]() {  // on input underrun, rewinds to the last save point
+    Status status = CheckEOI(reader);
+    if (!status) {
+      // not enough bytes.
+      ans_reader_.Restore(checkpoint);
+      i_ = saved_i;
+      return status;
+    }
+    return Status(true);
+  };
+  for (; i_ < enc_size_; i_++) {
+    if (i_ % ANSSymbolReader::kMaxCheckpointInterval == 0 && i_ > 0) {
+      JXL_RETURN_IF_ERROR(check_and_restore());
+      save();
+      if ((i_ > 0) && (((i_ & 0xFFFF) == 0))) {  // every 65536 decoded bytes
+        float used_bytes =
+            (reader->TotalBitsConsumed() - used_bits_base_) / 8.0f;
+        if (i_ > used_bytes * 256) return JXL_FAILURE("Corrupted stream");  // over 256 output bytes per input byte
+      }
+      decompressed_.resize(std::min<size_t>(i_ + 0x400, enc_size_));  // grow in 0x400-byte steps
+    }
+    JXL_DASSERT(i_ >= 2);  // the context below reads the two previous bytes
+    decompressed_[i_] = ans_reader_.ReadHybridUint(
+        ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]), reader,
+        context_map_);
+  }
+  JXL_RETURN_IF_ERROR(check_and_restore());
+  bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_;
+  if (!ans_reader_.CheckANSFinalState()) {
+    return JXL_FAILURE("Corrupted ICC profile");
+  }
+
+  icc->clear();  // the output replaces whatever icc held before
+  return UnpredictICC(decompressed_.data(), decompressed_.size(), icc);
+}
+
+Status ICCReader::CheckEOI(BitReader* reader) {  // true unless a read went out of bounds
+  if (reader->AllReadsWithinBounds()) return true;
+  return JXL_STATUS(StatusCode::kNotEnoughBytes,  // distinct code from a plain failure
+                    "Not enough bytes for reading ICC profile");
+}
+
+Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc,
+               size_t output_limit) {  // one-shot decode with a fresh ICCReader
+  ICCReader icc_reader;
+  JXL_RETURN_IF_ERROR(icc_reader.Init(reader, output_limit));  // size, histograms, preamble check
+  JXL_RETURN_IF_ERROR(icc_reader.Process(reader, icc));  // remaining bytes + UnpredictICC
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec.h b/third_party/jpeg-xl/lib/jxl/icc_codec.h
new file mode 100644
index 0000000000..a6c7477c60
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec.h
@@ -0,0 +1,57 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ICC_CODEC_H_
+#define LIB_JXL_ICC_CODEC_H_
+
+// Compressed representation of ICC profiles.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+
+namespace jxl {
+
+struct ICCReader {  // decoder state for an entropy-coded ICC profile
+  Status Init(BitReader* reader, size_t output_limit);  // output_limit == 0: no limit
+  Status Process(BitReader* reader, PaddedBytes* icc);  // clears icc before writing the result
+  void Reset() {  // clears recorded progress so the next Init decodes from the start
+    bits_to_skip_ = 0;
+    decompressed_.clear();
+  }
+
+ private:
+  Status CheckEOI(BitReader* reader);  // kNotEnoughBytes if a read went out of bounds
+  size_t i_ = 0;  // index of the next byte of decompressed_ to decode
+  size_t bits_to_skip_ = 0;  // bits consumed since used_bits_base_ at the last save point
+  size_t used_bits_base_ = 0;  // reader->TotalBitsConsumed() sampled at the start of Init
+  uint64_t enc_size_ = 0;  // size in bytes of the encoded stream, read in Init
+  std::vector<uint8_t> context_map_;
+  ANSCode code_;
+  ANSSymbolReader ans_reader_;
+  PaddedBytes decompressed_;  // entropy-decoded bytes; input to UnpredictICC
+};
+
+// `icc` may be empty afterwards - if so, call CreateProfile. Does not append,
+// clears any original data that was in icc.
+// If `output_limit` is not 0, then returns error if resulting profile would be
+// longer than `output_limit`
+Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc,
+               size_t output_limit = 0);
+
+// Exposed only for testing
+Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result);
+
+// Exposed only for testing
+Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ICC_CODEC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
new file mode 100644
index 0000000000..212387e78f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
@@ -0,0 +1,190 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/icc_codec_common.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace {
+static uint8_t ByteKind1(uint8_t b) {  // classifies a byte into one of 8 kinds (0..7)
+  if ('a' <= b && b <= 'z') return 0;  // ASCII letters
+  if ('A' <= b && b <= 'Z') return 0;
+  if ('0' <= b && b <= '9') return 1;  // digits and '.' / ','
+  if (b == '.' || b == ',') return 1;
+  if (b == 0) return 2;
+  if (b == 1) return 3;
+  if (b < 16) return 4;  // 2..15 (0 and 1 handled above)
+  if (b == 255) return 6;
+  if (b > 240) return 5;  // 241..254 (255 handled above)
+  return 7;  // everything else
+}
+
+static uint8_t ByteKind2(uint8_t b) {  // coarser classification: 5 kinds (0..4)
+  if ('a' <= b && b <= 'z') return 0;  // ASCII letters
+  if ('A' <= b && b <= 'Z') return 0;
+  if ('0' <= b && b <= '9') return 1;  // digits and '.' / ','
+  if (b == '.' || b == ',') return 1;
+  if (b < 16) return 2;  // 0..15
+  if (b > 240) return 3;  // 241..255
+  return 4;  // everything else
+}
+
+template <typename T>  // instantiated below with uint8_t, uint16_t and uint32_t
+T PredictValue(T p1, T p2, T p3, int order) {  // p1 is the nearest previous value, p3 the farthest
+  if (order == 0) return p1;  // repeat the previous value
+  if (order == 1) return 2 * p1 - p2;  // linear extrapolation; result is converted back to T
+  if (order == 2) return 3 * p1 - 3 * p2 + p3;  // quadratic extrapolation
+  return 0;  // any other order predicts 0
+}
+}  // namespace
+
+uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos) {  // 0 if out of range
+  return (size < 4 || pos > size - 4) ? 0 : LoadBE32(data + pos);  // same as pos + 4 > size, but cannot wrap
+}
+
+void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data) {  // no-op if out of range
+  if (data->size() < 4 || pos > data->size() - 4) return;  // same as pos + 4 > size, but cannot wrap
+  StoreBE32(value, data->data() + pos);
+}
+
+void AppendUint32(uint32_t value, PaddedBytes* data) {  // grows data by 4 bytes
+  data->resize(data->size() + 4);
+  EncodeUint32(data->size() - 4, value, data);  // fill the 4 bytes just added
+}
+
+typedef std::array<uint8_t, 4> Tag;
+
+Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos) {  // 4 spaces if out of range
+  if (size < 4 || pos > size - 4) return {{' ', ' ', ' ', ' '}};  // same as pos + 4 > size, but cannot wrap
+  return {{data[pos], data[pos + 1], data[pos + 2], data[pos + 3]}};
+}
+
+void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos) {
+  if (keyword.size() != 4 || size < 4 || pos > size - 4) return;  // no-op if out of range; cannot wrap
+  for (size_t i = 0; i < 4; ++i) data[pos + i] = keyword[i];
+}
+
+void AppendKeyword(const Tag& keyword, PaddedBytes* data) {  // appends the 4 keyword bytes
+  JXL_ASSERT(keyword.size() == 4);
+  data->append(keyword);
+}
+
+// Checks if a + b > size, taking possible integer overflow into account.
+Status CheckOutOfBounds(size_t a, size_t b, size_t size) {  // ok iff a + b <= size without wrapping
+  size_t pos = a + b;  // may wrap around; caught by the second check
+  if (pos > size) return JXL_FAILURE("Out of bounds");
+  if (pos < a) return JXL_FAILURE("Out of bounds");  // overflow happened
+  return true;
+}
+
+Status CheckIs32Bit(uint64_t v) {  // fails unless v fits in 32 bits
+  static constexpr const uint64_t kUpper32 = ~static_cast<uint64_t>(0xFFFFFFFF);
+  if ((v & kUpper32) != 0) return JXL_FAILURE("32-bit value expected");  // a high bit is set
+  return true;
+}
+
+PaddedBytes ICCInitialHeaderPrediction() {  // field names below follow the ICC header layout
+  PaddedBytes result(kICCHeaderSize);
+  for (size_t i = 0; i < kICCHeaderSize; i++) {  // everything not set below predicts 0
+    result[i] = 0;
+  }
+  result[8] = 4;  // profile version, major byte
+  EncodeKeyword(kMntrTag, result.data(), result.size(), 12);  // device class
+  EncodeKeyword(kRgb_Tag, result.data(), result.size(), 16);  // data colour space
+  EncodeKeyword(kXyz_Tag, result.data(), result.size(), 20);  // profile connection space
+  EncodeKeyword(kAcspTag, result.data(), result.size(), 36);  // file signature
+  result[68] = 0;  // bytes 68..79: illuminant XYZ, three big-endian 32-bit values
+  result[69] = 0;
+  result[70] = 246;
+  result[71] = 214;  // X = 0x0000F6D6
+  result[72] = 0;
+  result[73] = 1;
+  result[74] = 0;
+  result[75] = 0;  // Y = 0x00010000
+  result[76] = 0;
+  result[77] = 0;
+  result[78] = 211;
+  result[79] = 45;  // Z = 0x0000D32D
+  return result;
+}
+
+void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header,
+                      size_t pos) {  // refines header[] from the first `size` bytes of icc
+  if (pos == 8 && size >= 8) {  // predict bytes 80..83 to repeat bytes 4..7
+    header[80] = icc[4];
+    header[81] = icc[5];
+    header[82] = icc[6];
+    header[83] = icc[7];
+  }
+  if (pos == 41 && size >= 41) {  // byte 40 known: guess the rest of the 4-byte field at 40
+    if (icc[40] == 'A') {  // "APPL"
+      header[41] = 'P';
+      header[42] = 'P';
+      header[43] = 'L';
+    }
+    if (icc[40] == 'M') {  // "MSFT"
+      header[41] = 'S';
+      header[42] = 'F';
+      header[43] = 'T';
+    }
+  }
+  if (pos == 42 && size >= 42) {  // bytes 40..41 known: disambiguate the 'S' cases
+    if (icc[40] == 'S' && icc[41] == 'G') {  // "SGI "
+      header[42] = 'I';
+      header[43] = ' ';
+    }
+    if (icc[40] == 'S' && icc[41] == 'U') {  // "SUNW"
+      header[42] = 'N';
+      header[43] = 'W';
+    }
+  }
+}
+
+// Predicts a value with linear prediction of given order (0-2), for integers
+// with width bytes and given stride in bytes between values.
+// The start position is at start + i, and the relevant modulus of i describes
+// which byte of the multi-byte integer is being handled.
+// The value start + i must be at least stride * 4.
+uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i,
+                              size_t stride, size_t width, int order) {
+  size_t pos = start + i;  // index of the byte being predicted
+  if (width == 1) {  // single bytes: predict from the bytes 1, 2 and 3 strides back
+    uint8_t p1 = data[pos - stride];
+    uint8_t p2 = data[pos - stride * 2];
+    uint8_t p3 = data[pos - stride * 3];
+    return PredictValue(p1, p2, p3, order);
+  } else if (width == 2) {
+    size_t p = start + (i & ~1);  // first byte of the 2-byte value containing byte i
+    uint16_t p1 = (data[p - stride * 1] << 8) + data[p - stride * 1 + 1];  // big-endian 16-bit
+    uint16_t p2 = (data[p - stride * 2] << 8) + data[p - stride * 2 + 1];
+    uint16_t p3 = (data[p - stride * 3] << 8) + data[p - stride * 3 + 1];
+    uint16_t pred = PredictValue(p1, p2, p3, order);
+    return (i & 1) ? (pred & 255) : ((pred >> 8) & 255);  // odd i: low byte, even i: high byte
+  } else {  // any other width is handled as 4-byte values
+    size_t p = start + (i & ~3);  // first byte of the 4-byte value containing byte i
+    uint32_t p1 = DecodeUint32(data, pos, p - stride);  // pos is passed as the readable size
+    uint32_t p2 = DecodeUint32(data, pos, p - stride * 2);
+    uint32_t p3 = DecodeUint32(data, pos, p - stride * 3);
+    uint32_t pred = PredictValue(p1, p2, p3, order);
+    unsigned shiftbytes = 3 - (i & 3);  // i & 3 == 0 selects the most significant byte
+    return (pred >> (shiftbytes * 8)) & 255;
+  }
+}
+
+size_t ICCANSContext(size_t i, size_t b1, size_t b2) {  // b1, b2: the bytes 1 and 2 positions back
+  if (i <= 128) return 0;  // positions 0..128 all use context 0
+  return 1 + ByteKind1(b1) + ByteKind2(b2) * 8;  // 1..40, given kinds 0..7 and 0..4
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_common.h b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h
new file mode 100644
index 0000000000..e91e908669
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h
@@ -0,0 +1,106 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ICC_CODEC_COMMON_H_
+#define LIB_JXL_ICC_CODEC_COMMON_H_
+
+// Compressed representation of ICC profiles.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+static constexpr size_t kICCHeaderSize = 128;
+
+typedef std::array<uint8_t, 4> Tag;
+
+static const Tag kAcspTag = {{'a', 'c', 's', 'p'}};
+static const Tag kBkptTag = {{'b', 'k', 'p', 't'}};
+static const Tag kBtrcTag = {{'b', 'T', 'R', 'C'}};
+static const Tag kBxyzTag = {{'b', 'X', 'Y', 'Z'}};
+static const Tag kChadTag = {{'c', 'h', 'a', 'd'}};
+static const Tag kChrmTag = {{'c', 'h', 'r', 'm'}};
+static const Tag kCprtTag = {{'c', 'p', 'r', 't'}};
+static const Tag kCurvTag = {{'c', 'u', 'r', 'v'}};
+static const Tag kDescTag = {{'d', 'e', 's', 'c'}};
+static const Tag kDmddTag = {{'d', 'm', 'd', 'd'}};
+static const Tag kDmndTag = {{'d', 'm', 'n', 'd'}};
+static const Tag kGbd_Tag = {{'g', 'b', 'd', ' '}};
+static const Tag kGtrcTag = {{'g', 'T', 'R', 'C'}};
+static const Tag kGxyzTag = {{'g', 'X', 'Y', 'Z'}};
+static const Tag kKtrcTag = {{'k', 'T', 'R', 'C'}};
+static const Tag kKxyzTag = {{'k', 'X', 'Y', 'Z'}};
+static const Tag kLumiTag = {{'l', 'u', 'm', 'i'}};
+static const Tag kMab_Tag = {{'m', 'A', 'B', ' '}};
+static const Tag kMba_Tag = {{'m', 'B', 'A', ' '}};
+static const Tag kMlucTag = {{'m', 'l', 'u', 'c'}};
+static const Tag kMntrTag = {{'m', 'n', 't', 'r'}};
+static const Tag kParaTag = {{'p', 'a', 'r', 'a'}};
+static const Tag kRgb_Tag = {{'R', 'G', 'B', ' '}};
+static const Tag kRtrcTag = {{'r', 'T', 'R', 'C'}};
+static const Tag kRxyzTag = {{'r', 'X', 'Y', 'Z'}};
+static const Tag kSf32Tag = {{'s', 'f', '3', '2'}};
+static const Tag kTextTag = {{'t', 'e', 'x', 't'}};
+static const Tag kVcgtTag = {{'v', 'c', 'g', 't'}};
+static const Tag kWtptTag = {{'w', 't', 'p', 't'}};
+static const Tag kXyz_Tag = {{'X', 'Y', 'Z', ' '}};
+
+// Tag names focused on RGB and GRAY monitor profiles
+static constexpr size_t kNumTagStrings = 17;
+static constexpr const Tag* kTagStrings[kNumTagStrings] = {
+    &kCprtTag, &kWtptTag, &kBkptTag, &kRxyzTag, &kGxyzTag, &kBxyzTag,
+    &kKxyzTag, &kRtrcTag, &kGtrcTag, &kBtrcTag, &kKtrcTag, &kChadTag,
+    &kDescTag, &kChrmTag, &kDmndTag, &kDmddTag, &kLumiTag};
+
+static constexpr size_t kCommandTagUnknown = 1;
+static constexpr size_t kCommandTagTRC = 2;
+static constexpr size_t kCommandTagXYZ = 3;
+static constexpr size_t kCommandTagStringFirst = 4;
+
+// Tag types focused on RGB and GRAY monitor profiles
+static constexpr size_t kNumTypeStrings = 8;
+static constexpr const Tag* kTypeStrings[kNumTypeStrings] = {
+    &kXyz_Tag, &kDescTag, &kTextTag, &kMlucTag,
+    &kParaTag, &kCurvTag, &kSf32Tag, &kGbd_Tag};
+
+static constexpr size_t kCommandInsert = 1;
+static constexpr size_t kCommandShuffle2 = 2;
+static constexpr size_t kCommandShuffle4 = 3;
+static constexpr size_t kCommandPredict = 4;
+static constexpr size_t kCommandXYZ = 10;
+static constexpr size_t kCommandTypeStartFirst = 16;
+
+static constexpr size_t kFlagBitOffset = 64;
+static constexpr size_t kFlagBitSize = 128;
+
+static constexpr size_t kNumICCContexts = 41;
+
+uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos);
+void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data);
+void AppendUint32(uint32_t value, PaddedBytes* data);
+Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos);
+void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos);
+void AppendKeyword(const Tag& keyword, PaddedBytes* data);
+
+// Checks if a + b > size, taking possible integer overflow into account.
+Status CheckOutOfBounds(size_t a, size_t b, size_t size);
+Status CheckIs32Bit(uint64_t v);
+
+PaddedBytes ICCInitialHeaderPrediction();
+void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header,
+                      size_t pos);
+uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i,
+                              size_t stride, size_t width, int order);
+size_t ICCANSContext(size_t i, size_t b1, size_t b2);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ICC_CODEC_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc b/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc
new file mode 100644
index 0000000000..af02094e99
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc
@@ -0,0 +1,207 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/icc_codec.h"
+
+#include <string>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void TestProfile(const PaddedBytes& icc) {
+  // Round trip: encode `icc`, decode the bitstream, compare byte by byte.
+  BitWriter writer;
+  ASSERT_TRUE(WriteICC(icc, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  PaddedBytes dec;
+  BitReader reader(writer.GetSpan());
+  ASSERT_TRUE(ReadICC(&reader, &dec));
+  ASSERT_TRUE(reader.Close());
+  EXPECT_EQ(icc.size(), dec.size());
+  if (icc.size() != dec.size()) return;
+  for (size_t pos = 0; pos < icc.size(); ++pos) {
+    EXPECT_EQ(icc[pos], dec[pos]);
+    if (icc[pos] != dec[pos]) break;  // One output is enough
+  }
+}
+
+void TestProfile(const std::string& icc) {
+  // Copy the characters into a PaddedBytes and reuse the byte overload.
+  PaddedBytes profile_bytes(icc.size());
+  size_t pos = 0;
+  for (const char ch : icc) profile_bytes[pos++] = ch;
+  TestProfile(profile_bytes);
+}
+
+// Valid profile from one of the images output by the decoder.
+static const unsigned char kTestProfile[] = {
+    0x00, 0x00, 0x03, 0x80, 0x6c, 0x63, 0x6d, 0x73, 0x04, 0x30, 0x00, 0x00,
+    0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20,
+    0x07, 0xe3, 0x00, 0x04, 0x00, 0x1d, 0x00, 0x0f, 0x00, 0x32, 0x00, 0x2e,
+    0x61, 0x63, 0x73, 0x70, 0x41, 0x50, 0x50, 0x4c, 0x00, 0x00, 0x00, 0x01,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xf6, 0xd6,
+    0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x6c, 0x63, 0x6d, 0x73,
+    0x5f, 0x07, 0x0d, 0x3e, 0x4d, 0x32, 0xf2, 0x6e, 0x5d, 0x77, 0x26, 0xcc,
+    0x23, 0xb0, 0x6a, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d,
+    0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x42,
+    0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x64, 0x00, 0x00, 0x01, 0x00,
+    0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x02, 0x64, 0x00, 0x00, 0x00, 0x14,
+    0x63, 0x68, 0x61, 0x64, 0x00, 0x00, 0x02, 0x78, 0x00, 0x00, 0x00, 0x2c,
+    0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x14,
+    0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xb8, 0x00, 0x00, 0x00, 0x14,
+    0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xcc, 0x00, 0x00, 0x00, 0x14,
+    0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20,
+    0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20,
+    0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20,
+    0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x24,
+    0x64, 0x6d, 0x6e, 0x64, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00, 0x28,
+    0x64, 0x6d, 0x64, 0x64, 0x00, 0x00, 0x03, 0x4c, 0x00, 0x00, 0x00, 0x32,
+    0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+    0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x26,
+    0x00, 0x00, 0x00, 0x1c, 0x00, 0x52, 0x00, 0x47, 0x00, 0x42, 0x00, 0x5f,
+    0x00, 0x44, 0x00, 0x36, 0x00, 0x35, 0x00, 0x5f, 0x00, 0x53, 0x00, 0x52,
+    0x00, 0x47, 0x00, 0x5f, 0x00, 0x52, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x5f,
+    0x00, 0x37, 0x00, 0x30, 0x00, 0x39, 0x00, 0x00, 0x6d, 0x6c, 0x75, 0x63,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c,
+    0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x1c,
+    0x00, 0x43, 0x00, 0x6f, 0x00, 0x70, 0x00, 0x79, 0x00, 0x72, 0x00, 0x69,
+    0x00, 0x67, 0x00, 0x68, 0x00, 0x74, 0x00, 0x20, 0x00, 0x32, 0x00, 0x30,
+    0x00, 0x31, 0x00, 0x38, 0x00, 0x20, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f,
+    0x00, 0x67, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x20, 0x00, 0x4c, 0x00, 0x4c,
+    0x00, 0x43, 0x00, 0x2c, 0x00, 0x20, 0x00, 0x43, 0x00, 0x43, 0x00, 0x2d,
+    0x00, 0x42, 0x00, 0x59, 0x00, 0x2d, 0x00, 0x53, 0x00, 0x41, 0x00, 0x20,
+    0x00, 0x33, 0x00, 0x2e, 0x00, 0x30, 0x00, 0x20, 0x00, 0x55, 0x00, 0x6e,
+    0x00, 0x70, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x74, 0x00, 0x65, 0x00, 0x64,
+    0x00, 0x20, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e,
+    0x00, 0x73, 0x00, 0x65, 0x00, 0x28, 0x00, 0x68, 0x00, 0x74, 0x00, 0x74,
+    0x00, 0x70, 0x00, 0x73, 0x00, 0x3a, 0x00, 0x2f, 0x00, 0x2f, 0x00, 0x63,
+    0x00, 0x72, 0x00, 0x65, 0x00, 0x61, 0x00, 0x74, 0x00, 0x69, 0x00, 0x76,
+    0x00, 0x65, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x6d, 0x00, 0x6d, 0x00, 0x6f,
+    0x00, 0x6e, 0x00, 0x73, 0x00, 0x2e, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x67,
+    0x00, 0x2f, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e,
+    0x00, 0x73, 0x00, 0x65, 0x00, 0x73, 0x00, 0x2f, 0x00, 0x62, 0x00, 0x79,
+    0x00, 0x2d, 0x00, 0x73, 0x00, 0x61, 0x00, 0x2f, 0x00, 0x33, 0x00, 0x2e,
+    0x00, 0x30, 0x00, 0x2f, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x67, 0x00, 0x61,
+    0x00, 0x6c, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x64, 0x00, 0x65, 0x00, 0x29,
+    0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6,
+    0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x73, 0x66, 0x33, 0x32,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x0c, 0x42, 0x00, 0x00, 0x05, 0xde,
+    0xff, 0xff, 0xf3, 0x25, 0x00, 0x00, 0x07, 0x93, 0x00, 0x00, 0xfd, 0x90,
+    0xff, 0xff, 0xfb, 0xa1, 0xff, 0xff, 0xfd, 0xa2, 0x00, 0x00, 0x03, 0xdc,
+    0x00, 0x00, 0xc0, 0x6e, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x6f, 0xa0, 0x00, 0x00, 0x38, 0xf5, 0x00, 0x00, 0x03, 0x90,
+    0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x9f,
+    0x00, 0x00, 0x0f, 0x84, 0x00, 0x00, 0xb6, 0xc4, 0x58, 0x59, 0x5a, 0x20,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x97, 0x00, 0x00, 0xb7, 0x87,
+    0x00, 0x00, 0x18, 0xd9, 0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x38, 0xe4, 0x00, 0x00, 0xe8, 0xf0,
+    0x00, 0x00, 0x17, 0x10, 0x00, 0x00, 0x38, 0xe4, 0x00, 0x00, 0x14, 0xbc,
+    0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00,
+    0x00, 0x00, 0xa3, 0xd7, 0x00, 0x00, 0x54, 0x7c, 0x00, 0x00, 0x4c, 0xcd,
+    0x00, 0x00, 0x99, 0x9a, 0x00, 0x00, 0x26, 0x67, 0x00, 0x00, 0x0f, 0x5c,
+    0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+    0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x0c,
+    0x00, 0x00, 0x00, 0x1c, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x67,
+    0x00, 0x6c, 0x00, 0x65, 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53,
+    0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x49, 0x00, 0x6d,
+    0x00, 0x61, 0x00, 0x67, 0x00, 0x65, 0x00, 0x20, 0x00, 0x63, 0x00, 0x6f,
+    0x00, 0x64, 0x00, 0x65, 0x00, 0x63, 0x00, 0x00,
+};
+
+}  // namespace
+
+TEST(IccCodecTest, Icc) {
+  // Empty string cannot be tested, encoder checks against writing it.
+  TestProfile("a");  // Very short inputs: 1, 2 and 4 bytes.
+  TestProfile("ab");
+  TestProfile("aaaa");
+
+  {
+    // Exactly the ICC header size (128 bytes), all zero.
+    PaddedBytes profile(128);
+    for (size_t i = 0; i < 128; i++) {
+      profile[i] = 0;
+    }
+    TestProfile(profile);
+  }
+
+  {
+    PaddedBytes profile;  // The complete kTestProfile.
+    profile.append(kTestProfile, kTestProfile + sizeof(kTestProfile));
+    TestProfile(profile);
+  }
+
+  // Test every prefix of the full profile with a length of 1 through 257.
+  {
+    PaddedBytes profile;
+    for (size_t i = 0; i <= 256; i++) {
+      profile.push_back(kTestProfile[i]);
+      TestProfile(profile);
+    }
+  }
+}
+
+// kTestProfile after encoding with the ICC codec
+static const unsigned char kEncodedTestProfile[] = {
+    0x1f, 0x8b, 0x1,  0x13, 0x10, 0x0,  0x0,  0x0,  0x20, 0x4c, 0xcc, 0x3,
+    0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26,
+    0x57, 0x54, 0xef, 0x0,  0xe8, 0x97, 0x2,  0xce, 0xa1, 0xd7, 0x85, 0x16,
+    0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c,
+    0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4,  0x75, 0x12, 0xc9, 0xcc,
+    0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae,
+    0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd,  0x40,
+    0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7,  0xa6, 0xb9,
+    0x27, 0x92, 0x38, 0x0,  0x3,  0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf,
+    0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66,
+    0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4,
+    0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd,
+    0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f,
+    0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93,
+    0x92, 0x3,  0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7,
+    0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7,  0x38, 0x38, 0xd4, 0xa,  0x66,
+    0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb,  0x7a, 0x24,
+    0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3,
+    0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8,  0xa8, 0xc4, 0x2a, 0x86,
+    0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6,
+    0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9,
+    0x8,  0x98, 0xe1, 0x21, 0x4a, 0x9,  0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0,
+    0x69, 0x1a, 0xeb, 0x52, 0x1,  0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29,
+    0x70, 0xee, 0x4,  0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff,
+    0xfe, 0xdb, 0xaf, 0x8,  0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8,
+    0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8,  0xe9, 0x7,  0xee, 0x4b, 0x80, 0xda,
+    0x4a, 0x4,  0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60,
+    0xb,  0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6,  0x20, 0xb8, 0x64, 0x18,
+    0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b,
+    0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9,  0xf7,
+    0x72, 0xf0, 0x7a, 0xe,  0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc,
+    0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28,
+    0xc1, 0xa7, 0x59, 0xe3, 0x0,
+};
+
+// Tests that the decoded kEncodedTestProfile matches kTestProfile.
+TEST(IccCodecTest, EncodedIccProfile) {
+  jxl::BitReader reader(jxl::Span<const uint8_t>(kEncodedTestProfile,
+                                                 sizeof(kEncodedTestProfile)));
+  jxl::PaddedBytes dec;
+  ASSERT_TRUE(ReadICC(&reader, &dec));
+  ASSERT_TRUE(reader.Close());
+  // Sizes must agree before the byte-wise comparison is meaningful.
+  EXPECT_EQ(sizeof(kTestProfile), dec.size());
+  if (sizeof(kTestProfile) != dec.size()) return;
+  for (size_t pos = 0; pos < dec.size(); ++pos) {
+    EXPECT_EQ(kTestProfile[pos], dec[pos]);
+    if (kTestProfile[pos] != dec[pos]) break;  // One output is enough
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image.cc b/third_party/jpeg-xl/lib/jxl/image.cc
new file mode 100644
index 0000000000..3faff6aefb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image.cc
@@ -0,0 +1,251 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image.h"
+
+#include <algorithm>  // swap
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/image.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+
+namespace HWY_NAMESPACE {
+size_t GetVectorSize() { return HWY_LANES(uint8_t); }  // Used by VectorSize().
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(GetVectorSize);  // Local function.
+
+// Returns distance [bytes] between the start of two consecutive rows, a
+// multiple of vector/cache line size but NOT CacheAligned::kAlias - see below.
+size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) {
+  const size_t vec_size = VectorSize();
+  size_t valid_bytes = xsize * sizeof_t;  // xsize pixels of sizeof_t bytes.
+
+  // Allow unaligned accesses starting at the last valid value - this may raise
+  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
+  // Skip for the scalar case because no extra lanes will be loaded.
+  if (vec_size != 0) {
+    valid_bytes += vec_size - sizeof_t;  // One vector from the last pixel.
+  }
+
+  // Round up to vector and cache line size.
+  const size_t align = std::max(vec_size, CacheAligned::kAlignment);
+  size_t bytes_per_row = RoundUpTo(valid_bytes, align);
+
+  // During the lengthy window before writes are committed to memory, CPUs
+  // guard against read after write hazards by checking the address, but
+  // only the lower 11 bits. We avoid a false dependency between writes to
+  // consecutive rows by ensuring their sizes are not multiples of 2 KiB.
+  // Avoid2K prevents the same problem for the planes of an Image3.
+  if (bytes_per_row % CacheAligned::kAlias == 0) {
+    bytes_per_row += align;  // Stays a multiple of align (asserted below).
+  }
+
+  JXL_ASSERT(bytes_per_row % align == 0);
+  return bytes_per_row;
+}
+
+}  // namespace
+
+size_t VectorSize() {
+  static const size_t cached_bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+  return cached_bytes;  // Dispatched on the first call only, then reused.
+}
+
+PlaneBase::PlaneBase(const size_t xsize, const size_t ysize,
+                     const size_t sizeof_t)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      orig_xsize_(static_cast<uint32_t>(xsize)),
+      orig_ysize_(static_cast<uint32_t>(ysize)) {
+  // (Can't profile CacheAligned itself because it is used by profiler.h)
+  PROFILER_FUNC;
+
+  JXL_CHECK(xsize == xsize_);  // Fails if the cast to uint32_t truncated.
+  JXL_CHECK(ysize == ysize_);  // Same check for the height.
+
+  JXL_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
+
+  bytes_per_row_ = 0;
+  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
+  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
+  if (xsize != 0 && ysize != 0) {
+    bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
+    bytes_ = AllocateArray(bytes_per_row_ * ysize);
+    JXL_CHECK(bytes_.get());  // Aborts if the allocation returned null.
+    InitializePadding(sizeof_t, Padding::kRoundUp);
+  }
+}
+
+void PlaneBase::InitializePadding(const size_t sizeof_t, Padding padding) {
+#if defined(MEMORY_SANITIZER) || HWY_IDE  // Empty body in all other builds.
+  if (xsize_ == 0 || ysize_ == 0) return;  // No rows or columns to pad.
+
+  const size_t vec_size = VectorSize();
+  if (vec_size == 0) return;  // Scalar mode: no padding needed
+
+  const size_t valid_size = xsize_ * sizeof_t;  // Bytes of valid pixels.
+  const size_t initialize_size = padding == Padding::kRoundUp
+                                     ? RoundUpTo(valid_size, vec_size)
+                                     : valid_size + vec_size - sizeof_t;
+  if (valid_size == initialize_size) return;  // Row already ends on a vector.
+
+  for (size_t y = 0; y < ysize_; ++y) {
+    uint8_t* JXL_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
+#if defined(__clang__) &&                                           \
+    ((!defined(__apple_build_version__) && __clang_major__ <= 6) || \
+     (defined(__apple_build_version__) &&                           \
+      __apple_build_version__ <= 10001145))
+    // There's a bug in msan in clang-6 when handling AVX2 operations. This
+    // workaround fills the whole row (valid pixels included), which is slower
+    // and prevents msan warnings from uninitialized images.
+    std::fill_n(row, initialize_size, msan::kSanitizerSentinelByte);
+#else
+    memset(row + valid_size, msan::kSanitizerSentinelByte,
+           initialize_size - valid_size);
+#endif  // clang6
+  }
+#endif  // MEMORY_SANITIZER
+}
+
+void PlaneBase::Swap(PlaneBase& other) {
+  std::swap(bytes_, other.bytes_);  // Storage handle first, then geometry.
+  std::swap(bytes_per_row_, other.bytes_per_row_);
+  std::swap(orig_ysize_, other.orig_ysize_);
+  std::swap(orig_xsize_, other.orig_xsize_);
+  std::swap(ysize_, other.ysize_);
+  std::swap(xsize_, other.xsize_);
+}
+
+Image3F PadImageMirror(const Image3F& in, const size_t xborder,
+                       const size_t yborder) {
+  size_t xsize = in.xsize();
+  size_t ysize = in.ysize();
+  Image3F out(xsize + 2 * xborder, ysize + 2 * yborder);
+  if (xborder > xsize || yborder > ysize) {  // Border exceeds the image size.
+    for (size_t c = 0; c < 3; c++) {
+      for (int32_t y = 0; y < static_cast<int32_t>(out.ysize()); y++) {
+        float* row_out = out.PlaneRow(c, y);
+        const float* row_in = in.PlaneRow(
+            c, Mirror(y - static_cast<int32_t>(yborder), in.ysize()));
+        for (int32_t x = 0; x < static_cast<int32_t>(out.xsize()); x++) {
+          int32_t xin = Mirror(x - static_cast<int32_t>(xborder), in.xsize());
+          row_out[x] = row_in[xin];
+        }
+      }
+    }
+    return out;
+  }
+  CopyImageTo(in, Rect(xborder, yborder, xsize, ysize), &out);  // Interior.
+  for (size_t c = 0; c < 3; c++) {
+    // Horizontal pad: reflect the first/last xborder pixels of each input row.
+    for (size_t y = 0; y < ysize; y++) {
+      for (size_t x = 0; x < xborder; x++) {
+        out.PlaneRow(c, y + yborder)[x] =
+            in.ConstPlaneRow(c, y)[xborder - x - 1];
+        out.PlaneRow(c, y + yborder)[x + xsize + xborder] =
+            in.ConstPlaneRow(c, y)[xsize - 1 - x];
+      }
+    }
+    // Vertical pad: reflect the already-padded rows of `out` above and below.
+    for (size_t y = 0; y < yborder; y++) {
+      memcpy(out.PlaneRow(c, y), out.ConstPlaneRow(c, 2 * yborder - 1 - y),
+             out.xsize() * sizeof(float));
+      memcpy(out.PlaneRow(c, y + ysize + yborder),
+             out.ConstPlaneRow(c, ysize + yborder - 1 - y),
+             out.xsize() * sizeof(float));
+    }
+  }
+  return out;
+}
+
+void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in,
+                                    size_t block_dim) {
+  PROFILER_FUNC;
+  const size_t xsize_orig = in->xsize();
+  const size_t ysize_orig = in->ysize();
+  const size_t xsize = RoundUpTo(xsize_orig, block_dim);
+  const size_t ysize = RoundUpTo(ysize_orig, block_dim);
+  // Grows to the block multiple; must fit the originally-allocated size.
+  in->ShrinkTo(xsize, ysize);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize_orig; y++) {
+      float* JXL_RESTRICT row = in->PlaneRow(c, y);
+      for (size_t x = xsize_orig; x < xsize; x++) {
+        row[x] = row[xsize_orig - 1];  // Replicate the last valid pixel.
+      }
+    }
+    const float* JXL_RESTRICT row_src = in->ConstPlaneRow(c, ysize_orig - 1);
+    for (size_t y = ysize_orig; y < ysize; y++) {
+      memcpy(in->PlaneRow(c, y), row_src, xsize * sizeof(float));  // Last row.
+    }
+  }
+}
+
+static void DownsampleImage(const ImageF& input, size_t factor,
+                            ImageF* output) {
+  JXL_ASSERT(factor != 1);
+  output->ShrinkTo(DivCeil(input.xsize(), factor),
+                   DivCeil(input.ysize(), factor));
+  size_t in_stride = input.PixelsPerRow();  // In pixels, including padding.
+  for (size_t y = 0; y < output->ysize(); y++) {
+    float* row_out = output->Row(y);
+    const float* row_in = input.Row(factor * y);  // Top row of the block.
+    for (size_t x = 0; x < output->xsize(); x++) {
+      size_t cnt = 0;
+      float sum = 0;
+      for (size_t iy = 0; iy < factor && iy + factor * y < input.ysize();
+           iy++) {
+        for (size_t ix = 0; ix < factor && ix + factor * x < input.xsize();
+             ix++) {
+          sum += row_in[iy * in_stride + x * factor + ix];
+          cnt++;
+        }
+      }
+      row_out[x] = sum / cnt;  // Mean of the in-bounds samples of the block.
+    }
+  }
+}
+
+void DownsampleImage(ImageF* image, size_t factor) {
+  // Allocate extra space to avoid a reallocation when padding.
+  const size_t padded_xsize = DivCeil(image->xsize(), factor) + kBlockDim;
+  const size_t padded_ysize = DivCeil(image->ysize(), factor) + kBlockDim;
+  ImageF reduced(padded_xsize, padded_ysize);
+  DownsampleImage(*image, factor, &reduced);
+  *image = std::move(reduced);
+}
+
+void DownsampleImage(Image3F* opsin, size_t factor) {
+  JXL_ASSERT(factor != 1);
+  // Allocate extra space to avoid a reallocation when padding.
+  Image3F downsampled(DivCeil(opsin->xsize(), factor) + kBlockDim,
+                      DivCeil(opsin->ysize(), factor) + kBlockDim);
+  downsampled.ShrinkTo(downsampled.xsize() - kBlockDim,
+                       downsampled.ysize() - kBlockDim);  // Drop the slack.
+  for (size_t c = 0; c < 3; c++) {
+    DownsampleImage(opsin->Plane(c), factor, &downsampled.Plane(c));
+  }
+  *opsin = std::move(downsampled);  // Replaces the caller's image.
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/image.h b/third_party/jpeg-xl/lib/jxl/image.h
new file mode 100644
index 0000000000..e66534220c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image.h
@@ -0,0 +1,497 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_H_
+#define LIB_JXL_IMAGE_H_
+
+// SIMD/multicore-friendly planar image representation with row accessors.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <sstream>
+#include <utility>  // std::move
+
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+// Helper function to create rows that are multiples of SIMD vector size.
+size_t VectorSize();
+
+// Type-independent parts of Plane<> - reduces code duplication and facilitates
+// moving member function implementations to cc file.
+struct PlaneBase {
+  PlaneBase()
+      : xsize_(0),
+        ysize_(0),
+        orig_xsize_(0),
+        orig_ysize_(0),
+        bytes_per_row_(0),
+        bytes_(nullptr) {}
+  PlaneBase(size_t xsize, size_t ysize, size_t sizeof_t);  // sizeof_t: 1/2/4/8
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo() instead.
+  PlaneBase(const PlaneBase& other) = delete;
+  PlaneBase& operator=(const PlaneBase& other) = delete;
+
+  // Move constructor (required for returning Image from function)
+  PlaneBase(PlaneBase&& other) noexcept = default;
+
+  // Move assignment (required for std::vector)
+  PlaneBase& operator=(PlaneBase&& other) noexcept = default;
+
+  void Swap(PlaneBase& other);  // Exchanges all members with `other`.
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. May also be used to
+  // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <=
+  // the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    JXL_CHECK(xsize <= orig_xsize_);
+    JXL_CHECK(ysize <= orig_ysize_);
+    xsize_ = static_cast<uint32_t>(xsize);
+    ysize_ = static_cast<uint32_t>(ysize);
+    // NOTE: we can't recompute bytes_per_row for more compact storage and
+    // better locality because that would invalidate the image contents.
+  }
+
+  // How many pixels.
+  JXL_INLINE size_t xsize() const { return xsize_; }
+  JXL_INLINE size_t ysize() const { return ysize_; }
+
+  // NOTE: do not use this for copying rows - the valid xsize may be much less.
+  JXL_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
+
+  // Raw access to byte contents, for interfacing with other libraries.
+  // Unsigned char instead of char to avoid surprises (sign extension).
+  JXL_INLINE uint8_t* bytes() {
+    void* p = bytes_.get();
+    return static_cast<uint8_t * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(p, 64));
+  }
+  JXL_INLINE const uint8_t* bytes() const {
+    const void* p = bytes_.get();
+    return static_cast<const uint8_t * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(p, 64));
+  }
+
+ protected:
+  // Returns pointer to the start of a row.
+  JXL_INLINE void* VoidRow(const size_t y) const {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+    if (y >= ysize_) {
+      JXL_ABORT("Row(%" PRIu64 ") in (%u x %u) image\n", (uint64_t)y, xsize_,
+                ysize_);
+    }
+#endif
+
+    void* row = bytes_.get() + y * bytes_per_row_;  // y row strides from start.
+    return JXL_ASSUME_ALIGNED(row, 64);
+  }
+
+  enum class Padding {
+    // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
+    kRoundUp,
+    // Allow LoadU(d, row + x) for x = xsize() - 1. This requires an extra
+    // vector to be initialized. If done by default, this would suppress
+    // legitimate msan warnings. We therefore require users to explicitly call
+    // InitializePadding before using unaligned loads (e.g. convolution).
+    kUnaligned
+  };
+
+  // Initializes the minimum bytes required to suppress msan warnings from
+  // legitimate (according to Padding mode) vector loads/stores on the right
+  // border, where some lanes are uninitialized and assumed to be unused.
+  void InitializePadding(size_t sizeof_t, Padding padding);
+
+  // (Members are non-const to enable assignment during move-assignment.)
+  uint32_t xsize_;  // In valid pixels, not including any padding.
+  uint32_t ysize_;
+  uint32_t orig_xsize_;  // Width at construction; upper bound for ShrinkTo().
+  uint32_t orig_ysize_;  // Height at construction; upper bound for ShrinkTo().
+  size_t bytes_per_row_;  // Includes padding.
+  CacheAlignedUniquePtr bytes_;  // nullptr when default-constructed.
+};
+
+// Single channel, aligned rows separated by padding. T must be POD.
+//
+// 'Single channel' (one 2D array per channel) simplifies vectorization
+// (repeating the same operation on multiple adjacent components) without the
+// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
+// can easily iterate over all components in a row and Image requires no
+// knowledge of the pixel format beyond the component type "T".
+//
+// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
+// false sharing between two threads operating on adjacent rows.
+//
+// 'Padding' is still relevant because vectors could potentially be larger than
+// a cache line. By rounding up row sizes to the vector size, we allow
+// reading/writing ALIGNED vectors whose first lane is a valid sample. This
+// avoids needing a separate loop to handle remaining unaligned lanes.
+//
+// This image layout could also be achieved with a vector and a row accessor
+// function, but a class wrapper with support for "deleter" allows wrapping
+// existing memory allocated by clients without copying the pixels. It also
+// provides convenient accessors for xsize/ysize, which shortens function
+// argument lists. Supports move-construction so it can be stored in containers.
+template <typename ComponentType>
+class Plane : public PlaneBase {
+ public:
+  using T = ComponentType;
+  static constexpr size_t kNumPlanes = 1;  // A Plane is a single channel.
+
+  Plane() = default;
+  Plane(const size_t xsize, const size_t ysize)
+      : PlaneBase(xsize, ysize, sizeof(T)) {}
+  // Call before unaligned loads at the row end; uses Padding::kUnaligned.
+  void InitializePaddingForUnalignedAccesses() {
+    InitializePadding(sizeof(T), Padding::kUnaligned);
+  }
+  // Returns pointer to the start of row y.
+  JXL_INLINE T* Row(const size_t y) { return static_cast<T*>(VoidRow(y)); }
+
+  // Returns pointer to const (see above).
+  JXL_INLINE const T* Row(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Documents that the access is const.
+  JXL_INLINE const T* ConstRow(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must
+  // NOT be used to determine xsize.
+  JXL_INLINE intptr_t PixelsPerRow() const {
+    return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
+  }
+};
+
+using ImageSB = Plane<int8_t>;
+using ImageB = Plane<uint8_t>;
+using ImageS = Plane<int16_t>;  // signed integer or half-float
+using ImageU = Plane<uint16_t>;
+using ImageI = Plane<int32_t>;
+using ImageF = Plane<float>;
+using ImageD = Plane<double>;
+
+// True iff xsize() and ysize() match. Works for Image3 and mixed types too.
+template <class Image1, class Image2>
+bool SameSize(const Image1& image1, const Image2& image2) {
+  return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
+}
+
+template <typename T>
+class Image3;
+
+// Rectangular region in image(s). Factoring this out of Image instead of
+// shifting the pointer by x0/y0 allows this to apply to multiple images with
+// different resolutions (e.g. color transform and quantization field).
+// Can compare using SameSize(rect1, rect2).
+template <typename T>
+class RectT {
+ public:
+  // Most windows are xsize_max * ysize_max, except those on the borders where
+  // begin + size_max > end.
+  constexpr RectT(T xbegin, T ybegin, size_t xsize_max, size_t ysize_max,
+                  T xend, T yend)
+      : x0_(xbegin),
+        y0_(ybegin),
+        xsize_(ClampedSize(xbegin, xsize_max, xend)),
+        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
+
+  // Construct with origin and known size (typically from another Rect).
+  constexpr RectT(T xbegin, T ybegin, size_t xsize, size_t ysize)
+      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
+
+  // Construct a rect that covers a whole image/plane/ImageBundle etc.
+  template <typename ImageT>
+  explicit RectT(const ImageT& image)
+      : RectT(0, 0, image.xsize(), image.ysize()) {}
+
+  RectT() : RectT(0, 0, 0, 0) {}
+
+  RectT(const RectT&) = default;
+  RectT& operator=(const RectT&) = default;
+
+  // Construct a subrect that resides in an image/plane/ImageBundle etc.
+  template <typename ImageT>
+  RectT Crop(const ImageT& image) const {
+    return Intersection(RectT(image));
+  }
+
+  // Construct a subrect that resides in the [0, ysize) x [0, xsize) region of
+  // the current rect.
+  RectT Crop(size_t area_xsize, size_t area_ysize) const {
+    return Intersection(RectT(0, 0, area_xsize, area_ysize));
+  }
+
+  // Returns a rect that only contains `num` lines with offset `y` from `y0()`.
+  RectT Lines(size_t y, size_t num) const {
+    JXL_DASSERT(y + num <= ysize_);
+    return RectT(x0_, y0_ + y, xsize_, num);
+  }
+
+  RectT Line(size_t y) const { return Lines(y, 1); }
+
+  JXL_MUST_USE_RESULT RectT Intersection(const RectT& other) const {
+    return RectT(std::max(x0_, other.x0_), std::max(y0_, other.y0_), xsize_,
+                 ysize_, std::min(x1(), other.x1()),
+                 std::min(y1(), other.y1()));
+  }
+
+  JXL_MUST_USE_RESULT RectT Translate(int64_t x_offset,
+                                      int64_t y_offset) const {
+    return RectT(x0_ + x_offset, y0_ + y_offset, xsize_, ysize_);
+  }
+
+  template <typename V>
+  V* Row(Plane<V>* image, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image->Row(y + y0_) + x0_;
+  }
+
+  template <typename V>
+  const V* Row(const Plane<V>* image, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image->Row(y + y0_) + x0_;
+  }
+
+  template <typename V>
+  V* PlaneRow(Image3<V>* image, const size_t c, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image->PlaneRow(c, y + y0_) + x0_;
+  }
+
+  template <typename V>
+  const V* ConstRow(const Plane<V>& image, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image.ConstRow(y + y0_) + x0_;
+  }
+
+  template <typename V>
+  const V* ConstPlaneRow(const Image3<V>& image, size_t c, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image.ConstPlaneRow(c, y + y0_) + x0_;
+  }
+
+  bool IsInside(const RectT& other) const {
+    return x0_ >= other.x0() && x1() <= other.x1() && y0_ >= other.y0() &&
+           y1() <= other.y1();
+  }
+
+  // Returns true if this Rect fully resides in the given image. ImageT could be
+  // Plane<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
+  template <class ImageT>
+  bool IsInside(const ImageT& image) const {
+    return IsInside(RectT(image));
+  }
+
+  T x0() const { return x0_; }
+  T y0() const { return y0_; }
+  size_t xsize() const { return xsize_; }
+  size_t ysize() const { return ysize_; }
+  T x1() const { return x0_ + xsize_; }
+  T y1() const { return y0_ + ysize_; }
+
+  RectT<T> ShiftLeft(size_t shiftx, size_t shifty) const {  // scale by 2^shift
+    return RectT<T>(x0_ * (1 << shiftx), y0_ * (1 << shifty), xsize_ << shiftx,
+                    ysize_ << shifty);
+  }
+  RectT<T> ShiftLeft(size_t shift) const { return ShiftLeft(shift, shift); }
+
+  // Requires x0(), y0() to be multiples of 1<<shiftx, 1<<shifty (asserted).
+  RectT<T> CeilShiftRight(size_t shiftx, size_t shifty) const {
+    JXL_ASSERT(x0_ % (1 << shiftx) == 0);
+    JXL_ASSERT(y0_ % (1 << shifty) == 0);
+    return RectT<T>(x0_ / (1 << shiftx), y0_ / (1 << shifty),
+                    DivCeil(xsize_, T{1} << shiftx),
+                    DivCeil(ysize_, T{1} << shifty));
+  }
+  RectT<T> CeilShiftRight(std::pair<size_t, size_t> shift) const {
+    return CeilShiftRight(shift.first, shift.second);  // (shiftx, shifty)
+  }
+  RectT<T> CeilShiftRight(size_t shift) const {
+    return CeilShiftRight(shift, shift);  // same shift on both axes
+  }
+
+  template <typename U>
+  RectT<U> As() const {  // same rect with every field converted via U(...)
+    return RectT<U>(U(x0_), U(y0_), U(xsize_), U(ysize_));
+  }
+
+ private:
+  // Returns size_max, or whatever is left in [begin, end) if that is smaller.
+  static constexpr size_t ClampedSize(T begin, size_t size_max, T end) {
+    return (static_cast<T>(begin + size_max) <= end)
+               ? size_max                          // fits before end
+               : (end > begin ? end - begin : 0);  // remainder, 0 if none
+  }
+
+  T x0_;
+  T y0_;
+
+  size_t xsize_;
+  size_t ysize_;
+};
+
+template <typename T>
+std::string Description(RectT<T> r) {  // yields "[x0..x1)x[y0..y1)"
+  std::ostringstream os;
+  os << "[" << r.x0() << ".." << r.x1() << ")x"
+     << "[" << r.y0() << ".." << r.y1() << ")";
+  return os.str();
+}
+
+using Rect = RectT<size_t>;
+
+// Currently, we abuse Image to either refer to an image that owns its storage
+// or one that doesn't. In a similar vein, we abuse Image* function parameters
+// to either mean "assign to me" or "fill the provided image with data".
+// Hopefully, the "assign to me" meaning will go away and most images in the
+// codebase will not be backed by own storage. When this happens we can redesign
+// Image to be a non-storage-holding view class and introduce BackedImage in
+// those places that actually need it.
+
+// NOTE: we can't use Image as a view because invariants are violated
+// (alignment and the presence of padding before/after each "row").
+
+// A bundle of 3 same-sized images. Typically constructed by moving from three
+// rvalue references to Image. To overwrite an existing Image3 using
+// single-channel producers, we also need access to Image*. Constructing
+// temporary non-owning Image pointing to one plane of an existing Image3 risks
+// dangling references, especially if the wrapper is moved. Therefore, we
+// store an array of Image (which are compact enough that size is not a concern)
+// and provide Plane+Row accessors.
+template <typename ComponentType>
+class Image3 {
+ public:
+  using T = ComponentType;
+  using PlaneT = jxl::Plane<T>;
+  static constexpr size_t kNumPlanes = 3;
+
+  Image3() : planes_{PlaneT(), PlaneT(), PlaneT()} {}  // default-constructed
+
+  Image3(const size_t xsize, const size_t ysize)
+      : planes_{PlaneT(xsize, ysize), PlaneT(xsize, ysize),
+                PlaneT(xsize, ysize)} {}
+
+  Image3(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+  }
+
+  Image3(PlaneT&& plane0, PlaneT&& plane1, PlaneT&& plane2) {
+    JXL_CHECK(SameSize(plane0, plane1));  // all three planes must match
+    JXL_CHECK(SameSize(plane0, plane2));
+    planes_[0] = std::move(plane0);
+    planes_[1] = std::move(plane1);
+    planes_[2] = std::move(plane2);
+  }
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo instead.
+  Image3(const Image3& other) = delete;
+  Image3& operator=(const Image3& other) = delete;
+
+  Image3& operator=(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+    return *this;
+  }
+
+  // Returns row pointer; usage: PlaneRow(idx_plane, y)[x] = val.
+  JXL_INLINE T* PlaneRow(const size_t c, const size_t y) {
+    // Custom implementation instead of calling planes_[c].Row ensures only a
+    // single multiplication is needed for PlaneRow(0..2, y).
+    PlaneRowBoundsCheck(c, y);
+    const size_t row_offset = y * planes_[0].bytes_per_row();  // plane 0 stride
+    void* row = planes_[c].bytes() + row_offset;
+    return static_cast<T * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(row, 64));
+  }
+
+  // Returns const row pointer; usage: val = PlaneRow(idx_plane, y)[x].
+  JXL_INLINE const T* PlaneRow(const size_t c, const size_t y) const {
+    PlaneRowBoundsCheck(c, y);
+    const size_t row_offset = y * planes_[0].bytes_per_row();  // plane 0 stride
+    const void* row = planes_[c].bytes() + row_offset;
+    return static_cast<const T * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(row, 64));
+  }
+
+  // Returns const row pointer, even if called from a non-const Image3.
+  JXL_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
+    PlaneRowBoundsCheck(c, y);  // PlaneRow() below checks again
+    return PlaneRow(c, y);
+  }
+
+  JXL_INLINE const PlaneT& Plane(size_t idx) const { return planes_[idx]; }
+
+  JXL_INLINE PlaneT& Plane(size_t idx) { return planes_[idx]; }
+
+  void Swap(Image3& other) {
+    for (size_t c = 0; c < 3; ++c) {  // 3 == kNumPlanes
+      other.planes_[c].Swap(planes_[c]);
+    }
+  }
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. May also be used to
+  // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <=
+  // the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    for (PlaneT& plane : planes_) {
+      plane.ShrinkTo(xsize, ysize);
+    }
+  }
+
+  // Sizes of all three images are guaranteed to be equal.
+  JXL_INLINE size_t xsize() const { return planes_[0].xsize(); }
+  JXL_INLINE size_t ysize() const { return planes_[0].ysize(); }
+  // Returns offset [bytes] from one row to the next row of the same plane.
+  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
+  // the valid xsize may be much less.
+  JXL_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must NOT be used
+  // to determine xsize.
+  JXL_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
+
+ private:
+  void PlaneRowBoundsCheck(const size_t c, const size_t y) const {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+    if (c >= kNumPlanes || y >= ysize()) {
+      JXL_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") in (%" PRIu64 " x %" PRIu64
+                ") image\n",
+                static_cast<uint64_t>(c), static_cast<uint64_t>(y),
+                static_cast<uint64_t>(xsize()), static_cast<uint64_t>(ysize()));
+    }
+#endif  // ADDRESS_SANITIZER || MEMORY_SANITIZER || THREAD_SANITIZER
+  }
+
+ private:
+  PlaneT planes_[kNumPlanes];
+};
+
+using Image3B = Image3<uint8_t>;
+using Image3S = Image3<int16_t>;
+using Image3U = Image3<uint16_t>;
+using Image3I = Image3<int32_t>;
+using Image3F = Image3<float>;
+using Image3D = Image3<double>;
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle.cc b/third_party/jpeg-xl/lib/jxl/image_bundle.cc
new file mode 100644
index 0000000000..7e7051b608
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle.cc
@@ -0,0 +1,125 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_bundle.h"
+
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+void ImageBundle::ShrinkTo(size_t xsize, size_t ysize) {
+  if (HasColor()) color_.ShrinkTo(xsize, ysize);  // skip empty color image
+  for (ImageF& ec : extra_channels_) {
+    ec.ShrinkTo(xsize, ysize);
+  }
+}
+
+// Called by all other SetFrom*. Moves from `color`, which must be non-empty.
+void ImageBundle::SetFromImage(Image3F&& color,
+                               const ColorEncoding& c_current) {
+  JXL_CHECK(color.xsize() != 0 && color.ysize() != 0);
+  JXL_CHECK(metadata_->color_encoding.IsGray() == c_current.IsGray());
+  color_ = std::move(color);
+  c_current_ = c_current;
+  VerifySizes();
+}
+
+void ImageBundle::VerifyMetadata() const {
+  JXL_CHECK(!c_current_.ICC().empty());  // current encoding must have an ICC
+  JXL_CHECK(metadata_->color_encoding.IsGray() == IsGray());
+
+  if (metadata_->HasAlpha() && alpha().xsize() == 0) {
+    JXL_ABORT("MD alpha_bits %u IB alpha %" PRIuS " x %" PRIuS "\n",
+              metadata_->GetAlphaBits(), alpha().xsize(), alpha().ysize());
+  }
+  const uint32_t alpha_bits = metadata_->GetAlphaBits();
+  JXL_CHECK(alpha_bits <= 32);
+
+  // metadata_->num_extra_channels may temporarily differ from
+  // extra_channels_.size(), e.g. after SetAlpha. They are synced by the next
+  // call to VisitFields.
+}
+
+void ImageBundle::VerifySizes() const {
+  const size_t xs = xsize();
+  const size_t ys = ysize();
+
+  if (HasExtraChannels()) {
+    JXL_CHECK(xs != 0 && ys != 0);
+    for (const ImageF& ec : extra_channels_) {
+      JXL_CHECK(ec.xsize() == xs);  // every extra channel matches xsize()
+      JXL_CHECK(ec.ysize() == ys);  // ... and ysize()
+    }
+  }
+}
+
+size_t ImageBundle::DetectRealBitdepth() const {
+  return metadata_->bit_depth.bits_per_sample;  // nominal depth for now
+
+  // TODO(lode): let this function return lower bit depth if possible, e.g.
+  // return 8 bits in case the original image came from a 16-bit PNG that
+  // was in fact representable as 8-bit PNG. Ensure that the implementation
+  // returns 16 if e.g. two consecutive 16-bit values appeared in the original
+  // image (such as 32768 and 32769), take into account that e.g. the values
+  // 3-bit can represent are not a superset of the values 2-bit can represent,
+  // and there may be slight imprecisions in the floating point image.
+}
+
+const ImageF& ImageBundle::black() const {
+  JXL_ASSERT(HasBlack());
+  const size_t ec = metadata_->Find(ExtraChannel::kBlack) -
+                    metadata_->extra_channel_info.data();  // channel index
+  JXL_ASSERT(ec < extra_channels_.size());
+  return extra_channels_[ec];
+}
+const ImageF& ImageBundle::alpha() const {
+  JXL_ASSERT(HasAlpha());
+  const size_t ec = metadata_->Find(ExtraChannel::kAlpha) -
+                    metadata_->extra_channel_info.data();  // channel index
+  JXL_ASSERT(ec < extra_channels_.size());
+  return extra_channels_[ec];
+}
+ImageF* ImageBundle::alpha() {
+  JXL_ASSERT(HasAlpha());
+  const size_t ec = metadata_->Find(ExtraChannel::kAlpha) -
+                    metadata_->extra_channel_info.data();  // channel index
+  JXL_ASSERT(ec < extra_channels_.size());
+  return &extra_channels_[ec];
+}
+
+void ImageBundle::SetAlpha(ImageF&& alpha) {
+  const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha);
+  // Must call SetAlphaBits first, otherwise the alpha channel index is unknown.
+  JXL_CHECK(eci != nullptr);
+  JXL_CHECK(alpha.xsize() != 0 && alpha.ysize() != 0);
+  if (extra_channels_.size() < metadata_->extra_channel_info.size()) {
+    // TODO(jon): get rid of this case (fewer images than metadata entries)
+    extra_channels_.insert(
+        extra_channels_.begin() + (eci - metadata_->extra_channel_info.data()),
+        std::move(alpha));
+  } else {  // slot already exists: overwrite it
+    extra_channels_[eci - metadata_->extra_channel_info.data()] =
+        std::move(alpha);
+  }
+  // num_extra_channels is automatically set in visitor
+  VerifySizes();
+}
+
+void ImageBundle::SetExtraChannels(std::vector<ImageF>&& extra_channels) {
+  for (const ImageF& plane : extra_channels) {  // reject empty channels
+    JXL_CHECK(plane.xsize() != 0 && plane.ysize() != 0);
+  }
+  extra_channels_ = std::move(extra_channels);
+  VerifySizes();
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle.h b/third_party/jpeg-xl/lib/jxl/image_bundle.h
new file mode 100644
index 0000000000..c7b812b59a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle.h
@@ -0,0 +1,254 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_BUNDLE_H_
+#define LIB_JXL_IMAGE_BUNDLE_H_
+
+// The main image or frame consists of a bundle of associated images.
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+// A bundle of color/alpha/depth/plane images.
+class ImageBundle {
+ public:
+  // Uninitialized state for use as output parameter.
+  ImageBundle() : metadata_(nullptr) {}
+  // Caller is responsible for setting metadata before calling Set*.
+  explicit ImageBundle(const ImageMetadata* metadata) : metadata_(metadata) {}
+
+  // Move-only (allows storing in std::vector).
+  ImageBundle(ImageBundle&&) = default;
+  ImageBundle& operator=(ImageBundle&&) = default;
+
+  ImageBundle Copy() const {  // NOTE(review): origin, name etc. are not copied
+    ImageBundle copy(metadata_);
+    copy.color_ = CopyImage(color_);
+    copy.c_current_ = c_current_;
+    copy.extra_channels_.reserve(extra_channels_.size());
+    for (const ImageF& plane : extra_channels_) {
+      copy.extra_channels_.emplace_back(CopyImage(plane));
+    }
+
+    copy.jpeg_data =
+        jpeg_data ? make_unique<jpeg::JPEGData>(*jpeg_data) : nullptr;
+    copy.color_transform = color_transform;
+    copy.chroma_subsampling = chroma_subsampling;
+
+    return copy;
+  }
+
+  // -- SIZE
+
+  size_t xsize() const {
+    if (IsJPEG()) return jpeg_data->width;
+    if (color_.xsize() != 0) return color_.xsize();
+    return extra_channels_.empty() ? 0 : extra_channels_[0].xsize();
+  }
+  size_t ysize() const {
+    if (IsJPEG()) return jpeg_data->height;
+    if (color_.ysize() != 0) return color_.ysize();
+    return extra_channels_.empty() ? 0 : extra_channels_[0].ysize();
+  }
+  void ShrinkTo(size_t xsize, size_t ysize);
+
+  // Sizes taking orientation into account; values > 4 swap xsize and ysize.
+  size_t oriented_xsize() const {
+    if (static_cast<uint32_t>(metadata_->GetOrientation()) > 4) {
+      return ysize();
+    } else {
+      return xsize();
+    }
+  }
+  size_t oriented_ysize() const {
+    if (static_cast<uint32_t>(metadata_->GetOrientation()) > 4) {
+      return xsize();
+    } else {
+      return ysize();
+    }
+  }
+
+  // -- COLOR
+
+  // Whether color() is valid/usable, i.e. color_ has a nonzero xsize. Even
+  // images with spot colors (one example of extra channels being present)
+  // typically have a part that can be converted to RGB.
+  bool HasColor() const { return color_.xsize() != 0; }
+
+  // For resetting the size when switching from a reference to main frame.
+  void RemoveColor() { color_ = Image3F(); }
+
+  // Do not use if !HasColor().
+  const Image3F& color() const {
+    // If this fails, Set* was not called - perhaps because decoding failed?
+    JXL_DASSERT(HasColor());
+    return color_;
+  }
+
+  // Do not use if !HasColor().
+  Image3F* color() {
+    JXL_DASSERT(HasColor());
+    return &color_;
+  }
+
+  // If c_current.IsGray(), all planes must be identical. NOTE: c_current is
+  // independent of metadata()->color_encoding, which is the original, whereas
+  // a decoder might return pixels in a different c_current.
+  // This only sets the color channels, you must also make extra channels
+  // match the amount that is in the metadata.
+  void SetFromImage(Image3F&& color, const ColorEncoding& c_current);
+
+  // -- COLOR ENCODING
+
+  const ColorEncoding& c_current() const { return c_current_; }
+
+  // Returns whether the color image has identical planes. Once established by
+  // Set*, remains unchanged until a subsequent Set* or TransformTo.
+  bool IsGray() const { return c_current_.IsGray(); }
+
+  bool IsSRGB() const { return c_current_.IsSRGB(); }
+  bool IsLinearSRGB() const {
+    return c_current_.white_point == WhitePoint::kD65 &&
+           c_current_.primaries == Primaries::kSRGB && c_current_.tf.IsLinear();
+  }
+
+  // Set the c_current profile without doing any transformation, e.g. if the
+  // transformation was already applied.
+  void OverrideProfile(const ColorEncoding& new_c_current) {
+    c_current_ = new_c_current;
+  }
+
+  // TODO(lode): TransformTo and CopyTo are implemented in enc_image_bundle.cc,
+  // move these functions out of this header file and class, to
+  // enc_image_bundle.h.
+
+  // Transforms color to c_desired and sets c_current to c_desired. Alpha and
+  // metadata remain unchanged.
+  Status TransformTo(const ColorEncoding& c_desired, const JxlCmsInterface& cms,
+                     ThreadPool* pool = nullptr);
+  // Copies this:rect, converts to c_desired, and allocates+fills out.
+  Status CopyTo(const Rect& rect, const ColorEncoding& c_desired,
+                const JxlCmsInterface& cms, Image3F* out,
+                ThreadPool* pool = nullptr) const;
+
+  // Detect 'real' bit depth, which can be lower than nominal bit depth
+  // (this is common in PNG), returns 'real' bit depth
+  size_t DetectRealBitdepth() const;
+
+  // -- ALPHA
+
+  void SetAlpha(ImageF&& alpha);
+  bool HasAlpha() const {
+    return metadata_->Find(ExtraChannel::kAlpha) != nullptr;
+  }
+  bool AlphaIsPremultiplied() const {
+    const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha);
+    return (eci == nullptr) ? false : eci->alpha_associated;
+  }
+  const ImageF& alpha() const;
+  ImageF* alpha();
+
+  // -- EXTRA CHANNELS
+  bool HasBlack() const {
+    return metadata_->Find(ExtraChannel::kBlack) != nullptr;
+  }
+  const ImageF& black() const;
+
+  // Extra channels of unknown interpretation (e.g. spot colors).
+  void SetExtraChannels(std::vector<ImageF>&& extra_channels);
+  void ClearExtraChannels() { extra_channels_.clear(); }
+  bool HasExtraChannels() const { return !extra_channels_.empty(); }
+  const std::vector<ImageF>& extra_channels() const { return extra_channels_; }
+  std::vector<ImageF>& extra_channels() { return extra_channels_; }
+
+  const ImageMetadata* metadata() const { return metadata_; }
+
+  void VerifyMetadata() const;
+
+  void SetDecodedBytes(size_t decoded_bytes) { decoded_bytes_ = decoded_bytes; }
+  size_t decoded_bytes() const { return decoded_bytes_; }
+
+  // -- JPEG transcoding:
+
+  // Returns true if image does or will represent quantized DCT-8 coefficients,
+  // stored in 8x8 pixel regions.
+  bool IsJPEG() const {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    return jpeg_data != nullptr;
+#else   // JPEGXL_ENABLE_TRANSCODE_JPEG
+    return false;
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+  }
+
+  std::unique_ptr<jpeg::JPEGData> jpeg_data;
+  // these fields are used to signal the input JPEG color space
+  // NOTE: JPEG doesn't actually provide a way to determine whether YCbCr was
+  // applied or not.
+  ColorTransform color_transform = ColorTransform::kNone;
+  YCbCrChromaSubsampling chroma_subsampling;
+
+  FrameOrigin origin{0, 0};
+
+  // Animation-related information, corresponding to the timecode and duration
+  // fields of the jxl::AnimationFrame of the jxl::FrameHeader.
+  // TODO(lode): ImageBundle is used here to carry the information from
+  // jxl::FrameHeader, consider instead passing a jxl::FrameHeader directly to
+  // EncodeFrame or having a field of that type here.
+  uint32_t duration = 0;
+  uint32_t timecode = 0;
+
+  // TODO(lode): these fields do not match the JXL frame header, it should be
+  // possible to specify up to 4 (3 if nonzero duration) slots to save this
+  // frame as reference (see save_as_reference).
+  bool use_for_next_frame = false;
+  bool blend = false;
+  BlendMode blendmode = BlendMode::kBlend;
+
+  std::string name;
+
+ private:
+  // Called after any Set* to ensure their sizes are compatible.
+  void VerifySizes() const;
+
+  // Required for TransformTo so that an ImageBundle is self-sufficient. Always
+  // points to the same thing, but cannot be const-pointer because that prevents
+  // the compiler from generating a move ctor.
+  const ImageMetadata* metadata_;
+
+  // Initialized by Set*:
+  Image3F color_;  // May be empty (!HasColor()); all planes equal if IsGray().
+  ColorEncoding c_current_;  // of color_
+
+  // Set by SetExtraChannels/SetAlpha; size = ImageMetadata.num_extra_channels
+  std::vector<ImageF> extra_channels_;
+
+  // How many bytes of the input were actually read.
+  size_t decoded_bytes_ = 0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_BUNDLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc b/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc
new file mode 100644
index 0000000000..1a10598fe2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_bundle.h"
+
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(ImageBundleTest, ExtraChannelName) {  // name must survive write + read
+  AuxOut aux_out;
+  BitWriter writer;
+  BitWriter::Allotment allotment(&writer, 99);
+
+  ImageMetadata metadata;
+  ExtraChannelInfo eci;
+  eci.type = ExtraChannel::kBlack;
+  eci.name = "testK";
+  metadata.extra_channel_info.push_back(std::move(eci));
+  ASSERT_TRUE(WriteImageMetadata(metadata, &writer, /*layer=*/0, &aux_out));
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, /*layer=*/0, &aux_out);
+
+  BitReader reader(writer.GetSpan());
+  ImageMetadata metadata_out;
+  ASSERT_TRUE(ReadImageMetadata(&reader, &metadata_out));
+  EXPECT_TRUE(reader.Close());
+  EXPECT_EQ("testK", metadata_out.Find(ExtraChannel::kBlack)->name);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_metadata.cc b/third_party/jpeg-xl/lib/jxl/image_metadata.cc
new file mode 100644
index 0000000000..20b0d6f95a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_metadata.cc
@@ -0,0 +1,472 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_metadata.h"
+
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+BitDepth::BitDepth() { Bundle::Init(this); }
+Status BitDepth::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &floating_point_sample));
+  // The same fields (bits_per_sample and exponent_bits_per_sample) are read
+  // in a different way depending on floating_point_sample's value. It's still
+  // default-initialized correctly so using visitor->Conditional is not
+  // required.
+  if (!floating_point_sample) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(8), Val(10), Val(12), BitsOffset(6, 1), 8, &bits_per_sample));
+    exponent_bits_per_sample = 0;
+  } else {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(32), Val(16), Val(24), BitsOffset(6, 1), 32, &bits_per_sample));
+    // The encoded value is exponent_bits_per_sample - 1. NOTE(review): it is
+    // visited with Bits(4, ...), not 3 bits; the range is validated below.
+    const uint32_t offset = 1;
+    exponent_bits_per_sample -= offset;
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->Bits(4, 8 - offset, &exponent_bits_per_sample));
+    exponent_bits_per_sample += offset;
+  }
+
+  // Error-checking for floating point ranges.
+  if (floating_point_sample) {
+    if (exponent_bits_per_sample < 2 || exponent_bits_per_sample > 8) {
+      return JXL_FAILURE("Invalid exponent_bits_per_sample: %u",
+                         exponent_bits_per_sample);
+    }
+    int mantissa_bits =
+        static_cast<int>(bits_per_sample) - exponent_bits_per_sample - 1;
+    if (mantissa_bits < 2 || mantissa_bits > 23) {
+      return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample);
+    }
+  } else {
+    if (bits_per_sample > 31) {
+      return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample);
+    }
+  }
+  return true;
+}
+
+std::string BitDepth::DebugString() const {  // "U<bits>" or "F<bits>.<exp>"
+  std::ostringstream os;
+  os << (floating_point_sample ? "F" : "U");
+  os << bits_per_sample;
+  if (floating_point_sample) os << "." << exponent_bits_per_sample;
+  return os.str();
+}
+
+CustomTransformData::CustomTransformData() { Bundle::Init(this); }
+Status CustomTransformData::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+  if (visitor->Conditional(nonserialized_xyb_encoded)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&opsin_inverse_matrix));
+  }
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &custom_weights_mask));
+  if (visitor->Conditional((custom_weights_mask & 0x1) != 0)) {
+    // 4 5x5 kernels, but all of them can be obtained by symmetry from one,
+    // which is symmetric along its main diagonal. The top-left kernel is
+    // defined by
+    //
+    // 0  1  2  3  4
+    // 1  5  6  7  8
+    // 2  6  9 10 11
+    // 3  7 10 12 13
+    // 4  8 11 13 14
+    float constexpr kWeights2[15] = {
+        -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f,
+        0.14111091f,  0.28896755f,  0.00278718f,  -0.01610267f, 0.56661550f,
+        0.03777607f,  -0.01986694f, -0.03144731f, -0.01185068f, -0.00213539f};
+    for (size_t i = 0; i < 15; i++) {  // kWeights2[i] is passed as first arg
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(kWeights2[i], &upsampling2_weights[i]));
+    }
+  }
+  if (visitor->Conditional((custom_weights_mask & 0x2) != 0)) {
+    // 16 5x5 kernels, but all of them can be obtained by symmetry from
+    // three, two of which are symmetric along their main diagonals. The top
+    // left 4 kernels are defined by
+    //
+    // 0  1  2  3  4   5  6  7  8  9
+    // 1 10 11 12 13  14 15 16 17 18
+    // 2 11 19 20 21  22 23 24 25 26
+    // 3 12 20 27 28  29 30 31 32 33
+    // 4 13 21 28 34  35 36 37 38 39
+    //
+    // 5 14 22 29 35  40 41 42 43 44
+    // 6 15 23 30 36  41 45 46 47 48
+    // 7 16 24 31 37  42 46 49 50 51
+    // 8 17 25 32 38  43 47 50 52 53
+    // 9 18 26 33 39  44 48 51 53 54
+    constexpr float kWeights4[55] = {
+        -0.02419067f, -0.03491987f, -0.03693351f, -0.03094285f, -0.00529785f,
+        -0.01663432f, -0.03556863f, -0.03888905f, -0.03516850f, -0.00989469f,
+        0.23651958f,  0.33392945f,  -0.01073543f, -0.01313181f, -0.03556694f,
+        0.13048175f,  0.40103025f,  0.03951150f,  -0.02077584f, 0.46914198f,
+        -0.00209270f, -0.01484589f, -0.04064806f, 0.18942530f,  0.56279892f,
+        0.06674400f,  -0.02335494f, -0.03551682f, -0.00754830f, -0.02267919f,
+        -0.02363578f, 0.00315804f,  -0.03399098f, -0.01359519f, -0.00091653f,
+        -0.00335467f, -0.01163294f, -0.01610294f, -0.00974088f, -0.00191622f,
+        -0.01095446f, -0.03198464f, -0.04455121f, -0.02799790f, -0.00645912f,
+        0.06390599f,  0.22963888f,  0.00630981f,  -0.01897349f, 0.67537268f,
+        0.08483369f,  -0.02534994f, -0.02205197f, -0.01667999f, -0.00384443f};
+    for (size_t i = 0; i < 55; i++) {  // kWeights4[i] is passed as first arg
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(kWeights4[i], &upsampling4_weights[i]));
+    }
+  }
+  if (visitor->Conditional((custom_weights_mask & 0x4) != 0)) {
+    // 64 5x5 kernels, all of them can be obtained by symmetry from
+    // 10, 4 of which are symmetric along their main diagonals. The top
+    // left 16 kernels are defined by
+    //  0  1  2  3  4   5  6  7  8  9   a  b  c  d  e   f 10 11 12 13
+    //  1 14 15 16 17  18 19 1a 1b 1c  1d 1e 1f 20 21  22 23 24 25 26
+    //  2 15 27 28 29  2a 2b 2c 2d 2e  2f 30 31 32 33  34 35 36 37 38
+    //  3 16 28 39 3a  3b 3c 3d 3e 3f  40 41 42 43 44  45 46 47 48 49
+    //  4 17 29 3a 4a  4b 4c 4d 4e 4f  50 51 52 53 54  55 56 57 58 59
+
+    //  5 18 2a 3b 4b  5a 5b 5c 5d 5e  5f 60 61 62 63  64 65 66 67 68
+    //  6 19 2b 3c 4c  5b 69 6a 6b 6c  6d 6e 6f 70 71  72 73 74 75 76
+    //  7 1a 2c 3d 4d  5c 6a 77 78 79  7a 7b 7c 7d 7e  7f 80 81 82 83
+    //  8 1b 2d 3e 4e  5d 6b 78 84 85  86 87 88 89 8a  8b 8c 8d 8e 8f
+    //  9 1c 2e 3f 4f  5e 6c 79 85 90  91 92 93 94 95  96 97 98 99 9a
+
+    //  a 1d 2f 40 50  5f 6d 7a 86 91  9b 9c 9d 9e 9f  a0 a1 a2 a3 a4
+    //  b 1e 30 41 51  60 6e 7b 87 92  9c a5 a6 a7 a8  a9 aa ab ac ad
+    //  c 1f 31 42 52  61 6f 7c 88 93  9d a6 ae af b0  b1 b2 b3 b4 b5
+    //  d 20 32 43 53  62 70 7d 89 94  9e a7 af b6 b7  b8 b9 ba bb bc
+    //  e 21 33 44 54  63 71 7e 8a 95  9f a8 b0 b7 bd  be bf c0 c1 c2
+
+    //  f 22 34 45 55  64 72 7f 8b 96  a0 a9 b1 b8 be  c3 c4 c5 c6 c7
+    // 10 23 35 46 56  65 73 80 8c 97  a1 aa b2 b9 bf  c4 c8 c9 ca cb
+    // 11 24 36 47 57  66 74 81 8d 98  a2 ab b3 ba c0  c5 c9 cc cd ce
+    // 12 25 37 48 58  67 75 82 8e 99  a3 ac b4 bb c1  c6 ca cd cf d0
+    // 13 26 38 49 59  68 76 83 8f 9a  a4 ad b5 bc c2  c7 cb ce d0 d1
+    constexpr float kWeights8[210] = {
+        -0.02928613f, -0.03706353f, -0.03783812f, -0.03324558f, -0.00447632f,
+        -0.02519406f, -0.03752601f, -0.03901508f, -0.03663285f, -0.00646649f,
+        -0.02066407f, -0.03838633f, -0.04002101f, -0.03900035f, -0.00901973f,
+        -0.01626393f, -0.03954148f, -0.04046620f, -0.03979621f, -0.01224485f,
+        0.29895328f,  0.35757708f,  -0.02447552f, -0.01081748f, -0.04314594f,
+        0.23903219f,  0.41119301f,  -0.00573046f, -0.01450239f, -0.04246845f,
+        0.17567618f,  0.45220643f,  0.02287757f,  -0.01936783f, -0.03583255f,
+        0.11572472f,  0.47416733f,  0.06284440f,  -0.02685066f, 0.42720050f,
+        -0.02248939f, -0.01155273f, -0.04562755f, 0.28689496f,  0.49093869f,
+        -0.00007891f, -0.01545926f, -0.04562659f, 0.21238920f,  0.53980934f,
+        0.03369474f,  -0.02070211f, -0.03866988f, 0.14229550f,  0.56593398f,
+        0.08045181f,  -0.02888298f, -0.03680918f, -0.00542229f, -0.02920477f,
+        -0.02788574f, -0.02118180f, -0.03942402f, -0.00775547f, -0.02433614f,
+        -0.03193943f, -0.02030828f, -0.04044014f, -0.01074016f, -0.01930822f,
+        -0.03620399f, -0.01974125f, -0.03919545f, -0.01456093f, -0.00045072f,
+        -0.00360110f, -0.01020207f, -0.01231907f, -0.00638988f, -0.00071592f,
+        -0.00279122f, -0.00957115f, -0.01288327f, -0.00730937f, -0.00107783f,
+        -0.00210156f, -0.00890705f, -0.01317668f, -0.00813895f, -0.00153491f,
+        -0.02128481f, -0.04173044f, -0.04831487f, -0.03293190f, -0.00525260f,
+        -0.01720322f, -0.04052736f, -0.05045706f, -0.03607317f, -0.00738030f,
+        -0.01341764f, -0.03965629f, -0.05151616f, -0.03814886f, -0.01005819f,
+        0.18968273f,  0.33063684f,  -0.01300105f, -0.01372950f, -0.04017465f,
+        0.13727832f,  0.36402234f,  0.01027890f,  -0.01832107f, -0.03365072f,
+        0.08734506f,  0.38194295f,  0.04338228f,  -0.02525993f, 0.56408126f,
+        0.00458352f,  -0.01648227f, -0.04887868f, 0.24585519f,  0.62026135f,
+        0.04314807f,  -0.02213737f, -0.04158014f, 0.16637289f,  0.65027023f,
+        0.09621636f,  -0.03101388f, -0.04082742f, -0.00904519f, -0.02790922f,
+        -0.02117818f, 0.00798662f,  -0.03995711f, -0.01243427f, -0.02231705f,
+        -0.02946266f, 0.00992055f,  -0.03600283f, -0.01684920f, -0.00111684f,
+        -0.00411204f, -0.01297130f, -0.01723725f, -0.01022545f, -0.00165306f,
+        -0.00313110f, -0.01218016f, -0.01763266f, -0.01125620f, -0.00231663f,
+        -0.01374149f, -0.03797620f, -0.05142937f, -0.03117307f, -0.00581914f,
+        -0.01064003f, -0.03608089f, -0.05272168f, -0.03375670f, -0.00795586f,
+        0.09628104f,  0.27129991f,  -0.00353779f, -0.01734151f, -0.03153981f,
+        0.05686230f,  0.28500998f,  0.02230594f,  -0.02374955f, 0.68214326f,
+        0.05018048f,  -0.02320852f, -0.04383616f, 0.18459474f,  0.71517975f,
+        0.10805613f,  -0.03263677f, -0.03637639f, -0.01394373f, -0.02511203f,
+        -0.01728636f, 0.05407331f,  -0.02867568f, -0.01893131f, -0.00240854f,
+        -0.00446511f, -0.01636187f, -0.02377053f, -0.01522848f, -0.00333334f,
+        -0.00819975f, -0.02964169f, -0.04499287f, -0.02745350f, -0.00612408f,
+        0.02727416f,  0.19446600f,  0.00159832f,  -0.02232473f, 0.74982506f,
+        0.11452620f,  -0.03348048f, -0.01605681f, -0.02070339f, -0.00458223f};
+    for (size_t i = 0; i < 210; i++) {  // kWeights8[i] is passed as first arg
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(kWeights8[i], &upsampling8_weights[i]));
+    }
+  }
+  return true;
+}
+
+ExtraChannelInfo::ExtraChannelInfo() { Bundle::Init(this); }
+Status ExtraChannelInfo::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  // General: fields visited for every extra channel, in this order.
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ExtraChannel::kAlpha, &type));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth));
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(0), Val(3), Val(4), BitsOffset(3, 1), 0, &dim_shift));
+  if ((1U << dim_shift) > 8) {  // i.e. reject any dim_shift above 3
+    return JXL_FAILURE("dim_shift %u too large", dim_shift);
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name));
+
+  // Conditional: type-specific fields, each guarded by Conditional().
+  if (visitor->Conditional(type == ExtraChannel::kAlpha)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alpha_associated));
+  }
+  if (visitor->Conditional(type == ExtraChannel::kSpotColor)) {
+    for (float& c : spot_color) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0, &c));
+    }
+  }
+  if (visitor->Conditional(type == ExtraChannel::kCFA)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Bits(2), BitsOffset(4, 3),
+                                           BitsOffset(8, 19), 1, &cfa_channel));
+  }
+  // kUnknown and anything in kReserved0..kReserved7 is reported as a failure.
+  if (type == ExtraChannel::kUnknown ||
+      (int(ExtraChannel::kReserved0) <= int(type) &&
+       int(type) <= int(ExtraChannel::kReserved7))) {
+    return JXL_FAILURE("Unknown extra channel (bits %u, shift %u, name '%s')\n",
+                       bit_depth.bits_per_sample, dim_shift, name.c_str());
+  }
+  return true;
+}
+
+std::string ExtraChannelInfo::DebugString() const {
+  std::ostringstream os;  // Yields "<type>[(premul)] <bit depth> shift: <n>".
+  os << (type == ExtraChannel::kAlpha           ? "Alpha"
+         : type == ExtraChannel::kDepth         ? "Depth"
+         : type == ExtraChannel::kSpotColor     ? "Spot"
+         : type == ExtraChannel::kSelectionMask ? "Mask"
+         : type == ExtraChannel::kBlack         ? "Black"
+         : type == ExtraChannel::kCFA           ? "CFA"
+         : type == ExtraChannel::kThermal       ? "Thermal"
+                                                : "Unknown");  // any other type
+  if (type == ExtraChannel::kAlpha && alpha_associated) os << "(premul)";
+  os << " " << bit_depth.DebugString();
+  os << " shift: " << dim_shift;
+  return os.str();
+}
+
+ImageMetadata::ImageMetadata() { Bundle::Init(this); }
+Status ImageMetadata::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  // Bundle::AllDefault does not allow usage when reading (it may abort the
+  // program when a codestream has invalid values), but when reading we
+  // overwrite the extra_fields value, so do not need to call AllDefault.
+  bool tone_mapping_default =
+      visitor->IsReading() ? false : Bundle::AllDefault(tone_mapping);
+  // extra_fields is computed from the current members, then visited as a Bool.
+  bool extra_fields = (orientation != 1 || have_preview || have_animation ||
+                       have_intrinsic_size || !tone_mapping_default);
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &extra_fields));
+  if (visitor->Conditional(extra_fields)) {
+    orientation--;  // the 3-bit field holds orientation - 1
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &orientation));
+    orientation++;  // back to the 1-based value
+    // (No need for bounds checking because we read exactly 3 bits)
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_intrinsic_size));
+    if (visitor->Conditional(have_intrinsic_size)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&intrinsic_size));
+    }
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_preview));
+    if (visitor->Conditional(have_preview)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&preview_size));
+    }
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_animation));
+    if (visitor->Conditional(have_animation)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation));
+    }
+  } else {  // no extra fields: reset the optional members
+    orientation = 1;  // identity
+    have_intrinsic_size = false;
+    have_preview = false;
+    have_animation = false;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth));
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->Bool(true, &modular_16_bit_buffer_sufficient));
+
+  num_extra_channels = extra_channel_info.size();  // count follows the vector
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2),
+                                         BitsOffset(12, 1), 0,
+                                         &num_extra_channels));
+
+  if (visitor->Conditional(num_extra_channels != 0)) {
+    if (visitor->IsReading()) {
+      extra_channel_info.resize(num_extra_channels);  // one entry per channel
+    }
+    for (ExtraChannelInfo& eci : extra_channel_info) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&eci));
+    }
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &xyb_encoded));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&color_encoding));
+  if (visitor->Conditional(extra_fields)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tone_mapping));
+  }
+
+  // Basic-info-only reads stop here; the extensions below are not visited.
+  if (visitor->IsReading() && nonserialized_only_parse_basic_info) {
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+  // Extensions: in chronological order of being added to the format.
+  return visitor->EndExtensions();
+}
+
+OpsinInverseMatrix::OpsinInverseMatrix() { Bundle::Init(this); }
+Status OpsinInverseMatrix::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+  for (int i = 0; i < 9; ++i) {  // every entry of inverse_matrix
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(
+        DefaultInverseOpsinAbsorbanceMatrix()[i], &inverse_matrix[i]));
+  }
+  for (int i = 0; i < 3; ++i) {  // every entry of opsin_biases
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->F16(kNegOpsinAbsorbanceBiasRGB[i], &opsin_biases[i]));
+  }
+  for (int i = 0; i < 4; ++i) {  // every entry of quant_biases
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->F16(kDefaultQuantBias[i], &quant_biases[i]));
+  }
+  return true;
+}
+
+ToneMapping::ToneMapping() { Bundle::Init(this); }
+Status ToneMapping::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+  // Each float field is range-checked right after it has been visited.
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->F16(kDefaultIntensityTarget, &intensity_target));
+  if (intensity_target <= 0.f) {  // must be strictly positive
+    return JXL_FAILURE("invalid intensity target");
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &min_nits));
+  if (min_nits < 0.f || min_nits > intensity_target) {
+    return JXL_FAILURE("invalid min %f vs max %f", min_nits, intensity_target);
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &relative_to_max_display));
+  // linear_below must be non-negative, and at most 1 when it is relative.
+  JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &linear_below));
+  if (linear_below < 0 || (relative_to_max_display && linear_below > 1.0f)) {
+    return JXL_FAILURE("invalid linear_below %f (%s)", linear_below,
+                       relative_to_max_display ? "relative" : "absolute");
+  }
+
+  return true;
+}
+
+Status ReadImageMetadata(BitReader* JXL_RESTRICT reader,
+                         ImageMetadata* JXL_RESTRICT metadata) {
+  return Bundle::Read(reader, metadata);  // thin wrapper around Bundle::Read
+}
+
+// Adds, updates or removes the alpha extra channel; bits == 0 means removal.
+void ImageMetadata::SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied) {
+  ExtraChannelInfo* const first_alpha = Find(ExtraChannel::kAlpha);
+  const bool had_alpha = (first_alpha != nullptr);
+  if (bits == 0 && had_alpha) {
+    // Drop every alpha entry (not just the first) so that a subsequent
+    // HasAlpha() reports false even if several alpha channels were listed.
+    const auto has_alpha_type = [](const ExtraChannelInfo& entry) {
+      return entry.type == ExtraChannel::kAlpha;
+    };
+    const auto tail = std::remove_if(extra_channel_info.begin(),
+                                     extra_channel_info.end(), has_alpha_type);
+    extra_channel_info.erase(tail, extra_channel_info.end());
+  } else if (bits != 0 && !had_alpha) {
+    // No alpha yet: build one and put it in front of any other extra
+    // channels, so that the alpha channel is listed first.
+    ExtraChannelInfo fresh;
+    fresh.type = ExtraChannel::kAlpha;
+    fresh.bit_depth.bits_per_sample = bits;
+    fresh.dim_shift = 0;
+    fresh.alpha_associated = alpha_is_premultiplied;
+    extra_channel_info.insert(extra_channel_info.begin(), fresh);
+  } else if (bits != 0) {
+    // Alpha exists: only the first alpha entry is updated, others are kept.
+    BitDepth& depth = first_alpha->bit_depth;
+    depth.bits_per_sample = bits;
+    depth.floating_point_sample = false;
+    depth.exponent_bits_per_sample = 0;
+    first_alpha->alpha_associated = alpha_is_premultiplied;
+  }
+  // Keep the cached count in sync with the vector.
+  num_extra_channels = extra_channel_info.size();
+  // More than 12 bits: the 16-bit modular buffer is no longer flagged enough.
+  if (bits > 12) modular_16_bit_buffer_sufficient = false;
+}
+
+// Returns a one-line, human-readable summary of this metadata.
+std::string ImageMetadata::DebugString() const {
+  std::ostringstream os;
+  os << bit_depth.DebugString();
+  if (modular_16_bit_buffer_sufficient) os << " (modular 16)";
+  os << (xyb_encoded ? " xyb encoded" : " orig profile");
+  os << " " << Description(color_encoding);
+  // Iterate the vector itself: num_extra_channels is only a cached count and
+  // may be stale, in which case indexing with it could read out of bounds.
+  const size_t num_channels = extra_channel_info.size();
+  if (num_channels > 0) {
+    os << " extra channels:";
+    for (size_t i = 0; i < num_channels; ++i) {
+      os << " (" << extra_channel_info[i].DebugString() << ")";
+      if (i + 1 < num_channels) os << ",";
+    }
+  }
+  if (have_preview) {
+    os << " preview: " << preview_size.xsize() << "x" << preview_size.ysize();
+  }
+  if (orientation != 1) os << " orientation: " << orientation;
+  return os.str();
+}
+
+// Formats the codestream as "<xsize>x<ysize> <image metadata summary>".
+std::string CodecMetadata::DebugString() const {
+  std::ostringstream summary;
+  summary << size.xsize() << "x" << size.ysize() << " " << m.DebugString();
+  return summary.str();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_metadata.h b/third_party/jpeg-xl/lib/jxl/image_metadata.h
new file mode 100644
index 0000000000..ca69eb3a3d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_metadata.h
@@ -0,0 +1,425 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Main codestream header bundles, the metadata that applies to all frames.
+// Enums must align with the C API definitions in codestream_header.h.
+
+#ifndef LIB_JXL_IMAGE_METADATA_H_
+#define LIB_JXL_IMAGE_METADATA_H_
+
+#include <jxl/codestream_header.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// EXIF orientation of the image. This field overrides any field present in
+// actual EXIF metadata. The value tells which transformation the decoder must
+// apply after decoding to display the image with the correct orientation.
+enum class Orientation : uint32_t {
+  // Values 1..8 match the EXIF definitions.
+  kIdentity = JXL_ORIENT_IDENTITY,
+  kFlipHorizontal = JXL_ORIENT_FLIP_HORIZONTAL,
+  kRotate180 = JXL_ORIENT_ROTATE_180,
+  kFlipVertical = JXL_ORIENT_FLIP_VERTICAL,
+  kTranspose = JXL_ORIENT_TRANSPOSE,
+  kRotate90 = JXL_ORIENT_ROTATE_90_CW,
+  kAntiTranspose = JXL_ORIENT_ANTI_TRANSPOSE,
+  kRotate270 = JXL_ORIENT_ROTATE_90_CCW,
+};
+// Don't need an EnumBits because Orientation is not read via Enum().
+
+enum class ExtraChannel : uint32_t {
+  // First two enumerators (most common) are cheaper to encode
+  kAlpha = JXL_CHANNEL_ALPHA,
+  kDepth = JXL_CHANNEL_DEPTH,
+
+  kSpotColor = JXL_CHANNEL_SPOT_COLOR,
+  kSelectionMask = JXL_CHANNEL_SELECTION_MASK,
+  kBlack = JXL_CHANNEL_BLACK,  // for CMYK
+  kCFA = JXL_CHANNEL_CFA,      // Bayer channel
+  kThermal = JXL_CHANNEL_THERMAL,
+  kReserved0 = JXL_CHANNEL_RESERVED0,
+  kReserved1 = JXL_CHANNEL_RESERVED1,
+  kReserved2 = JXL_CHANNEL_RESERVED2,
+  kReserved3 = JXL_CHANNEL_RESERVED3,
+  kReserved4 = JXL_CHANNEL_RESERVED4,
+  kReserved5 = JXL_CHANNEL_RESERVED5,
+  kReserved6 = JXL_CHANNEL_RESERVED6,
+  kReserved7 = JXL_CHANNEL_RESERVED7,
+  // disambiguated via name string, raise warning if unsupported
+  kUnknown = JXL_CHANNEL_UNKNOWN,
+  // like kUnknown but can silently be ignored
+  kOptional = JXL_CHANNEL_OPTIONAL
+};
+static inline const char* EnumName(ExtraChannel /*unused*/) {
+  return "ExtraChannel";  // the enum's type name; the argument is not used
+}
+static inline constexpr uint64_t EnumBits(ExtraChannel /*unused*/) {
+  using EC = ExtraChannel;  // kReserved0..kReserved7 are not part of the mask
+  return MakeBit(EC::kAlpha) | MakeBit(EC::kDepth) | MakeBit(EC::kSpotColor) |
+         MakeBit(EC::kSelectionMask) | MakeBit(EC::kBlack) | MakeBit(EC::kCFA) |
+         MakeBit(EC::kThermal) | MakeBit(EC::kUnknown) | MakeBit(EC::kOptional);
+}
+
+// Used in ImageMetadata and ExtraChannelInfo.
+struct BitDepth : public Fields {
+  BitDepth();
+  JXL_FIELDS_NAME(BitDepth)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  std::string DebugString() const;
+
+  // Whether the original (uncompressed) samples are floating point or
+  // unsigned integer.
+  bool floating_point_sample;
+
+  // Bit depth of the original (uncompressed) image samples. Must be in the
+  // range [1, 32].
+  uint32_t bits_per_sample;
+
+  // Floating point exponent bits of the original (uncompressed) image samples,
+  // only used if floating_point_sample is true.
+  // If used, the samples are floating point with:
+  // - 1 sign bit
+  // - exponent_bits_per_sample exponent bits
+  // - (bits_per_sample - exponent_bits_per_sample - 1) mantissa bits
+  // If used, exponent_bits_per_sample must be in the range
+  // [2, 8] and amount of mantissa bits must be in the range [2, 23].
+  // NOTE: exponent_bits_per_sample is 8 for single precision binary32
+  // point, 5 for half precision binary16, 7 for fp24.
+  uint32_t exponent_bits_per_sample;
+};
+
+// Describes one extra channel.
+struct ExtraChannelInfo : public Fields {
+  ExtraChannelInfo();
+  JXL_FIELDS_NAME(ExtraChannelInfo)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  std::string DebugString() const;  // short human-readable summary
+
+  mutable bool all_default;
+
+  ExtraChannel type;
+  BitDepth bit_depth;
+  uint32_t dim_shift;  // downsampled by 2^dim_shift on each axis; at most 3
+
+  std::string name;  // UTF-8
+
+  // Conditional: each is visited only for the channel type named below.
+  bool alpha_associated;  // kAlpha only; i.e. premultiplied
+  float spot_color[4];    // kSpotColor only; spot color in linear RGBA
+  uint32_t cfa_channel;   // kCFA only
+};
+
+struct OpsinInverseMatrix : public Fields {
+  OpsinInverseMatrix();
+  JXL_FIELDS_NAME(OpsinInverseMatrix)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  mutable bool all_default;
+
+  float inverse_matrix[9];  // F16 default: DefaultInverseOpsinAbsorbanceMatrix()
+  float opsin_biases[3];    // F16 default: kNegOpsinAbsorbanceBiasRGB
+  float quant_biases[4];    // F16 default: kDefaultQuantBias
+};
+
+// Information useful for mapping HDR images to lower dynamic range displays.
+struct ToneMapping : public Fields {
+  ToneMapping();
+  JXL_FIELDS_NAME(ToneMapping)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  mutable bool all_default;
+
+  // Upper bound on the intensity level present in the image. For unsigned
+  // integer pixel encodings, this is the brightness of the largest
+  // representable value. The image does not necessarily contain a pixel
+  // actually this bright. An encoder is allowed to set 255 for SDR images
+  // without computing a histogram.
+  float intensity_target;  // [nits]
+
+  // Lower bound on the intensity level present in the image. This may be
+  // loose, i.e. lower than the actual darkest pixel. When tone mapping, a
+  // decoder will map [min_nits, intensity_target] to the display range.
+  float min_nits;
+
+  bool relative_to_max_display;  // see below
+  // The tone mapping will leave unchanged (linear mapping) any pixels whose
+  // brightness is strictly below this. The interpretation depends on
+  // relative_to_max_display. If true, this is a ratio [0, 1] of the maximum
+  // display brightness [nits], otherwise an absolute brightness [nits].
+  float linear_below;
+};
+
+// Contains weights to customize some transforms - in particular, XYB and
+// upsampling.
+struct CustomTransformData : public Fields {
+  CustomTransformData();
+  JXL_FIELDS_NAME(CustomTransformData)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Must be set before calling VisitFields. Must equal xyb_encoded of
+  // ImageMetadata, should be set by ImageMetadata during VisitFields.
+  bool nonserialized_xyb_encoded = false;
+
+  mutable bool all_default;
+
+  OpsinInverseMatrix opsin_inverse_matrix;
+
+  uint32_t custom_weights_mask;
+  float upsampling2_weights[15];
+  float upsampling4_weights[55];
+  float upsampling8_weights[210];
+};
+
+// Properties of the original image bundle. This enables Encode(Decode()) to
+// re-create an equivalent image without user input.
+struct ImageMetadata : public Fields {
+  ImageMetadata();
+  JXL_FIELDS_NAME(ImageMetadata)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Returns bit depth of the JPEG XL compressed alpha channel, or 0 if no alpha
+  // channel present. In the theoretical case that there are multiple alpha
+  // channels, returns the bit depth of the first.
+  uint32_t GetAlphaBits() const {
+    const ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha);
+    if (alpha == nullptr) return 0;
+    JXL_ASSERT(alpha->bit_depth.bits_per_sample != 0);
+    return alpha->bit_depth.bits_per_sample;
+  }
+
+  // Sets bit depth of alpha channel, adding extra channel if needed, or
+  // removing all alpha channels if bits is 0.
+  // Assumes integer alpha channel and not designed to support multiple
+  // alpha channels (it's possible to use those features by manipulating
+  // extra_channel_info directly).
+  //
+  // Callers must insert the actual channel image at the same index before any
+  // further modifications to extra_channel_info.
+  void SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied = false);
+
+  bool HasAlpha() const { return GetAlphaBits() != 0; }
+
+  // Sets the original bit depth fields to indicate unsigned integer of the
+  // given bit depth.
+  // TODO(lode): move function to BitDepth
+  void SetUintSamples(uint32_t bits) {
+    bit_depth.bits_per_sample = bits;
+    bit_depth.exponent_bits_per_sample = 0;
+    bit_depth.floating_point_sample = false;
+    // RCT / Squeeze may add one bit each, and this is about int16_t,
+    // so uint13 should still be OK but limiting it to 12 seems safer.
+    // TODO(jon): figure out a better way to set this header field.
+    // (in particular, if modular mode is not used it doesn't matter,
+    // and if transforms are restricted, up to 15-bit could be done)
+    if (bits > 12) modular_16_bit_buffer_sufficient = false;
+  }
+  // Sets the original bit depth fields to indicate single precision floating
+  // point.
+  // TODO(lode): move function to BitDepth
+  void SetFloat32Samples() {
+    bit_depth.bits_per_sample = 32;
+    bit_depth.exponent_bits_per_sample = 8;
+    bit_depth.floating_point_sample = true;
+    modular_16_bit_buffer_sufficient = false;
+  }
+
+  void SetFloat16Samples() {
+    bit_depth.bits_per_sample = 16;
+    bit_depth.exponent_bits_per_sample = 5;
+    bit_depth.floating_point_sample = true;
+    modular_16_bit_buffer_sufficient = false;
+  }
+
+  void SetIntensityTarget(float intensity_target) {
+    tone_mapping.intensity_target = intensity_target;
+  }
+  float IntensityTarget() const {
+    JXL_ASSERT(tone_mapping.intensity_target != 0);
+    return tone_mapping.intensity_target;
+  }
+
+  // Returns first ExtraChannelInfo of the given type, or nullptr if none.
+  const ExtraChannelInfo* Find(ExtraChannel type) const {
+    for (const ExtraChannelInfo& eci : extra_channel_info) {
+      if (eci.type == type) return &eci;
+    }
+    return nullptr;
+  }
+
+  // Returns first ExtraChannelInfo of the given type, or nullptr if none.
+  ExtraChannelInfo* Find(ExtraChannel type) {
+    for (ExtraChannelInfo& eci : extra_channel_info) {
+      if (eci.type == type) return &eci;
+    }
+    return nullptr;
+  }
+
+  Orientation GetOrientation() const {
+    return static_cast<Orientation>(orientation);
+  }
+
+  bool ExtraFieldsDefault() const;
+
+  std::string DebugString() const;
+
+  mutable bool all_default;
+
+  BitDepth bit_depth;
+  bool modular_16_bit_buffer_sufficient;  // otherwise 32 is.
+
+  // Whether the colors values of the pixels of frames are encoded in the
+  // codestream using the absolute XYB color space, or the using values that
+  // follow the color space defined by the ColorEncoding or ICC profile. This
+  // determines when or whether a CMS (Color Management System) is needed to get
+  // the pixels in a desired color space. In one case, the pixels have one known
+  // color space and a CMS is needed to convert them to the original image's
+  // color space, in the other case the pixels have the color space of the
+  // original image and a CMS is required if a different display space, or a
+  // single known consistent color space for multiple decoded images, is
+  // desired. In all cases, the color space of all frames from a single image is
+  // the same, both VarDCT and modular frames.
+  //
+  // If true: then frames can be decoded to XYB (which can also be converted to
+  // linear and non-linear sRGB with the built in conversion without CMS). The
+  // attached ColorEncoding or ICC profile has no effect on the meaning of the
+  // pixel's color values, but instead indicates what the color profile of the
+  // original image was, and what color profile one should convert to when
+  // decoding to integers to prevent clipping and precision loss. To do that
+  // conversion requires a CMS.
+  //
+  // If false: then the color values of decoded frames are in the space defined
+  // by the attached ColorEncoding or ICC profile. To instead get the pixels in
+  // a chosen known color space, such as sRGB, requires a CMS, since the
+  // attached ColorEncoding or ICC profile could be any arbitrary color space.
+  // This mode is typically used for lossless images encoded as integers.
+  // Frames can also use YCbCr encoding, some frames may and some may not, but
+  // this is not a different color space but a certain encoding of the RGB
+  // values.
+  //
+  // Note: if !xyb_encoded, but the attached color profile indicates XYB (which
+  // can happen either if it's a ColorEncoding with color_space_ ==
+  // ColorSpace::kXYB, or if it's an ICC Profile that has been crafted to
+  // represent XYB), then the frames still may not use ColorEncoding kXYB, they
+  // must still use kNone (or kYCbCr, which would mean applying the YCbCr
+  // transform to the 3-channel XYB data), since with !xyb_encoded, the 3
+  // channels are stored as-is, no matter what meaning the color profile assigns
+  // to them. To use ColorEncoding::kXYB, xyb_encoded must be true.
+  //
+  // This value is defined in image metadata because this is the global
+  // codestream header. This value does not affect the image itself, so is not
+  // image metadata per se, it only affects the encoding, and what color space
+  // the decoder can receive the pixels in without needing a CMS.
+  bool xyb_encoded;
+
+  ColorEncoding color_encoding;
+
+  // These values are initialized to defaults such that the 'extra_fields'
+  // condition in VisitFields uses correctly initialized values.
+  uint32_t orientation = 1;
+  bool have_preview = false;
+  bool have_animation = false;
+  bool have_intrinsic_size = false;
+
+  // If present, the stored image has the dimensions of the first SizeHeader,
+  // but decoders are advised to resample or display per `intrinsic_size`.
+  SizeHeader intrinsic_size;  // only if have_intrinsic_size
+
+  ToneMapping tone_mapping;
+
+  // When reading: deserialized. When writing: automatically set from vector.
+  uint32_t num_extra_channels;
+  std::vector<ExtraChannelInfo> extra_channel_info;
+
+  // Only present if m.have_preview.
+  PreviewHeader preview_size;
+  // Only present if m.have_animation.
+  AnimationHeader animation;
+
+  uint64_t extensions;
+
+  // Option to stop parsing after basic info, and treat as if the later
+  // fields do not participate. Use to parse only basic image information
+  // excluding the final larger or variable sized data.
+  bool nonserialized_only_parse_basic_info = false;
+};
+
+Status ReadImageMetadata(BitReader* JXL_RESTRICT reader,
+                         ImageMetadata* JXL_RESTRICT metadata);
+
+Status WriteImageMetadata(const ImageMetadata& metadata,
+                          BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* aux_out);
+
+// All metadata applicable to the entire codestream (dimensions, extra channels,
+// ...)
+struct CodecMetadata {
+  // TODO(lode): use the preview and animation fields too, in place of the
+  // nonserialized_ ones in ImageMetadata.
+  ImageMetadata m;
+  // The size of the codestream: this is the nominal size applicable to all
+  // frames, although some frames can have a different effective size through
+  // crop, dc_level or representing the preview.
+  SizeHeader size;
+  // Often default.
+  CustomTransformData transform_data;
+
+  // Nominal codestream dimensions, taken from `size`.
+  size_t xsize() const { return size.xsize(); }
+  size_t ysize() const { return size.ysize(); }
+
+  // Oriented extents: for every oriented_* accessor, the x and y extents are
+  // exchanged when the orientation value is greater than 4 and
+  // keep_orientation is false; otherwise the stored extent is returned
+  // unchanged.
+  size_t oriented_xsize(bool keep_orientation) const {
+    const bool swap_axes =
+        !keep_orientation && static_cast<uint32_t>(m.GetOrientation()) > 4;
+    return swap_axes ? ysize() : xsize();
+  }
+  size_t oriented_ysize(bool keep_orientation) const {
+    const bool swap_axes =
+        !keep_orientation && static_cast<uint32_t>(m.GetOrientation()) > 4;
+    return swap_axes ? xsize() : ysize();
+  }
+  // Same rule as above, applied to the preview dimensions.
+  size_t oriented_preview_xsize(bool keep_orientation) const {
+    const bool swap_axes =
+        !keep_orientation && static_cast<uint32_t>(m.GetOrientation()) > 4;
+    return swap_axes ? m.preview_size.ysize() : m.preview_size.xsize();
+  }
+  size_t oriented_preview_ysize(bool keep_orientation) const {
+    const bool swap_axes =
+        !keep_orientation && static_cast<uint32_t>(m.GetOrientation()) > 4;
+    return swap_axes ? m.preview_size.xsize() : m.preview_size.ysize();
+  }
+
+  // Summary string: the size followed by the image metadata description.
+  std::string DebugString() const;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_METADATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_ops.h b/third_party/jpeg-xl/lib/jxl/image_ops.h
new file mode 100644
index 0000000000..c025007e95
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_ops.h
@@ -0,0 +1,805 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_OPS_H_
+#define LIB_JXL_IMAGE_OPS_H_
+
+// Operations on images.
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <vector>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+void CopyImageTo(const Plane<T>& from, Plane<T>* JXL_RESTRICT to) {
+  PROFILER_ZONE("CopyImage1");
+  JXL_ASSERT(SameSize(from, *to));
+  if (from.xsize() == 0 || from.ysize() == 0) return;  // nothing to copy
+  const size_t row_bytes = from.xsize() * sizeof(T);
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    // Copy row by row; only the xsize() valid pixels of each row are moved.
+    memcpy(to->Row(y), from.ConstRow(y), row_bytes);
+  }
+}
+
+// DEPRECATED - allocates a new plane; prefer a preallocated CopyImageTo.
+template <typename T>
+Plane<T> CopyImage(const Plane<T>& from) {
+  Plane<T> copy(from.xsize(), from.ysize());
+  CopyImageTo(from, &copy);
+  return copy;
+}
+
+// Copies the pixels of `from` inside `rect_from` into `rect_to` of `to`.
+template <typename T>
+void CopyImageTo(const Rect& rect_from, const Plane<T>& from,
+                 const Rect& rect_to, Plane<T>* JXL_RESTRICT to) {
+  PROFILER_ZONE("CopyImageR");
+  JXL_DASSERT(SameSize(rect_from, rect_to));
+  JXL_DASSERT(rect_from.IsInside(from));
+  JXL_DASSERT(rect_to.IsInside(*to));
+  const size_t row_bytes = rect_from.xsize() * sizeof(T);
+  if (row_bytes == 0) return;  // zero-width rectangle: nothing to copy
+  for (size_t y = 0; y < rect_from.ysize(); ++y) {
+    const T* JXL_RESTRICT src = rect_from.ConstRow(from, y);
+    memcpy(rect_to.Row(to, y), src, row_bytes);
+  }
+}
+
+// DEPRECATED - Allocates and returns a copy of the part of "image" in "rect".
+template <typename T>
+Plane<T> CopyImage(const Rect& rect, const Plane<T>& image) {
+  Plane<T> cropped(rect.xsize(), rect.ysize());
+  CopyImageTo(rect, image, &cropped);
+  return cropped;
+}
+
+// Copies `from:rect_from` to `to:rect_to`, one color plane at a time.
+template <typename T>
+void CopyImageTo(const Rect& rect_from, const Image3<T>& from,
+                 const Rect& rect_to, Image3<T>* JXL_RESTRICT to) {
+  PROFILER_ZONE("CopyImageR");
+  JXL_ASSERT(SameSize(rect_from, rect_to));
+  for (size_t plane = 0; plane < 3; ++plane) {
+    CopyImageTo(rect_from, from.Plane(plane), rect_to, &to->Plane(plane));
+  }
+}
+
+template <typename T, typename U>
+void ConvertPlaneAndClamp(const Rect& rect_from, const Plane<T>& from,
+                          const Rect& rect_to, Plane<U>* JXL_RESTRICT to) {
+  PROFILER_ZONE("ConvertPlane");
+  JXL_ASSERT(SameSize(rect_from, rect_to));
+  using M = decltype(T() + U());  // common type in which values are compared
+  const M lo = std::numeric_limits<U>::min();
+  const M hi = std::numeric_limits<U>::max();
+  for (size_t y = 0; y < rect_to.ysize(); ++y) {
+    const T* JXL_RESTRICT src = rect_from.ConstRow(from, y);
+    U* JXL_RESTRICT dst = rect_to.Row(to, y);
+    for (size_t x = 0; x < rect_to.xsize(); ++x) {
+      dst[x] = std::min<M>(std::max<M>(src[x], lo), hi);
+    }
+  }
+}
+
+// Copies all of `from` into all of `to` (forwards to the Rect overload).
+template <typename T>
+void CopyImageTo(const T& from, T* JXL_RESTRICT to) {
+  CopyImageTo(Rect(from), from, Rect(*to), to);
+}
+
+// Copies `from:rect_from` into all of `to` (forwards to the Rect overload).
+template <typename T>
+void CopyImageTo(const Rect& rect_from, const T& from, T* JXL_RESTRICT to) {
+  CopyImageTo(rect_from, from, Rect(*to), to);
+}
+
+// Copies all of `from` into `to:rect_to` (forwards to the Rect overload).
+template <typename T>
+void CopyImageTo(const T& from, const Rect& rect_to, T* JXL_RESTRICT to) {
+  CopyImageTo(Rect(from), from, rect_to, to);
+}
+
+// Copies `from:from_rect` to `to:to_rect`; also copies up to `padding` pixels
+// of border around `from:from_rect`, in all directions, whenever they are
+// inside the first image (`from`).
+template <typename T>
+void CopyImageToWithPadding(const Rect& from_rect, const T& from,
+                            size_t padding, const Rect& to_rect, T* to) {
+  size_t xextra0 = std::min(padding, from_rect.x0());  // room on the left
+  size_t xextra1 =                                      // room on the right
+      std::min(padding, from.xsize() - from_rect.x0() - from_rect.xsize());
+  size_t yextra0 = std::min(padding, from_rect.y0());  // room above
+  size_t yextra1 =                                      // room below
+      std::min(padding, from.ysize() - from_rect.y0() - from_rect.ysize());
+  JXL_DASSERT(to_rect.x0() >= xextra0);  // destination must have room too
+  JXL_DASSERT(to_rect.y0() >= yextra0);
+
+  return CopyImageTo(Rect(from_rect.x0() - xextra0, from_rect.y0() - yextra0,
+                          from_rect.xsize() + xextra0 + xextra1,
+                          from_rect.ysize() + yextra0 + yextra1),
+                     from,
+                     Rect(to_rect.x0() - xextra0, to_rect.y0() - yextra0,
+                          to_rect.xsize() + xextra0 + xextra1,
+                          to_rect.ysize() + yextra0 + yextra1),
+                     to);
+}
+
+// DEPRECATED - allocates a new image; prefer a preallocated CopyImageTo.
+template <typename T>
+Image3<T> CopyImage(const Image3<T>& from) {
+  Image3<T> duplicate(from.xsize(), from.ysize());
+  CopyImageTo(from, &duplicate);
+  return duplicate;
+}
+
+// DEPRECATED - prefer to preallocate result. Returns a copy of `from:rect`.
+template <typename T>
+Image3<T> CopyImage(const Rect& rect, const Image3<T>& from) {
+  Image3<T> to(rect.xsize(), rect.ysize());
+  CopyImageTo(rect, from.Plane(0), &to.Plane(0));  // destination is a pointer
+  CopyImageTo(rect, from.Plane(1), &to.Plane(1));
+  CopyImageTo(rect, from.Plane(2), &to.Plane(2));
+  return to;
+}
+
+// Sets "thickness" pixels on each border to "value" (clamped to the image
+// size); faster than initializing the whole image and overwriting the interior.
+template <typename T>
+void SetBorder(const size_t thickness, const T value, Image3<T>* image) {
+  const size_t xsize = image->xsize();
+  const size_t ysize = image->ysize();
+  const size_t xthick = std::min(thickness, xsize);  // never wider than a row
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < std::min(thickness, ysize); ++y) {  // top rows
+      T* JXL_RESTRICT row = image->PlaneRow(c, y);
+      std::fill(row, row + xsize, value);
+    }
+
+    // Bottom: fill entire row (start wraps and skips if thickness > ysize).
+    for (size_t y = ysize - thickness; y < ysize; ++y) {
+      T* JXL_RESTRICT row = image->PlaneRow(c, y);
+      std::fill(row, row + xsize, value);
+    }
+
+    // Left/right: fill the 'columns' on either side, but only if the image is
+    // big enough that they don't already belong to the top/bottom rows.
+    if (ysize >= 2 * thickness) {
+      for (size_t y = thickness; y < ysize - thickness; ++y) {
+        T* JXL_RESTRICT row = image->PlaneRow(c, y);
+        std::fill(row, row + xthick, value);
+        std::fill(row + xsize - xthick, row + xsize, value);
+      }
+    }
+  }
+}
+
+template <class ImageIn, class ImageOut>
+void Subtract(const ImageIn& image1, const ImageIn& image2, ImageOut* out) {
+  using T = typename ImageIn::T;
+  const size_t width = image1.xsize();
+  const size_t height = image1.ysize();
+  JXL_CHECK(width == image2.xsize());
+  JXL_CHECK(height == image2.ysize());
+  // out = image1 - image2, pixel by pixel.
+  for (size_t y = 0; y < height; ++y) {
+    const T* const JXL_RESTRICT lhs = image1.Row(y);
+    const T* const JXL_RESTRICT rhs = image2.Row(y);
+    T* const JXL_RESTRICT diff = out->Row(y);
+    for (size_t x = 0; x < width; ++x) {
+      diff[x] = lhs[x] - rhs[x];
+    }
+  }
+}
+
+// In-place: subtracts `what` from `to`, pixel by pixel.
+template <typename Tin, typename Tout>
+void SubtractFrom(const Plane<Tin>& what, Plane<Tout>* to) {
+  const size_t width = what.xsize();
+  const size_t height = what.ysize();
+  for (size_t y = 0; y < height; ++y) {
+    const Tin* JXL_RESTRICT src = what.ConstRow(y);
+    Tout* JXL_RESTRICT dst = to->Row(y);
+    for (size_t x = 0; x < width; ++x) {
+      dst[x] -= src[x];
+    }
+  }
+}
+
+// In-place: adds `what` to `to`, pixel by pixel.
+template <typename Tin, typename Tout>
+void AddTo(const Plane<Tin>& what, Plane<Tout>* to) {
+  const size_t width = what.xsize();
+  const size_t height = what.ysize();
+  for (size_t y = 0; y < height; ++y) {
+    const Tin* JXL_RESTRICT src = what.ConstRow(y);
+    Tout* JXL_RESTRICT dst = to->Row(y);
+    for (size_t x = 0; x < width; ++x) {
+      dst[x] += src[x];
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void AddTo(Rect rectFrom, const Plane<Tin>& what, Rect rectTo,
+           Plane<Tout>* to) {
+  JXL_ASSERT(SameSize(rectFrom, rectTo));
+  const size_t width = rectTo.xsize();
+  const size_t height = rectTo.ysize();
+  for (size_t y = 0; y < height; ++y) {  // to:rectTo += what:rectFrom
+    const Tin* JXL_RESTRICT src = rectFrom.ConstRow(what, y);
+    Tout* JXL_RESTRICT dst = rectTo.Row(to, y);
+    for (size_t x = 0; x < width; ++x) {
+      dst[x] += src[x];
+    }
+  }
+}
+
+// Returns lambda1 * image1 + lambda2 * image2 for two equally sized planes.
+template <typename T>
+Plane<T> LinComb(const T lambda1, const Plane<T>& image1, const T lambda2,
+                 const Plane<T>& image2) {
+  const size_t width = image1.xsize();
+  const size_t height = image1.ysize();
+  JXL_CHECK(width == image2.xsize());
+  JXL_CHECK(height == image2.ysize());
+  Plane<T> combined(width, height);
+  for (size_t y = 0; y < height; ++y) {
+    const T* const JXL_RESTRICT in1 = image1.Row(y);
+    const T* const JXL_RESTRICT in2 = image2.Row(y);
+    T* const JXL_RESTRICT dst = combined.Row(y);
+    for (size_t x = 0; x < width; ++x) {
+      dst[x] = lambda1 * in1[x] + lambda2 * in2[x];
+    }
+  }
+  return combined;
+}
+
+// Returns a new plane in which every pixel of `image` is multiplied by lambda.
+template <typename T>
+Plane<T> ScaleImage(const T lambda, const Plane<T>& image) {
+  Plane<T> scaled(image.xsize(), image.ysize());
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const T* const JXL_RESTRICT src = image.Row(y);
+    T* const JXL_RESTRICT dst = scaled.Row(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      dst[x] = lambda * src[x];
+    }
+  }
+  return scaled;
+}
+
+// In-place: multiplies every pixel of `image` by lambda.
+template <typename T>
+void ScaleImage(const T lambda, Plane<T>* image) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT pixels = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      pixels[x] = lambda * pixels[x];
+    }
+  }
+}
+
+template <typename T>
+Plane<T> Product(const Plane<T>& a, const Plane<T>& b) {
+  Plane<T> product(a.xsize(), a.ysize());  // sized like `a`; b is not checked
+  for (size_t y = 0; y < a.ysize(); ++y) {
+    const T* const JXL_RESTRICT lhs = a.Row(y);
+    const T* const JXL_RESTRICT rhs = b.Row(y);
+    T* const JXL_RESTRICT dst = product.Row(y);
+    for (size_t x = 0; x < a.xsize(); ++x) {
+      dst[x] = lhs[x] * rhs[x];
+    }
+  }
+  return product;
+}
+
+template <typename T>
+void FillImage(const T value, Plane<T>* image) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT pixels = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      pixels[x] = value;  // every valid pixel gets the same value
+    }
+  }
+}
+
+template <typename T>
+void ZeroFillImage(Plane<T>* image) {
+  const size_t row_bytes = image->xsize() * sizeof(T);
+  if (row_bytes == 0) return;  // zero-width image: no row is touched
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    memset(image->Row(y), 0, row_bytes);
+  }
+}
+
+// Reflects an out-of-bounds coordinate back into [0, xsize); coordinates that
+// are already valid are returned unchanged. The radius (distance outside the
+// image) is assumed small compared to xsize, otherwise this might not
+// terminate. The edge pixel itself is repeated by the reflection.
+static inline int64_t Mirror(int64_t x, const int64_t xsize) {
+  JXL_DASSERT(xsize != 0);
+
+  // TODO(janwas): replace with branchless version
+  while (x < 0 || x >= xsize) {
+    // Left of the image: -1 maps to 0, -2 to 1, ...
+    // Right of the image: xsize maps to xsize - 1, xsize + 1 to xsize - 2, ...
+    // A reflected coordinate may land outside on the other side when the
+    // image is small, hence the loop.
+    x = (x < 0) ? (-x - 1) : (2 * xsize - 1 - x);
+  }
+  return x;
+}
+
+// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
+
+// Mirrors (repeating the edge pixel once). Useful for convolutions.
+struct WrapMirror {
+  JXL_INLINE int64_t operator()(const int64_t coord, const int64_t size) const {
+    return Mirror(coord, size);  // result is in [0, size)
+  }
+};
+
+// Returns the same coordinate: required for TFNode with Border(), or useful
+// when we know "coord" is already valid (e.g. interior of an image).
+struct WrapUnchanged {
+  JXL_INLINE int64_t operator()(const int64_t coord, int64_t /*size*/) const {
+    return coord;  // no range check: the size argument is ignored
+  }
+};
+
+// Similar to Wrap* but for row pointers (reduces Row() multiplications).
+
+class WrapRowMirror {
+ public:
+  template <class ImageOrView>
+  WrapRowMirror(const ImageOrView& image, size_t ysize)
+      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
+
+  const float* operator()(const float* const JXL_RESTRICT row,
+                          const int64_t stride) const {
+    if (row < first_row_) {  // above the image
+      const int64_t num_before = first_row_ - row;  // distance in floats
+      // Mirrored; one row before => row 0, two before = row 1, ...
+      return first_row_ + num_before - stride;
+    }
+    if (row > last_row_) {  // below the image
+      const int64_t num_after = row - last_row_;  // distance in floats
+      // Mirrored; one row after => last row, two after = last - 1, ...
+      return last_row_ - num_after + stride;
+    }
+    return row;  // already within [first_row_, last_row_]
+  }
+
+ private:
+  const float* const JXL_RESTRICT first_row_;  // image.ConstRow(0)
+  const float* const JXL_RESTRICT last_row_;   // image.ConstRow(ysize - 1)
+};
+
+struct WrapRowUnchanged {
+  JXL_INLINE const float* operator()(const float* const JXL_RESTRICT row,
+                                     int64_t /*stride*/) const {
+    return row;  // row-pointer counterpart of WrapUnchanged: no adjustment
+  }
+};
+
+// Sets "thickness" pixels on each border to "value" (clamped to the image
+// size); faster than initializing the whole image and overwriting the interior.
+template <typename T>
+void SetBorder(const size_t thickness, const T value, Plane<T>* image) {
+  const size_t xsize = image->xsize();
+  const size_t ysize = image->ysize();
+  const size_t xthick = std::min(thickness, xsize);  // never wider than a row
+  for (size_t y = 0; y < std::min(thickness, ysize); ++y) {  // top rows
+    T* const JXL_RESTRICT row = image->Row(y);
+    std::fill(row, row + xsize, value);
+  }
+
+  // Bottom: fill entire row (start wraps and skips if thickness > ysize).
+  for (size_t y = ysize - thickness; y < ysize; ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    std::fill(row, row + xsize, value);
+  }
+
+  // Left/right: fill the 'columns' on either side, but only if the image is
+  // big enough that they don't already belong to the top/bottom rows.
+  if (ysize >= 2 * thickness) {
+    for (size_t y = thickness; y < ysize - thickness; ++y) {
+      T* const JXL_RESTRICT row = image->Row(y);
+      std::fill(row, row + xthick, value);
+      std::fill(row + xsize - xthick, row + xsize, value);
+    }
+  }
+}
+
+// Stores the smallest and largest pixel value of `image` in *min and *max.
+template <typename T>
+void ImageMinMax(const Plane<T>& image, T* const JXL_RESTRICT min,
+                 T* const JXL_RESTRICT max) {
+  *min = std::numeric_limits<T>::max();
+  *max = std::numeric_limits<T>::lowest();
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const T* const JXL_RESTRICT pixels = image.Row(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      *min = std::min(*min, pixels[x]);
+      *max = std::max(*max, pixels[x]);
+    }
+  }
+}
+
+// Copies pixels, scaling their value relative to the "from" min/max by
+// "to_range" (U8 [0, 255], to_range = 1.0 => [0.0, 1.0]); constant => 0.
+template <typename FromType, typename ToType>
+void ImageConvert(const Plane<FromType>& from, const float to_range,
+                  Plane<ToType>* const JXL_RESTRICT to) {
+  JXL_ASSERT(SameSize(from, *to));
+  FromType min_from, max_from;
+  ImageMinMax(from, &min_from, &max_from);
+  const float scale =  // 0 for a constant image: avoids dividing by zero
+      (max_from > min_from) ? to_range / (max_from - min_from) : 0.0f;
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    const FromType* const JXL_RESTRICT row_from = from.Row(y);
+    ToType* const JXL_RESTRICT row_to = to->Row(y);
+    for (size_t x = 0; x < from.xsize(); ++x) {
+      row_to[x] = static_cast<ToType>((row_from[x] - min_from) * scale);
+    }
+  }
+}
+
+template <typename From>
+Plane<float> ConvertToFloat(const Plane<From>& from) {
+  // float/double input is copied as is; other types are divided by their max.
+  const bool is_fp =
+      std::is_same<From, double>::value || std::is_same<From, float>::value;
+  const float factor = is_fp ? 1.0f : 1.0f / std::numeric_limits<From>::max();
+  Plane<float> converted(from.xsize(), from.ysize());
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    const From* const JXL_RESTRICT src = from.Row(y);
+    float* const JXL_RESTRICT dst = converted.Row(y);
+    for (size_t x = 0; x < from.xsize(); ++x) {
+      dst[x] = src[x] * factor;
+    }
+  }
+  return converted;
+}
+
+template <typename T>
+Plane<T> ImageFromPacked(const std::vector<T>& packed, const size_t xsize,
+                         const size_t ysize) {
+  Plane<T> image(xsize, ysize);
+  const size_t row_bytes = xsize * sizeof(T);
+  for (size_t y = 0; y < ysize; ++y) {
+    // `packed` is read row-major without padding: row y starts at y * xsize.
+    memcpy(image.Row(y), &packed[y * xsize], row_bytes);
+  }
+  return image;
+}
+
+// Computes independent minimum and maximum values of each plane over `rect`.
+template <typename T>
+void Image3MinMax(const Image3<T>& image, const Rect& rect,
+                  std::array<T, 3>* out_min, std::array<T, 3>* out_max) {
+  for (size_t c = 0; c < 3; ++c) {
+    T min = std::numeric_limits<T>::max();
+    T max = std::numeric_limits<T>::lowest();  // min() is > 0 for float types
+    for (size_t y = 0; y < rect.ysize(); ++y) {
+      const T* JXL_RESTRICT row = rect.ConstPlaneRow(image, c, y);
+      for (size_t x = 0; x < rect.xsize(); ++x) {
+        min = std::min(min, row[x]);
+        max = std::max(max, row[x]);
+      }
+    }
+    (*out_min)[c] = min;
+    (*out_max)[c] = max;
+  }
+}
+
+// Computes independent minimum and maximum values for each plane.
+template <typename T>
+void Image3MinMax(const Image3<T>& image, std::array<T, 3>* out_min,
+                  std::array<T, 3>* out_max) {
+  Image3MinMax(image, Rect(image), out_min, out_max);  // Rect built from image
+}
+
+template <typename T>
+void Image3Max(const Image3<T>& image, std::array<T, 3>* out_max) {
+  for (size_t c = 0; c < 3; ++c) {  // each plane gets its own maximum
+    T max = std::numeric_limits<T>::lowest();  // min() is > 0 for float types
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        max = std::max(max, row[x]);
+      }
+    }
+    (*out_max)[c] = max;
+  }
+}
+
+// Returns the sum of the pixels of `image` that lie inside `rect`.
+template <typename T>
+T ImageSum(const Plane<T>& image, const Rect& rect) {
+  T total = 0;
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    const T* JXL_RESTRICT pixels = rect.ConstRow(image, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      total += pixels[x];
+    }
+  }
+  return total;
+}
+
+template <typename T>
+T ImageSum(const Plane<T>& image) {
+  return ImageSum(image, Rect(image));  // sum over the Rect built from image
+}
+
+template <typename T>
+std::array<T, 3> Image3Sum(const Image3<T>& image, const Rect& rect) {
+  std::array<T, 3> out_sum = {};  // `= 0` does not compile for std::array
+  for (size_t c = 0; c < 3; ++c) {
+    (out_sum)[c] = ImageSum(image.Plane(c), rect);  // per-plane sum over rect
+  }
+  return out_sum;
+}
+
+template <typename T>
+std::array<T, 3> Image3Sum(const Image3<T>& image) {
+  return Image3Sum(image, Rect(image));  // per-plane sums, Rect built from image
+}
+
+template <typename T>
+std::vector<T> PackedFromImage(const Plane<T>& image, const Rect& rect) {
+  const size_t width = rect.xsize();
+  const size_t height = rect.ysize();
+  std::vector<T> packed(width * height);  // row-major, no padding
+  for (size_t y = 0; y < height; ++y) {
+    memcpy(&packed[y * width], rect.ConstRow(image, y), width * sizeof(T));
+  }
+  return packed;
+}
+
+template <typename T>
+std::vector<T> PackedFromImage(const Plane<T>& image) {
+  return PackedFromImage(image, Rect(image));  // packs the Rect built from image
+}
+
+// Returns the median of the pixels of `image` that lie inside `rect`.
+template <typename T>
+T ImageMedian(const Plane<T>& image, const Rect& rect) {
+  std::vector<T> samples = PackedFromImage(image, rect);
+  return Median(&samples);
+}
+
+template <typename T>
+T ImageMedian(const Plane<T>& image) {
+  return ImageMedian(image, Rect(image));  // median over the Rect built from image
+}
+
+template <typename T>
+std::array<T, 3> Image3Median(const Image3<T>& image, const Rect& rect) {
+  std::array<T, 3> medians;  // every entry is assigned in the loop below
+  for (size_t plane = 0; plane < 3; ++plane) {
+    medians[plane] = ImageMedian(image.Plane(plane), rect);
+  }
+  return medians;
+}
+
+template <typename T>
+std::array<T, 3> Image3Median(const Image3<T>& image) {
+  return Image3Median(image, Rect(image));  // per-plane, Rect built from image
+}
+
+template <typename FromType, typename ToType>
+void Image3Convert(const Image3<FromType>& from, const float to_range,
+                   Image3<ToType>* const JXL_RESTRICT to) {
+  JXL_ASSERT(SameSize(from, *to));
+  std::array<FromType, 3> lo, hi;
+  Image3MinMax(from, &lo, &hi);
+  float per_plane[3];
+  for (size_t c = 0; c < 3; ++c) {
+    per_plane[c] = to_range / (hi[c] - lo[c]);
+  }
+  float scale = std::min(per_plane[0], std::min(per_plane[1], per_plane[2]));
+  for (size_t c = 0; c < 3; ++c) {  // the smallest scale is used for all planes
+    for (size_t y = 0; y < from.ysize(); ++y) {
+      const FromType* JXL_RESTRICT src = from.ConstPlaneRow(c, y);
+      ToType* JXL_RESTRICT dst = to->PlaneRow(c, y);
+      for (size_t x = 0; x < from.xsize(); ++x) {
+        const float scaled = (src[x] - lo[c]) * scale;
+        dst[x] = static_cast<ToType>(scaled);
+      }
+    }
+  }
+}
+
+template <typename From>
+Image3F ConvertToFloat(const Image3<From>& from) {
+  return Image3F(ConvertToFloat(from.Plane(0)), ConvertToFloat(from.Plane(1)),
+                 ConvertToFloat(from.Plane(2)));  // planes converted separately
+}
+
+template <typename Tin, typename Tout>
+void Subtract(const Image3<Tin>& image1, const Image3<Tin>& image2,
+              Image3<Tout>* out) {
+  const size_t width = image1.xsize();
+  const size_t height = image1.ysize();
+  JXL_CHECK(width == image2.xsize());
+  JXL_CHECK(height == image2.ysize());
+  // out = image1 - image2, for each plane and pixel.
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < height; ++y) {
+      const Tin* const JXL_RESTRICT lhs = image1.ConstPlaneRow(c, y);
+      const Tin* const JXL_RESTRICT rhs = image2.ConstPlaneRow(c, y);
+      Tout* const JXL_RESTRICT diff = out->PlaneRow(c, y);
+      for (size_t x = 0; x < width; ++x) {
+        diff[x] = lhs[x] - rhs[x];
+      }
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void SubtractFrom(const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t width = what.xsize();
+  const size_t height = what.ysize();
+  for (size_t c = 0; c < 3; ++c) {  // in-place: to -= what
+    for (size_t y = 0; y < height; ++y) {
+      const Tin* JXL_RESTRICT src = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT dst = to->PlaneRow(c, y);
+      for (size_t x = 0; x < width; ++x) {
+        dst[x] -= src[x];
+      }
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void AddTo(const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t width = what.xsize();
+  const size_t height = what.ysize();
+  for (size_t c = 0; c < 3; ++c) {  // in-place: to += what
+    for (size_t y = 0; y < height; ++y) {
+      const Tin* JXL_RESTRICT src = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT dst = to->PlaneRow(c, y);
+      for (size_t x = 0; x < width; ++x) {
+        dst[x] += src[x];
+      }
+    }
+  }
+}
+
+// Adds `what`, which must have the size of `rect`, to `to` at `rect`.
+template <typename Tin, typename Tout>
+void AddTo(const Rect& rect, const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t width = what.xsize();
+  const size_t height = what.ysize();
+  JXL_ASSERT(width == rect.xsize());
+  JXL_ASSERT(height == rect.ysize());
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < height; ++y) {
+      const Tin* JXL_RESTRICT src = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT dst = rect.PlaneRow(to, c, y);
+      for (size_t x = 0; x < width; ++x) {
+        dst[x] += src[x];
+      }
+    }
+  }
+}
+
+template <typename T>
+Image3<T> ScaleImage(const T lambda, const Image3<T>& image) {
+  Image3<T> scaled(image.xsize(), image.ysize());
+  for (size_t c = 0; c < 3; ++c) {  // scaled = lambda * image
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT src = image.ConstPlaneRow(c, y);
+      T* JXL_RESTRICT dst = scaled.PlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        dst[x] = lambda * src[x];
+      }
+    }
+  }
+  return scaled;
+}
+
+// In-place: multiplies every pixel of all three planes by lambda.
+template <typename T>
+void ScaleImage(const T lambda, Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      T* const JXL_RESTRICT pixels = image->PlaneRow(c, y);
+      for (size_t x = 0; x < image->xsize(); ++x) {
+        pixels[x] = lambda * pixels[x];
+      }
+    }
+  }
+}
+
+// Sets every pixel of all three planes to "value".
+template <typename T>
+void FillImage(const T value, Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      T* JXL_RESTRICT pixels = image->PlaneRow(c, y);
+      for (size_t x = 0; x < image->xsize(); ++x) {
+        pixels[x] = value;
+      }
+    }
+  }
+}
+
+template <typename T>
+void FillPlane(const T value, Plane<T>* image) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* JXL_RESTRICT pixels = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      pixels[x] = value;  // every valid pixel gets the same value
+    }
+  }
+}
+
+template <typename T>
+void FillImage(const T value, Image3<T>* image, Rect rect) {
+  for (size_t c = 0; c < 3; ++c) {  // only pixels inside `rect` are written
+    for (size_t y = 0; y < rect.ysize(); ++y) {
+      T* JXL_RESTRICT pixels = rect.PlaneRow(image, c, y);
+      for (size_t x = 0; x < rect.xsize(); ++x) {
+        pixels[x] = value;
+      }
+    }
+  }
+}
+
+template <typename T>
+void FillPlane(const T value, Plane<T>* image, Rect rect) {
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    T* JXL_RESTRICT pixels = rect.Row(image, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      pixels[x] = value;  // only pixels inside `rect` are written
+    }
+  }
+}
+
+template <typename T>
+void ZeroFillImage(Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      T* JXL_RESTRICT pixels = image->PlaneRow(c, y);
+      if (image->xsize() != 0) memset(pixels, 0, image->xsize() * sizeof(T));
+    }
+  }
+}
+
+template <typename T>
+void ZeroFillPlane(Plane<T>* image, Rect rect) {
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    T* JXL_RESTRICT pixels = rect.Row(image, y);  // row y of `rect` in image
+    memset(pixels, 0, rect.xsize() * sizeof(T));
+  }
+}
+
+// Pads an image with xborder columns on each vertical side and yborder rows
+// above and below, mirroring the image.
+Image3F PadImageMirror(const Image3F& in, size_t xborder, size_t yborder);
+
+// Same as above, but operates in-place. Assumes that the `in` image was
+// allocated large enough.
+void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in,
+                                    size_t block_dim = kBlockDim);
+
+// Downsamples an image by a given factor.
+void DownsampleImage(Image3F* opsin, size_t factor);
+void DownsampleImage(ImageF* image, size_t factor);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_OPS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_ops_test.cc b/third_party/jpeg-xl/lib/jxl/image_ops_test.cc
new file mode 100644
index 0000000000..44c021513d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_ops_test.cc
@@ -0,0 +1,164 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_ops.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <utility>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+template <typename T>
+void TestPacked(const size_t xsize, const size_t ysize) {
+  Plane<T> original(xsize, ysize);
+  RandomFillImage(&original);
+  const std::vector<T>& packed = PackedFromImage(original);
+  const Plane<T>& restored = ImageFromPacked(packed, xsize, ysize);
+  JXL_EXPECT_OK(SamePixels(original, restored, _));  // round trip is lossless
+}
+
+TEST(ImageTest, TestPacked) {
+  TestPacked<uint8_t>(1, 1);  // single pixel
+  TestPacked<uint8_t>(7, 1);  // single row
+  TestPacked<uint8_t>(1, 7);  // single column
+
+  TestPacked<int16_t>(1, 1);
+  TestPacked<int16_t>(7, 1);
+  TestPacked<int16_t>(1, 7);
+
+  TestPacked<uint16_t>(1, 1);
+  TestPacked<uint16_t>(7, 1);
+  TestPacked<uint16_t>(1, 7);
+
+  TestPacked<float>(1, 1);
+  TestPacked<float>(7, 1);
+  TestPacked<float>(1, 7);
+}
+
+// Ensure entire payload is readable/writable for various size/offset combos.
+TEST(ImageTest, TestAllocator) {
+  Rng rng(0);  // fixed seed
+  const size_t k32 = 32;
+  const size_t kAlign = CacheAligned::kAlignment;
+  for (size_t size : {k32 * 1, k32 * 2, k32 * 3, k32 * 4, k32 * 5,
+                      CacheAligned::kAlias, 2 * CacheAligned::kAlias + 4}) {
+    for (size_t offset = 0; offset <= CacheAligned::kAlias; offset += kAlign) {
+      uint8_t* bytes =
+          static_cast<uint8_t*>(CacheAligned::Allocate(size, offset));
+      JXL_CHECK(reinterpret_cast<uintptr_t>(bytes) % kAlign == 0);
+      // Ensure we can write/read the last byte. Use RNG to fool the compiler
+      // into thinking the write is necessary.
+      memset(bytes, 0, size);
+      bytes[size - 1] = 1;                       // greatest element
+      uint32_t pos = rng.UniformU(0, size - 1);  // random but != greatest
+      JXL_CHECK(bytes[pos] < bytes[size - 1]);   // 0 < 1 if both are intact
+
+      CacheAligned::Free(bytes);
+    }
+  }
+}
+
+template <typename T>
+void TestFillImpl(Image3<T>* img, const char* layout) {
+  FillImage(T(1), img);
+  for (size_t y = 0; y < img->ysize(); ++y) {  // y/c/x loop order
+    for (size_t c = 0; c < 3; ++c) {
+      T* JXL_RESTRICT row = img->PlaneRow(c, y);
+      for (size_t x = 0; x < img->xsize(); ++x) {
+        if (row[x] != T(1)) {
+          printf("Not 1 at c=%" PRIuS " %" PRIuS ", %" PRIuS " (%" PRIuS
+                 " x %" PRIuS ") (%s)\n",
+                 c, x, y, img->xsize(), img->ysize(), layout);
+          abort();
+        }
+        row[x] = T(2);  // mark as visited: a second visit would fail the check
+      }
+    }
+  }
+
+  // Same for ZeroFillImage and swapped c/y loop ordering.
+  ZeroFillImage(img);
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < img->ysize(); ++y) {
+      T* JXL_RESTRICT row = img->PlaneRow(c, y);
+      for (size_t x = 0; x < img->xsize(); ++x) {
+        if (row[x] != T(0)) {
+          printf("Not 0 at c=%" PRIuS " %" PRIuS ", %" PRIuS " (%" PRIuS
+                 " x %" PRIuS ") (%s)\n",
+                 c, x, y, img->xsize(), img->ysize(), layout);
+          abort();
+        }
+        row[x] = T(3);  // mark as visited: a second visit would fail the check
+      }
+    }
+  }
+}
+
+template <typename T>
+void TestFillT() {
+  for (uint32_t xsize : {0, 1, 15, 16, 31, 32}) {  // includes empty images
+    for (uint32_t ysize : {0, 1, 15, 16, 31, 32}) {
+      Image3<T> image(xsize, ysize);  // built by the size constructor
+      TestFillImpl(&image, "size ctor");
+
+      Image3<T> planar(Plane<T>(xsize, ysize), Plane<T>(xsize, ysize),
+                       Plane<T>(xsize, ysize));  // built from three planes
+      TestFillImpl(&planar, "planar");
+    }
+  }
+}
+
+// Ensure y/c/x and c/y/x loops visit pixels no more than once.
+TEST(ImageTest, TestFill) {
+  TestFillT<uint8_t>();  // one run per pixel type
+  TestFillT<int16_t>();
+  TestFillT<float>();
+  TestFillT<double>();
+}
+
+TEST(ImageTest, CopyImageToWithPaddingTest) {
+  Plane<uint32_t> src(100, 61);
+  for (size_t y = 0; y < src.ysize(); y++) {
+    for (size_t x = 0; x < src.xsize(); x++) {
+      src.Row(y)[x] = x * 1000 + y;  // each pixel encodes its own coordinates
+    }
+  }
+  Rect src_rect(10, 20, 30, 40);  // x0, y0, xsize, ysize
+  EXPECT_TRUE(src_rect.IsInside(src));
+
+  Plane<uint32_t> dst(60, 50);
+  FillImage(0u, &dst);  // 0 marks pixels the copy must not touch
+  Rect dst_rect(20, 5, 30, 40);
+  EXPECT_TRUE(dst_rect.IsInside(dst));
+
+  CopyImageToWithPadding(src_rect, src, /*padding=*/2, dst_rect, &dst);
+
+  // ysize is + 3 instead of + 4 because we are at the y image boundary on the
+  // source image.
+  Rect padded_dst_rect(20 - 2, 5 - 2, 30 + 4, 40 + 3);
+  for (size_t y = 0; y < dst.ysize(); y++) {
+    for (size_t x = 0; x < dst.xsize(); x++) {
+      if (Rect(x, y, 1, 1).IsInside(padded_dst_rect)) {
+        EXPECT_EQ((x - dst_rect.x0() + src_rect.x0()) * 1000 +
+                      (y - dst_rect.y0() + src_rect.y0()),
+                  dst.Row(y)[x]);  // expected value: the matching source pixel
+      } else {
+        EXPECT_EQ(0u, dst.Row(y)[x]);  // outside the padded rect: untouched
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_test_utils.h b/third_party/jpeg-xl/lib/jxl/image_test_utils.h
new file mode 100644
index 0000000000..e7d72285e6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_test_utils.h
@@ -0,0 +1,257 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_TEST_UTILS_H_
+#define LIB_JXL_IMAGE_TEST_UTILS_H_
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cmath>
+#include <limits>
+#include <sstream>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+bool SamePixels(const Plane<T>& image1, const Plane<T>& image2,
+                std::stringstream& failures) {  // Mismatch text is appended.
+  const Rect rect(image1);
+  JXL_CHECK(SameSize(image1, image2));
+  size_t mismatches = 0;
+  for (size_t y = rect.y0(); y < rect.ysize(); ++y) {
+    const T* const JXL_RESTRICT row1 = image1.Row(y);
+    const T* const JXL_RESTRICT row2 = image2.Row(y);
+    for (size_t x = rect.x0(); x < rect.xsize(); ++x) {
+      if (row1[x] != row2[x]) {
+        failures << "pixel mismatch at " << x << ", " << y << ": "
+                 << double(row1[x]) << " != " << double(row2[x]) << "\n";
+        if (++mismatches > 4) {  // Give up after reporting five mismatches.
+          return false;
+        }
+      }
+    }
+  }
+  return mismatches == 0;
+}
+
+template <typename T>
+bool SamePixels(const Image3<T>& image1, const Image3<T>& image2,
+                std::stringstream& failures) {
+  JXL_CHECK(SameSize(image1, image2));
+  // Compare plane by plane; the first differing plane ends the comparison.
+  bool same = true;
+  for (size_t c = 0; same && c < 3; ++c) {
+    same = SamePixels(image1.Plane(c), image2.Plane(c), failures);
+  }
+  return same;
+}
+
+// Use for floating-point images with fairly large numbers. A pixel is bad when
+// it exceeds threshold_l1 and, unless |expected| < 1E-10, threshold_relative too.
+template <typename T>
+bool VerifyRelativeError(const Plane<T>& expected, const Plane<T>& actual,
+                         const double threshold_l1,
+                         const double threshold_relative,
+                         std::stringstream& failures, const intptr_t border = 0,
+                         const size_t c = 0) {
+  JXL_CHECK(SameSize(expected, actual));
+  const intptr_t xsize = expected.xsize();
+  const intptr_t ysize = expected.ysize();
+
+  // Max over all bad pixels to give a better idea whether there are
+  // systematic errors or just one outlier. Invalid if negative.
+  double max_l1 = -1;
+  double max_relative = -1;
+  bool any_bad = false;
+  for (intptr_t y = border; y < ysize - border; ++y) {
+    const T* const JXL_RESTRICT row_expected = expected.Row(y);
+    const T* const JXL_RESTRICT row_actual = actual.Row(y);
+    for (intptr_t x = border; x < xsize - border; ++x) {
+      const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+      // Cannot compute relative, only check/update L1.
+      if (std::abs(row_expected[x]) < 1E-10) {
+        if (l1 > threshold_l1) {
+          any_bad = true;
+          max_l1 = std::max(max_l1, l1);
+        }
+      } else {
+        const double relative = l1 / std::abs(double(row_expected[x]));
+        if (l1 > threshold_l1 && relative > threshold_relative) {
+          // Fails both tolerances => will exit below, update max_*.
+          any_bad = true;
+          max_l1 = std::max(max_l1, l1);
+          max_relative = std::max(max_relative, relative);
+        }
+      }
+    }
+  }
+  if (!any_bad) {
+    return true;
+  }
+  // Never had a valid relative value, don't print it.
+  if (max_relative < 0) {
+    fprintf(stderr, "c=%" PRIu64 ": max +/- %E exceeds +/- %.2E\n",
+            static_cast<uint64_t>(c), max_l1, threshold_l1);
+  } else {
+    fprintf(stderr,
+            "c=%" PRIu64 ": max +/- %E, x %E exceeds +/- %.2E, x %.2E\n",
+            static_cast<uint64_t>(c), max_l1, max_relative, threshold_l1,
+            threshold_relative);
+  }
+  // Dump the expected image and actual image if the region is small enough.
+  const intptr_t kMaxTestDumpSize = 16;
+  if (xsize <= kMaxTestDumpSize + 2 * border &&
+      ysize <= kMaxTestDumpSize + 2 * border) {
+    fprintf(stderr, "Expected image:\n");
+    for (intptr_t y = border; y < ysize - border; ++y) {
+      const T* const JXL_RESTRICT row_expected = expected.Row(y);
+      for (intptr_t x = border; x < xsize - border; ++x) {
+        fprintf(stderr, "%10lf ", static_cast<double>(row_expected[x]));
+      }
+      fprintf(stderr, "\n");
+    }
+
+    fprintf(stderr, "Actual image:\n");
+    for (intptr_t y = border; y < ysize - border; ++y) {
+      const T* const JXL_RESTRICT row_expected = expected.Row(y);
+      const T* const JXL_RESTRICT row_actual = actual.Row(y);
+      for (intptr_t x = border; x < xsize - border; ++x) {
+        const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+        bool bad = l1 > threshold_l1;
+        if (std::abs(row_expected[x]) >= 1E-10) {  // Same rule as the scan above.
+          const double relative = l1 / std::abs(double(row_expected[x]));
+          bad &= relative > threshold_relative;
+        }
+        if (bad) {
+          fprintf(stderr, "%10lf ", static_cast<double>(row_actual[x]));
+        } else {
+          fprintf(stderr, "%10s ", "==");
+        }
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  // Find first failing x for further debugging.
+  for (intptr_t y = border; y < ysize - border; ++y) {
+    const T* const JXL_RESTRICT row_expected = expected.Row(y);
+    const T* const JXL_RESTRICT row_actual = actual.Row(y);
+
+    for (intptr_t x = border; x < xsize - border; ++x) {
+      const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+      bool bad = l1 > threshold_l1;
+      if (std::abs(row_expected[x]) >= 1E-10) {  // Same rule as the scan above.
+        const double relative = l1 / std::abs(double(row_expected[x]));
+        bad &= relative > threshold_relative;
+      }
+      if (bad) {
+        failures << x << ", " << y << " (" << expected.xsize() << " x "
+                 << expected.ysize() << ") expected "
+                 << static_cast<double>(row_expected[x]) << " actual "
+                 << static_cast<double>(row_actual[x]);
+        return false;
+      }
+    }
+  }
+  return false;
+}
+
+template <typename T>
+bool VerifyRelativeError(const Image3<T>& expected, const Image3<T>& actual,
+                         const float threshold_l1,
+                         const float threshold_relative,
+                         std::stringstream& failures,
+                         const intptr_t border = 0) {
+  // Planes are verified in order; the first failing one ends the check.
+  for (size_t channel = 0; channel < 3; ++channel) {
+    if (!VerifyRelativeError(expected.Plane(channel), actual.Plane(channel),
+                             threshold_l1, threshold_relative, failures, border,
+                             channel)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T, typename U = T>
+void GenerateImage(Rng& rng, Plane<T>* image, U begin, U end) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {  // Sampler is picked from T:
+      if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
+        row[x] = rng.UniformF(begin, end);  // float / double
+      } else if (std::is_signed<T>::value) {
+        row[x] = rng.UniformI(begin, end);  // remaining signed types
+      } else {
+        row[x] = rng.UniformU(begin, end);  // unsigned types
+      }
+    }
+  }
+}
+
+template <typename T>
+void RandomFillImage(Plane<T>* image, const T begin, const T end,
+                     const int seed = 129) {  // 129: same seed as the overloads.
+  Rng rng(seed);
+  GenerateImage(rng, image, begin, end);  // Bounds are forwarded unchanged.
+}
+
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value>::type RandomFillImage(
+    Plane<T>* image) {  // Integral T only (enable_if); bounds are 0 and max+1.
+  Rng rng(129);
+  GenerateImage(rng, image, int64_t(0),
+                int64_t(std::numeric_limits<T>::max()) + 1);  // Widened first.
+}
+
+JXL_INLINE void RandomFillImage(Plane<float>* image) {  // Float-plane overload.
+  Rng rng(129);
+  GenerateImage(rng, image, 0.0f, std::numeric_limits<float>::max());
+}
+
+template <typename T, typename U>
+void GenerateImage(Rng& rng, Image3<T>* image, U begin, U end) {
+  for (size_t c = 0; c < 3; ++c) {  // One rng instance serves all three planes.
+    GenerateImage(rng, &image->Plane(c), begin, end);
+  }
+}
+
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value>::type RandomFillImage(
+    Image3<T>* image) {  // Integral T only (enable_if); bounds are 0 and max+1.
+  Rng rng(129);
+  GenerateImage(rng, image, int64_t(0),
+                int64_t(std::numeric_limits<T>::max()) + 1);  // Widened first.
+}
+
+JXL_INLINE void RandomFillImage(Image3F* image) {  // Float three-plane overload.
+  Rng rng(129);
+  GenerateImage(rng, image, 0.0f, std::numeric_limits<float>::max());
+}
+
+template <typename T, typename U>
+void RandomFillImage(Image3<T>* image, const U begin, const U end,
+                     const int seed = 129) {  // 129: same seed as the overloads.
+  Rng rng(seed);
+  GenerateImage(rng, image, begin, end);  // Bounds are forwarded unchanged.
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_TEST_UTILS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h b/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h
new file mode 100644
index 0000000000..fcb01d7396
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// SIMDified inverse-move-to-front transform.
+
+#if defined(LIB_JXL_INVERSE_MTF_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_INVERSE_MTF_INL_H_
+#undef LIB_JXL_INVERSE_MTF_INL_H_
+#else
+#define LIB_JXL_INVERSE_MTF_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::FirstN;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Load;
+using hwy::HWY_NAMESPACE::LoadU;
+using hwy::HWY_NAMESPACE::StoreU;
+
+inline void MoveToFront(uint8_t* v, uint8_t index) {  // v[index] moves to v[0].
+  uint8_t value = v[index];
+  uint8_t i = index;
+  if (i < 4) {
+    for (; i; --i) v[i] = v[i - 1];  // Short distance: scalar shift right by 1.
+  } else {
+    const HWY_CAPPED(uint8_t, 64) d;  // NOTE(review): Load() presumably needs v aligned.
+    int tail = i & (Lanes(d) - 1);  // Elements past the last whole vector.
+    if (tail) {
+      i -= tail;
+      const auto vec = Load(d, v + i);
+      const auto prev = LoadU(d, v + i + 1);
+      StoreU(IfThenElse(FirstN(d, tail), vec, prev), d, v + i + 1);  // Only |tail| lanes change.
+    }
+    while (i) {  // Remaining whole vectors, each stored one element further.
+      i -= Lanes(d);
+      const auto vec = Load(d, v + i);
+      StoreU(vec, d, v + i + 1);
+    }
+  }
+  v[0] = value;
+}
+
+inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
+  HWY_ALIGN uint8_t mtf[256 + 64];  // 64 spare bytes beyond the 256 symbols.
+  int i;
+  for (i = 0; i < 256; ++i) {
+    mtf[i] = static_cast<uint8_t>(i);  // Start from the identity ordering.
+  }
+#if JXL_MEMORY_SANITIZER
+  const HWY_CAPPED(uint8_t, 64) d;
+  for (size_t j = 0; j < Lanes(d); ++j) {
+    mtf[256 + j] = 0;  // Spare bytes are initialized in MSan builds only.
+  }
+#endif  // JXL_MEMORY_SANITIZER
+  for (i = 0; i < v_len; ++i) {
+    uint8_t index = v[i];
+    v[i] = mtf[index];  // In place: replace the index by the symbol stored there.
+    if (index) MoveToFront(mtf, index);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_INVERSE_MTF_INL_H_
+
+#if HWY_ONCE
+#ifndef INVERSE_MTF_ONCE
+#define INVERSE_MTF_ONCE
+
+namespace jxl {
+inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) {  // jxl:: entry.
+  return HWY_STATIC_DISPATCH(InverseMoveToFrontTransform)(v, v_len);
+}
+}  // namespace jxl
+
+#endif  // INVERSE_MTF_ONCE
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc
new file mode 100644
index 0000000000..db49a1c215
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc
@@ -0,0 +1,145 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+
+#include <brotli/decode.h>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace jpeg {
+Status DecodeJPEGData(Span<const uint8_t> encoded, JPEGData* jpeg_data) {
+  Status ret = true;
+  const uint8_t* in = encoded.data();
+  size_t available_in = encoded.size();
+  {
+    BitReader br(encoded);
+    BitReaderScopedCloser br_closer(&br, &ret);
+    JXL_RETURN_IF_ERROR(Bundle::Read(&br, jpeg_data));
+    JXL_RETURN_IF_ERROR(br.JumpToByteBoundary());
+    in += br.TotalBitsConsumed() / 8;
+    available_in -= br.TotalBitsConsumed() / 8;
+  }
+  JXL_RETURN_IF_ERROR(ret);
+  // The bytes after the bundle are decoded below as a single brotli stream.
+  BrotliDecoderState* brotli_dec =
+      BrotliDecoderCreateInstance(nullptr, nullptr, nullptr);
+  if (!brotli_dec) return JXL_FAILURE("Failed to create brotli decoder");
+  struct BrotliDecDeleter {
+    BrotliDecoderState* brotli_dec;
+    ~BrotliDecDeleter() { BrotliDecoderDestroyInstance(brotli_dec); }
+  } brotli_dec_deleter{brotli_dec};
+
+  BrotliDecoderResult result =
+      BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS;
+  // Fills |data| completely with the next data.size() decompressed bytes.
+  auto br_read = [&](std::vector<uint8_t>& data) -> Status {
+    size_t available_out = data.size();
+    uint8_t* out = data.data();
+    while (available_out != 0) {
+      if (BrotliDecoderIsFinished(brotli_dec)) {
+        return JXL_FAILURE("Not enough decompressed output");
+      }
+      uint8_t* next_out_before = out;
+      size_t avail_out_before = available_out;
+      msan::MemoryIsInitialized(in, available_in);
+      result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in,
+                                             &available_out, &out, nullptr);
+      if (result !=
+              BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT &&
+          result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) {
+        return JXL_FAILURE(
+            "Brotli decoding error: %s\n",
+            BrotliDecoderErrorString(BrotliDecoderGetErrorCode(brotli_dec)));
+      }
+      msan::UnpoisonMemory(next_out_before, avail_out_before - available_out);
+    }
+    return true;
+  };
+  size_t num_icc = 0;
+  for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
+    auto& marker = jpeg_data->app_data[i];
+    if (jpeg_data->app_marker_type[i] != AppMarkerType::kUnknown) {
+      // Set the size of the marker. NOTE(review): assumes marker.size() >= 3.
+      size_t size_minus_1 = marker.size() - 1;
+      marker[1] = size_minus_1 >> 8;
+      marker[2] = size_minus_1 & 0xFF;
+      if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) {
+        if (marker.size() < 17) {
+          return JXL_FAILURE("ICC markers must be at least 17 bytes");
+        }
+        marker[0] = 0xE2;
+        memcpy(&marker[3], kIccProfileTag, sizeof kIccProfileTag);
+        marker[15] = ++num_icc;  // 1-based index of this ICC chunk.
+      }
+    } else {
+      JXL_RETURN_IF_ERROR(br_read(marker));  // Unknown type: payload is stored.
+      if (marker[1] * 256u + marker[2] + 1u != marker.size()) {
+        return JXL_FAILURE("Incorrect marker size");
+      }
+    }
+  }
+  for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
+    auto& marker = jpeg_data->app_data[i];
+    if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) {
+      marker[16] = num_icc;  // Total ICC chunk count, known only after pass 1.
+    }
+    if (jpeg_data->app_marker_type[i] == AppMarkerType::kExif) {
+      marker[0] = 0xE1;
+      if (marker.size() < 3 + sizeof kExifTag) {
+        return JXL_FAILURE("Incorrect Exif marker size");
+      }
+      memcpy(&marker[3], kExifTag, sizeof kExifTag);
+    }
+    if (jpeg_data->app_marker_type[i] == AppMarkerType::kXMP) {
+      marker[0] = 0xE1;
+      if (marker.size() < 3 + sizeof kXMPTag) {
+        return JXL_FAILURE("Incorrect XMP marker size");
+      }
+      memcpy(&marker[3], kXMPTag, sizeof kXMPTag);
+    }
+  }
+  // TODO(eustas): actually inject ICC profile and check it fits perfectly.
+  for (size_t i = 0; i < jpeg_data->com_data.size(); i++) {
+    auto& marker = jpeg_data->com_data[i];
+    JXL_RETURN_IF_ERROR(br_read(marker));
+    if (marker[1] * 256u + marker[2] + 1u != marker.size()) {
+      return JXL_FAILURE("Incorrect marker size");
+    }
+  }
+  for (size_t i = 0; i < jpeg_data->inter_marker_data.size(); i++) {
+    JXL_RETURN_IF_ERROR(br_read(jpeg_data->inter_marker_data[i]));
+  }
+  JXL_RETURN_IF_ERROR(br_read(jpeg_data->tail_data));
+
+  // Check if there is more decompressed output.
+  size_t available_out = 1;
+  uint64_t dummy;
+  uint8_t* next_out = reinterpret_cast<uint8_t*>(&dummy);
+  result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in,
+                                         &available_out, &next_out, nullptr);
+  if (available_out == 0 ||
+      result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
+    return JXL_FAILURE("Excess data in compressed stream");
+  }
+  if (result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) {
+    return JXL_FAILURE("Incomplete brotli-stream");
+  }
+  if (!BrotliDecoderIsFinished(brotli_dec) ||
+      result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) {
+    return JXL_FAILURE("Corrupted brotli-stream");
+  }
+  if (available_in != 0) {
+    return JXL_FAILURE("Unused data after brotli stream");
+  }
+
+  return true;
+}
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h
new file mode 100644
index 0000000000..b9d50bf9f8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h
@@ -0,0 +1,19 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_H_
+#define LIB_JXL_JPEG_DEC_JPEG_DATA_H_
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+Status DecodeJPEGData(Span<const uint8_t> encoded, JPEGData* jpeg_data);
+}
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_DATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc
new file mode 100644
index 0000000000..f9ae755789
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc
@@ -0,0 +1,1050 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+
+#include <stdlib.h>
+#include <string.h> /* for memset, memcpy */
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+namespace {
+
+enum struct SerializationStatus {
+  NEEDS_MORE_INPUT,
+  NEEDS_MORE_OUTPUT,
+  ERROR,
+  DONE
+};
+
+const int kJpegPrecision = 8;
+
+// JpegBitWriter: buffer size
+const size_t kJpegBitWriterChunkSize = 16384;
+
+// DCTCodingState: maximum number of correction bits to buffer
+const int kJPEGMaxCorrectionBits = 1u << 16;
+
+// Returns non-zero if and only if at least one of the eight bytes of x is 0x00,
+// i.e. one of x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero.
+static JXL_INLINE uint64_t HasZeroByte(uint64_t x) {
+  return ~x & (x - 0x0101010101010101ULL) & (0x0101010101010101ULL << 7);
+}
+
+void JpegBitWriterInit(JpegBitWriter* bw,
+                       std::deque<OutputChunk>* output_queue) {
+  bw->healthy = true;
+  bw->output = output_queue;
+  bw->put_buffer = 0;
+  bw->put_bits = 64;  // Free bits in put_buffer: nothing buffered yet.
+  bw->chunk = OutputChunk(kJpegBitWriterChunkSize);
+  bw->data = bw->chunk.buffer->data();
+  bw->pos = 0;
+}
+
+static JXL_NOINLINE void SwapBuffer(JpegBitWriter* bw) {
+  bw->chunk.len = bw->pos;  // Record how much of the chunk was filled.
+  bw->output->emplace_back(std::move(bw->chunk));
+  bw->chunk = OutputChunk(kJpegBitWriterChunkSize);  // Fresh chunk to write to.
+  bw->data = bw->chunk.buffer->data();
+  bw->pos = 0;
+}
+
+static JXL_INLINE void Reserve(JpegBitWriter* bw, size_t n_bytes) {
+  // Switch to a fresh chunk when the current one cannot take n_bytes more.
+  const bool fits = (bw->pos + n_bytes) <= kJpegBitWriterChunkSize;
+  if (JXL_UNLIKELY(!fits)) SwapBuffer(bw);
+}
+
+/**
+ * Appends |byte| to the output; a 0xFF is followed by an extra zero byte.
+ * This method is "careless" - caller must make sure that there is enough
+ * space in the output buffer. Emits up to 2 bytes to buffer.
+ */
+static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) {
+  bw->data[bw->pos++] = byte;
+  if (byte != 0xFF) return;
+  bw->data[bw->pos++] = 0;
+}
+
+static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
+  // Emits the 6 most significant bytes of put_buffer and keeps the low 16 bits.
+  // The JPEG format requires that after every 0xff byte in the entropy
+  // coded section, there is a zero byte, therefore we first check if any of
+  // the 6 most significant bytes of put_buffer is 0xFF. 12 bytes are reserved
+  // because each of the 6 bytes may be followed by a stuffed zero.
+  Reserve(bw, 12);
+  if (HasZeroByte(~bw->put_buffer | 0xFFFF)) {  // Low 2 bytes are masked out.
+    // We have a 0xFF byte somewhere, examine each byte and append a zero
+    // byte if necessary.
+    EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 48) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 40) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 32) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 24) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 16) & 0xFF);
+  } else {
+    // We don't have any 0xFF bytes, output all 6 bytes without checking.
+    bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF;
+    bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF;
+    bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF;
+    bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF;
+    bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF;
+    bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF;
+    bw->pos += 6;
+  }
+  bw->put_buffer <<= 48;  // The remaining 16 bits move to the top.
+  bw->put_bits += 48;
+}
+
+static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) {
+  // This is an optimization; if everything goes well,
+  // then |nbits| is positive; if non-existing Huffman symbol is going to be
+  // encoded, its length should be zero; later encoder could check the
+  // "health" of JpegBitWriter.
+  if (nbits == 0) {
+    bw->healthy = false;
+    return;
+  }
+  bw->put_bits -= nbits;  // put_bits counts the free low bits of put_buffer.
+  bw->put_buffer |= (bits << bw->put_bits);
+  if (bw->put_bits <= 16) DischargeBitBuffer(bw);  // Drain when nearly full.
+}
+
+void EmitMarker(JpegBitWriter* bw, int marker) {  // Writes 0xFF, then |marker|.
+  Reserve(bw, 2);
+  JXL_DASSERT(marker != 0xFF);
+  bw->data[bw->pos++] = 0xFF;
+  bw->data[bw->pos++] = marker;  // Written directly: no zero stuffing here.
+}
+
+bool JumpToByteBoundary(JpegBitWriter* bw, const uint8_t** pad_bits,
+                        const uint8_t* pad_bits_end) {
+  size_t n_bits = bw->put_bits & 7u;  // Free bits in the last partial byte.
+  uint8_t pad_pattern;
+  if (*pad_bits == nullptr) {
+    pad_pattern = (1u << n_bits) - 1;  // No recorded padding: pad with ones.
+  } else {
+    pad_pattern = 0;
+    const uint8_t* src = *pad_bits;
+    // TODO(eustas): bitwise reading looks insanely ineffective...
+    while (n_bits--) {
+      pad_pattern <<= 1;
+      if (src >= pad_bits_end) return false;  // Ran out of recorded pad bits.
+      // TODO(eustas): DCHECK *src == {0, 1}
+      pad_pattern |= !!*(src++);
+    }
+    *pad_bits = src;  // Advance the caller's cursor past the consumed bits.
+  }
+  // Up to 8 bytes leave put_buffer below, each possibly with a stuffed zero.
+  Reserve(bw, 16);
+  // Whole bytes are flushed first, then the partial byte with the padding.
+  while (bw->put_bits <= 56) {
+    int c = (bw->put_buffer >> 56) & 0xFF;
+    EmitByte(bw, c);
+    bw->put_buffer <<= 8;
+    bw->put_bits += 8;
+  }
+  if (bw->put_bits < 64) {
+    int pad_mask = 0xFFu >> (64 - bw->put_bits);  // Covers the free low bits.
+    int c = ((bw->put_buffer >> 56) & ~pad_mask) | pad_pattern;
+    EmitByte(bw, c);
+  }
+  bw->put_buffer = 0;
+  bw->put_bits = 64;
+
+  return true;
+}
+
+void JpegBitWriterFinish(JpegBitWriter* bw) {
+  if (bw->pos == 0) return;  // Nothing pending in the current chunk.
+  bw->chunk.len = bw->pos;
+  bw->output->emplace_back(std::move(bw->chunk));
+  bw->chunk = OutputChunk(nullptr, 0);  // Leave the writer without a buffer.
+  bw->data = nullptr;
+  bw->pos = 0;
+}
+
+void DCTCodingStateInit(DCTCodingState* s) {  // Resets |s| to an empty state.
+  s->eob_run_ = 0;
+  s->cur_ac_huff_ = nullptr;
+  s->refinement_bits_.clear();
+  s->refinement_bits_.reserve(kJPEGMaxCorrectionBits);  // Preallocate once.
+}
+
+enum OutputModes {
+  kModeHistogram,
+  kModeWrite,
+};
+
+template <int kOutputMode>
+static JXL_INLINE void WriteSymbol(int symbol, HuffmanCodeTable* table,
+                                   JpegBitWriter* bw) {
+  if (kOutputMode == OutputModes::kModeHistogram) {
+    ++table->depth[symbol];  // Histogram mode: depth[] is used as a counter.
+  } else {
+    WriteBits(bw, table->depth[symbol], table->code[symbol]);
+  }
+}
+
+// Emit all buffered data to the bit stream using the given Huffman code and
+// bit writer: first the pending end-of-band run, then the refinement bits.
+template <int kOutputMode>
+static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) {
+  if (s->eob_run_ > 0) {
+    int nbits = FloorLog2Nonzero<uint32_t>(s->eob_run_);
+    int symbol = nbits << 4u;  // Run-size category goes in the high nibble.
+    WriteSymbol<kOutputMode>(symbol, s->cur_ac_huff_, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, s->eob_run_ & ((1 << nbits) - 1));
+    }
+    s->eob_run_ = 0;
+  }
+  for (size_t i = 0; i < s->refinement_bits_.size(); ++i) {
+    WriteBits(bw, 1, s->refinement_bits_[i]);  // NOTE(review): not gated on kOutputMode.
+  }
+  s->refinement_bits_.clear();
+}
+
+// Buffer some more data at the end-of-band (the last non-zero or newly
+// non-zero coefficient within the [Ss, Se] spectral band).
+template <int kOutputMode>
+static JXL_INLINE void BufferEndOfBand(DCTCodingState* s,
+                                       HuffmanCodeTable* ac_huff,
+                                       const std::vector<int>* new_bits,
+                                       JpegBitWriter* bw) {
+  if (s->eob_run_ == 0) {
+    s->cur_ac_huff_ = ac_huff;  // The table is latched when a new run starts.
+  }
+  ++s->eob_run_;
+  if (new_bits) {  // nullptr is accepted and means nothing to append.
+    s->refinement_bits_.insert(s->refinement_bits_.end(), new_bits->begin(),
+                               new_bits->end());
+  }
+  if (s->eob_run_ == 0x7FFF ||  // Flush at the run cap or when nearly full.
+      s->refinement_bits_.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) {
+    Flush<kOutputMode>(s, bw);
+  }
+}
+
+bool BuildHuffmanCodeTable(const JPEGHuffmanCode& huff,
+                           HuffmanCodeTable* table) {
+  int huff_code[kJpegHuffmanAlphabetSize];
+  // +1 for a sentinel element.
+  uint32_t huff_size[kJpegHuffmanAlphabetSize + 1];
+  int p = 0;  // Number of code lengths written to huff_size so far.
+  for (size_t l = 1; l <= kJpegHuffmanMaxBitLength; ++l) {
+    int i = huff.counts[l];
+    if (p + i > kJpegHuffmanAlphabetSize + 1) {  // Would overflow huff_size.
+      return false;
+    }
+    while (i--) huff_size[p++] = l;  // One entry per code, shortest first.
+  }
+
+  if (p == 0) {  // No codes at all: |table| is left untouched.
+    return true;
+  }
+
+  // Reuse sentinel element.
+  int last_p = p - 1;
+  huff_size[last_p] = 0;
+  // Consecutive codes within a length; one left shift per extra bit of length.
+  int code = 0;
+  uint32_t si = huff_size[0];
+  p = 0;
+  while (huff_size[p]) {  // The zero written at last_p terminates the scan.
+    while ((huff_size[p]) == si) {
+      huff_code[p++] = code;
+      code++;
+    }
+    code <<= 1;
+    si++;
+  }
+  for (p = 0; p < last_p; p++) {  // The entry at last_p is not stored.
+    int i = huff.values[p];
+    table->depth[i] = huff_size[p];
+    table->code[i] = huff_code[p];
+  }
+  return true;
+}
+
+bool EncodeSOI(SerializationState* state) {  // Queues 0xFF 0xD8; always true.
+  state->output_queue.push_back(OutputChunk({0xFF, 0xD8}));
+  return true;
+}
+
+bool EncodeEOI(const JPEGData& jpg, SerializationState* state) {
+  state->output_queue.push_back(OutputChunk({0xFF, 0xD9}));
+  state->output_queue.emplace_back(jpg.tail_data);  // Queued after the marker.
+  return true;
+}
+
+bool EncodeSOF(const JPEGData& jpg, uint8_t marker, SerializationState* state) {
+  if (marker <= 0xC2) state->is_progressive = (marker == 0xC2);
+  // Segment length: 8 fixed bytes plus 3 per component; +2 for 0xFF and marker.
+  const size_t n_comps = jpg.components.size();
+  const size_t marker_len = 8 + 3 * n_comps;
+  state->output_queue.emplace_back(marker_len + 2);
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = marker;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = kJpegPrecision;
+  data[pos++] = jpg.height >> 8u;
+  data[pos++] = jpg.height & 0xFFu;
+  data[pos++] = jpg.width >> 8u;
+  data[pos++] = jpg.width & 0xFFu;
+  data[pos++] = n_comps;
+  for (size_t i = 0; i < n_comps; ++i) {
+    data[pos++] = jpg.components[i].id;
+    data[pos++] = ((jpg.components[i].h_samp_factor << 4u) |
+                   (jpg.components[i].v_samp_factor));
+    const size_t quant_idx = jpg.components[i].quant_idx;
+    if (quant_idx >= jpg.quant.size()) return false;  // Chunk stays queued.
+    data[pos++] = jpg.quant[quant_idx].index;
+  }
+  return true;
+}
+
+bool EncodeSOS(const JPEGData& jpg, const JPEGScanInfo& scan_info,
+               SerializationState* state) {
+  const size_t n_scans = scan_info.num_components;  // Components in this scan.
+  const size_t marker_len = 6 + 2 * n_scans;  // 6 fixed bytes + 2 per component.
+  state->output_queue.emplace_back(marker_len + 2);  // +2: 0xFF and 0xDA.
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDA;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = n_scans;
+  for (size_t i = 0; i < n_scans; ++i) {
+    const JPEGComponentScanInfo& si = scan_info.components[i];
+    if (si.comp_idx >= jpg.components.size()) return false;  // Chunk stays queued.
+    data[pos++] = jpg.components[si.comp_idx].id;
+    data[pos++] = (si.dc_tbl_idx << 4u) + si.ac_tbl_idx;
+  }
+  data[pos++] = scan_info.Ss;
+  data[pos++] = scan_info.Se;
+  data[pos++] = ((scan_info.Ah << 4u) | (scan_info.Al));
+  return true;
+}
+
+bool EncodeDHT(const JPEGData& jpg, SerializationState* state) {
+  const std::vector<JPEGHuffmanCode>& huffman_code = jpg.huffman_code;
+  // First pass: size the segment for all tables up to the one marked is_last.
+  size_t marker_len = 2;
+  for (size_t i = state->dht_index; i < huffman_code.size(); ++i) {
+    const JPEGHuffmanCode& huff = huffman_code[i];
+    marker_len += kJpegHuffmanMaxBitLength;
+    for (size_t j = 0; j < huff.counts.size(); ++j) {
+      marker_len += huff.counts[j];
+    }
+    if (huff.is_last) break;
+  }
+  state->output_queue.emplace_back(marker_len + 2);
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xC4;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  while (true) {
+    const size_t huffman_code_index = state->dht_index++;
+    if (huffman_code_index >= huffman_code.size()) {
+      return false;
+    }
+    const JPEGHuffmanCode& huff = huffman_code[huffman_code_index];
+    size_t index = huff.slot_id;
+    HuffmanCodeTable* huff_table;
+    if (index & 0x10) {  // Bit 4 of the slot id selects the AC tables.
+      index -= 0x10;
+      huff_table = &state->ac_huff_table[index];
+    } else {
+      huff_table = &state->dc_huff_table[index];
+    }
+    // TODO(eustas): cache
+    // TODO(eustas): set up non-existing symbols
+    if (!BuildHuffmanCodeTable(huff, huff_table)) {
+      return false;
+    }
+    size_t total_count = 0;
+    size_t max_length = 0;
+    for (size_t i = 0; i < huff.counts.size(); ++i) {
+      if (huff.counts[i] != 0) max_length = i;
+      total_count += huff.counts[i];
+    }
+    // Drop the unserialized sentinel; an empty table would wrap around.
+    if (total_count == 0) return false;
+    --total_count;
+    data[pos++] = huff.slot_id;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      data[pos++] = (i == max_length ? huff.counts[i] - 1 : huff.counts[i]);
+    }
+    for (size_t i = 0; i < total_count; ++i) {
+      data[pos++] = huff.values[i];
+    }
+    if (huff.is_last) break;
+  }
+  return true;
+}
+
+bool EncodeDQT(const JPEGData& jpg, SerializationState* state) {
+  int marker_len = 2;  // First pass sizes the segment up to the is_last table.
+  for (size_t i = state->dqt_index; i < jpg.quant.size(); ++i) {
+    const JPEGQuantTable& table = jpg.quant[i];
+    marker_len += 1 + (table.precision ? 2 : 1) * kDCTBlockSize;
+    if (table.is_last) break;
+  }
+  state->output_queue.emplace_back(marker_len + 2);  // +2: 0xFF and 0xDB.
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDB;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  while (true) {
+    const size_t idx = state->dqt_index++;
+    if (idx >= jpg.quant.size()) {
+      return false;  // corrupt input
+    }
+    const JPEGQuantTable& table = jpg.quant[idx];
+    data[pos++] = (table.precision << 4u) + table.index;
+    for (size_t i = 0; i < kDCTBlockSize; ++i) {
+      int val_idx = kJPEGNaturalOrder[i];  // Values are emitted in this order.
+      int val = table.values[val_idx];
+      if (table.precision) {  // Non-zero precision: two bytes, high byte first.
+        data[pos++] = val >> 8u;
+      }
+      data[pos++] = val & 0xFFu;
+    }
+    if (table.is_last) break;
+  }
+  return true;
+}
+
+bool EncodeDRI(const JPEGData& jpg, SerializationState* state) {
+  // Emits the 6-byte DRI segment: marker 0xFF 0xDD, length bytes 0x00 0x04,
+  // then the low 16 bits of jpg.restart_interval, high byte first.
+  const uint8_t hi = static_cast<uint8_t>(jpg.restart_interval >> 8);
+  const uint8_t lo = static_cast<uint8_t>(jpg.restart_interval & 0xFF);
+  // From here on scans honour jpg.restart_interval (see DoEncodeScan).
+  state->seen_dri_marker = true;
+  state->output_queue.push_back(
+      OutputChunk({0xFF, 0xDD, 0, 4, hi, lo}));
+  return true;
+}
+
+bool EncodeRestart(uint8_t marker, SerializationState* state) {
+  state->output_queue.emplace_back(OutputChunk{0xFF, marker});  // bare marker
+  return true;  // Queuing the two marker bytes cannot fail.
+}
+
+bool EncodeAPP(const JPEGData& jpg, uint8_t marker, SerializationState* state) {
+  // TODO(eustas): check that marker corresponds to payload?
+  static_cast<void>(marker);
+  // Payloads are consumed in order; the cursor advances even on failure.
+  const size_t current = state->app_index++;
+  if (current >= jpg.app_data.size()) return false;
+  state->output_queue.emplace_back(OutputChunk{0xFF});  // marker prefix byte
+  state->output_queue.emplace_back(jpg.app_data[current]);  // non-owning view
+  return true;
+}
+
+bool EncodeCOM(const JPEGData& jpg, SerializationState* state) {
+  const size_t current = state->com_index++;  // cursor moves even on failure
+  if (current >= jpg.com_data.size()) return false;  // no comment payload left
+  state->output_queue.emplace_back(OutputChunk{0xFF});  // marker prefix byte
+  state->output_queue.emplace_back(jpg.com_data[current]);  // non-owning view
+  return true;
+}
+
+bool EncodeInterMarkerData(const JPEGData& jpg, SerializationState* state) {
+  const auto& blobs = jpg.inter_marker_data;  // queued as non-owning views
+  const size_t current = state->data_index++;  // cursor moves even on failure
+  if (current < blobs.size()) state->output_queue.emplace_back(blobs[current]);
+  return current < blobs.size();
+}
+
+template <int kOutputMode>  // Encodes one block of a sequential scan; false = uncodable value.
+bool EncodeDCTBlockSequential(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                              HuffmanCodeTable* ac_huff, int num_zero_runs,
+                              coeff_t* last_dc_coeff, JpegBitWriter* bw) {
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = coeffs[0];
+  temp = temp2 - *last_dc_coeff;  // DC is coded as the delta to the previous DC
+  *last_dc_coeff = temp2;  // the predictor is updated before any validation
+  temp2 = temp;
+  if (temp < 0) {
+    temp = -temp;
+    if (temp < 0) return false;  // negation wrapped: magnitude does not fit coeff_t
+    temp2--;  // negative deltas emit the low bits of (delta - 1)
+  }
+  int dc_nbits = (temp == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp) + 1);  // bit length of |delta|
+  WriteSymbol<kOutputMode>(dc_nbits, dc_huff, bw);
+  if (dc_nbits >= 12) return false;  // NOTE: rejected only after the symbol was written
+  if (dc_nbits > 0) {
+    WriteBits(bw, dc_nbits, temp2 & ((1u << dc_nbits) - 1));
+  }
+  int r = 0;  // zero coefficients seen since the last non-zero one
+  for (int k = 1; k < 64; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // negation wrapped, as for the DC delta
+      temp2 = ~temp;  // negative values emit the low bits of the complement
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {  // runs longer than 15 are split off as 0xf0 symbols
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int ac_nbits = FloorLog2Nonzero<uint32_t>(temp) + 1;
+    if (ac_nbits >= 16) return false;
+    int symbol = (r << 4u) + ac_nbits;  // high nibble: zero run, low nibble: bit count
+    WriteSymbol<kOutputMode>(symbol, ac_huff, bw);
+    WriteBits(bw, ac_nbits, temp2 & ((1 << ac_nbits) - 1));
+    r = 0;
+  }
+  for (int i = 0; i < num_zero_runs; ++i) {  // extra 0xf0 symbols requested by the caller
+    WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+    r -= 16;  // each one accounts for 16 of the trailing zeros
+  }
+  if (r > 0) {
+    WriteSymbol<kOutputMode>(0, ac_huff, bw);  // symbol 0 closes a block that ends in zeros
+  }
+  return true;
+}
+
+template <int kOutputMode>  // Encodes one block of a progressive scan over the band [Ss, Se].
+bool EncodeDCTBlockProgressive(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                               HuffmanCodeTable* ac_huff, int Ss, int Se,
+                               int Al, int num_zero_runs,
+                               DCTCodingState* coding_state,
+                               coeff_t* last_dc_coeff, JpegBitWriter* bw) {
+  bool eob_run_allowed = Ss > 0;  // taken before Ss is bumped past the DC term below
+  coeff_t temp2;
+  coeff_t temp;
+  if (Ss == 0) {
+    temp2 = coeffs[0] >> Al;  // DC is shifted by Al before differencing
+    temp = temp2 - *last_dc_coeff;
+    *last_dc_coeff = temp2;  // the predictor holds the shifted DC value
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // negation wrapped: magnitude does not fit coeff_t
+      temp2--;  // negative deltas emit the low bits of (delta - 1)
+    }
+    int nbits = (temp == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp) + 1);
+    WriteSymbol<kOutputMode>(nbits, dc_huff, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    }
+    ++Ss;  // the remaining band, if any, starts at the first AC coefficient
+  }
+  if (Ss > Se) {
+    return true;  // nothing left in the band after the DC term
+  }
+  int r = 0;  // zero coefficients seen since the last non-zero one
+  for (int k = Ss; k <= Se; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // negation wrapped, as for the DC delta
+      temp >>= Al;  // the magnitude, not the signed value, is shifted
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;
+      temp2 = temp;
+    }
+    if (temp == 0) {  // became zero after the shift: counts as a zero coefficient
+      r++;
+      continue;
+    }
+    Flush<kOutputMode>(coding_state, bw);  // pending coding_state goes out before this symbol
+    while (r > 15) {
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int nbits = FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int symbol = (r << 4u) + nbits;  // high nibble: zero run, low nibble: bit count
+    WriteSymbol<kOutputMode>(symbol, ac_huff, bw);
+    WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    r = 0;
+  }
+  if (num_zero_runs > 0) {  // extra 0xf0 symbols requested by the caller
+    Flush<kOutputMode>(coding_state, bw);
+    for (int i = 0; i < num_zero_runs; ++i) {
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;  // each one accounts for 16 of the trailing zeros
+    }
+  }
+  if (r > 0) {  // the band ends in zeros: record an end-of-band
+    BufferEndOfBand<kOutputMode>(coding_state, ac_huff, nullptr, bw);
+    if (!eob_run_allowed) {
+      Flush<kOutputMode>(coding_state, bw);  // band started at DC: flush right away
+    }
+  }
+  return true;
+}
+
+template <int kOutputMode>  // Encodes one block of a refinement scan over the band [Ss, Se].
+bool EncodeRefinementBits(const coeff_t* coeffs, HuffmanCodeTable* ac_huff,
+                          int Ss, int Se, int Al, DCTCodingState* coding_state,
+                          JpegBitWriter* bw) {
+  bool eob_run_allowed = Ss > 0;  // taken before Ss is bumped past the DC term below
+  if (Ss == 0) {
+    // Emit next bit of DC component.
+    WriteBits(bw, 1, (coeffs[0] >> Al) & 1);
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;  // nothing left in the band after the DC bit
+  }
+  int abs_values[kDCTBlockSize];  // only entries Ss..Se are written and read
+  int eob = 0;  // last position whose shifted magnitude is exactly 1
+  for (int k = Ss; k <= Se; k++) {
+    const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]);
+    abs_values[k] = abs_val >> Al;
+    if (abs_values[k] == 1) {
+      eob = k;
+    }
+  }
+  int r = 0;  // zero entries seen since the last emitted symbol
+  std::vector<int> refinement_bits;  // low bits of entries > 1, written after the next symbol
+  refinement_bits.reserve(kDCTBlockSize);
+  for (int k = Ss; k <= Se; k++) {
+    if (abs_values[k] == 0) {
+      r++;
+      continue;
+    }
+    while (r > 15 && k <= eob) {  // 0xf0 symbols only while a magnitude-1 entry is still ahead
+      Flush<kOutputMode>(coding_state, bw);
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+      for (int bit : refinement_bits) {
+        WriteBits(bw, 1, bit);
+      }
+      refinement_bits.clear();
+    }
+    if (abs_values[k] > 1) {  // no symbol for this entry: only its low bit is queued
+      refinement_bits.push_back(abs_values[k] & 1u);
+      continue;
+    }
+    Flush<kOutputMode>(coding_state, bw);  // abs_values[k] == 1 from here on
+    int symbol = (r << 4u) + 1;  // high nibble: zero run, low nibble: 1
+    int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1;  // sign: 0 = negative
+    WriteSymbol<kOutputMode>(symbol, ac_huff, bw);
+    WriteBits(bw, 1, new_non_zero_bit);
+    for (int bit : refinement_bits) {  // queued bits follow the symbol and sign bit
+      WriteBits(bw, 1, bit);
+    }
+    refinement_bits.clear();
+    r = 0;
+  }
+  if (r > 0 || !refinement_bits.empty()) {  // leftovers are handed to the end-of-band buffer
+    BufferEndOfBand<kOutputMode>(coding_state, ac_huff, &refinement_bits, bw);
+    if (!eob_run_allowed) {
+      Flush<kOutputMode>(coding_state, bw);  // band started at DC: flush right away
+    }
+  }
+  return true;
+}
+
+size_t NumHistograms(const JPEGData& jpg) {
+  size_t total = 0;  // one histogram slot per (scan, component) pair
+  for (size_t s = 0; s < jpg.scan_info.size(); ++s) {
+    total += jpg.scan_info[s].num_components;
+  }
+  return total;
+}
+
+size_t HistogramIndex(const JPEGData& jpg, size_t scan_index,
+                      size_t component_index) {
+  size_t slot = component_index;  // position within the scan's own slots
+  for (size_t s = 0; s != scan_index; ++s) {
+    slot += jpg.scan_info[s].num_components;  // skip the slots of earlier scans
+  }
+  return slot;
+}
+
+template <int kMode, int kOutputMode>  // kMode 0: sequential, 1: progressive, otherwise: refinement
+SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg,
+                                              SerializationState* state) {
+  const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index];
+  EncodeScanState& ss = state->scan_state;
+
+  const int restart_interval =
+      state->seen_dri_marker ? jpg.restart_interval : 0;  // ignored until a DRI marker was serialized
+
+  const auto get_next_extra_zero_run_index = [&ss, &scan_info]() -> int {  // peeks, does not advance
+    if (ss.extra_zero_runs_pos < scan_info.extra_zero_runs.size()) {
+      return scan_info.extra_zero_runs[ss.extra_zero_runs_pos].block_idx;
+    } else {
+      return -1;  // no further entry
+    }
+  };
+
+  const auto get_next_reset_point = [&ss, &scan_info]() -> int {  // returns and consumes one entry
+    if (ss.next_reset_point_pos < scan_info.reset_points.size()) {
+      return scan_info.reset_points[ss.next_reset_point_pos++];
+    } else {
+      return -1;  // no further entry
+    }
+  };
+
+  if (ss.stage == EncodeScanState::HEAD) {  // first call for this scan: emit SOS and reset scan state
+    if (!EncodeSOS(jpg, scan_info, state)) return SerializationStatus::ERROR;
+    JpegBitWriterInit(&ss.bw, &state->output_queue);
+    DCTCodingStateInit(&ss.coding_state);
+    ss.restarts_to_go = restart_interval;
+    ss.next_restart_marker = 0;
+    ss.block_scan_index = 0;
+    ss.extra_zero_runs_pos = 0;
+    ss.next_extra_zero_run_index = get_next_extra_zero_run_index();
+    ss.next_reset_point_pos = 0;
+    ss.next_reset_point = get_next_reset_point();
+    ss.mcu_y = 0;
+    memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff));
+    ss.stage = EncodeScanState::BODY;
+  }
+  JpegBitWriter* bw = &ss.bw;
+  DCTCodingState* coding_state = &ss.coding_state;
+
+  JXL_DASSERT(ss.stage == EncodeScanState::BODY);
+
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info.num_components > 1);
+  int MCUs_per_row = 0;
+  int MCU_rows = 0;
+  jpg.CalculateMcuSize(scan_info, &MCUs_per_row, &MCU_rows);
+  const bool is_progressive = state->is_progressive;
+  const int Al = is_progressive ? scan_info.Al : 0;  // non-progressive streams use the full band, no shift
+  const int Ss = is_progressive ? scan_info.Ss : 0;
+  const int Se = is_progressive ? scan_info.Se : 63;
+
+  // DC-only is defined by [0..0] spectral range.
+  const bool want_ac = ((Ss != 0) || (Se != 0));
+  // TODO: support streaming decoding again.
+  const bool complete_ac = true;
+  const bool has_ac = true;
+  if (want_ac && !has_ac) return SerializationStatus::NEEDS_MORE_INPUT;  // never taken while has_ac is true
+
+  // |has_ac| implies |complete_dc| but not vice versa; for the sake of
+  // simplicity we pretend they are equal, because they are separated by just a
+  // few bytes of input.
+  const bool complete_dc = has_ac;
+  const bool complete = want_ac ? complete_ac : complete_dc;
+  // When "incomplete" |ac_dc| tracks information about current ("incomplete")
+  // band parsing progress.
+
+  // FIXME: Is this always complete?
+  // const int last_mcu_y =
+  //     complete ? MCU_rows : parsing_state.internal->ac_dc.next_mcu_y *
+  //     v_group;
+  (void)complete;
+  const int last_mcu_y = complete ? MCU_rows : 0;  // equals MCU_rows, since complete is always true here
+
+  for (; ss.mcu_y < last_mcu_y; ++ss.mcu_y) {
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Possibly emit a restart marker.
+      if (restart_interval > 0 && ss.restarts_to_go == 0) {
+        Flush<kOutputMode>(coding_state, bw);
+        if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) {
+          return SerializationStatus::ERROR;
+        }
+        EmitMarker(bw, 0xD0 + ss.next_restart_marker);
+        ss.next_restart_marker += 1;
+        ss.next_restart_marker &= 0x7;  // marker numbers cycle through 0..7
+        ss.restarts_to_go = restart_interval;
+        memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff));  // DC predictors restart from zero
+      }
+      // Encode one MCU
+      for (size_t i = 0; i < scan_info.num_components; ++i) {
+        const JPEGComponentScanInfo& si = scan_info.components[i];
+        const JPEGComponent& c = jpg.components[si.comp_idx];
+        size_t dc_tbl_idx = (kOutputMode == OutputModes::kModeHistogram  // histogram mode: one slot
+                                 ? HistogramIndex(jpg, state->scan_index, i)  // per (scan, component)
+                                 : si.dc_tbl_idx);
+        size_t ac_tbl_idx = (kOutputMode == OutputModes::kModeHistogram
+                                 ? HistogramIndex(jpg, state->scan_index, i)
+                                 : si.ac_tbl_idx);
+        HuffmanCodeTable* dc_huff = &state->dc_huff_table[dc_tbl_idx];
+        HuffmanCodeTable* ac_huff = &state->ac_huff_table[ac_tbl_idx];
+        int n_blocks_y = is_interleaved ? c.v_samp_factor : 1;  // a single-component scan has
+        int n_blocks_x = is_interleaved ? c.h_samp_factor : 1;  // one block per MCU
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            int block_y = ss.mcu_y * n_blocks_y + iy;
+            int block_x = mcu_x * n_blocks_x + ix;
+            int block_idx = block_y * c.width_in_blocks + block_x;
+            if (ss.block_scan_index == ss.next_reset_point) {  // recorded reset point: flush first
+              Flush<kOutputMode>(coding_state, bw);
+              ss.next_reset_point = get_next_reset_point();
+            }
+            int num_zero_runs = 0;
+            if (ss.block_scan_index == ss.next_extra_zero_run_index) {  // extra zero runs recorded for this block
+              num_zero_runs = scan_info.extra_zero_runs[ss.extra_zero_runs_pos]
+                                  .num_extra_zero_runs;
+              ++ss.extra_zero_runs_pos;
+              ss.next_extra_zero_run_index = get_next_extra_zero_run_index();
+            }
+            const coeff_t* coeffs = &c.coeffs[block_idx << 6];  // 64 coefficients per block
+            bool ok;
+            if (kMode == 0) {
+              ok = EncodeDCTBlockSequential<kOutputMode>(
+                  coeffs, dc_huff, ac_huff, num_zero_runs,
+                  ss.last_dc_coeff + si.comp_idx, bw);
+            } else if (kMode == 1) {
+              ok = EncodeDCTBlockProgressive<kOutputMode>(
+                  coeffs, dc_huff, ac_huff, Ss, Se, Al, num_zero_runs,
+                  coding_state, ss.last_dc_coeff + si.comp_idx, bw);
+            } else {
+              ok = EncodeRefinementBits<kOutputMode>(coeffs, ac_huff, Ss, Se,
+                                                     Al, coding_state, bw);
+            }
+            if (!ok) return SerializationStatus::ERROR;
+            ++ss.block_scan_index;
+          }
+        }
+      }
+      --ss.restarts_to_go;  // counted per MCU
+    }
+  }
+  if (ss.mcu_y < MCU_rows) {  // only reachable if last_mcu_y were below MCU_rows
+    if (!bw->healthy) return SerializationStatus::ERROR;
+    return SerializationStatus::NEEDS_MORE_INPUT;
+  }
+  Flush<kOutputMode>(coding_state, bw);
+  if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) {
+    return SerializationStatus::ERROR;
+  }
+  JpegBitWriterFinish(bw);
+  ss.stage = EncodeScanState::HEAD;  // the next call starts the following scan
+  state->scan_index++;
+  if (!bw->healthy) return SerializationStatus::ERROR;
+
+  return SerializationStatus::DONE;
+}
+
+template <int kOutputMode>
+static SerializationStatus JXL_INLINE EncodeScan(const JPEGData& jpg,
+                                                 SerializationState* state) {
+  // Reject a scan marker that has no scan_info entry instead of indexing out
+  // of bounds; the APP, COM, DQT and inter-marker cursors are checked likewise.
+  if (state->scan_index < 0 ||
+      static_cast<size_t>(state->scan_index) >= jpg.scan_info.size()) {
+    return SerializationStatus::ERROR;
+  }
+  const JPEGScanInfo& si = jpg.scan_info[state->scan_index];
+  // Mode 0 (sequential): non-progressive stream, or full band without shifts.
+  const bool full_band = si.Ah == 0 && si.Al == 0 && si.Ss == 0 && si.Se == 63;
+  if (!state->is_progressive || full_band) {
+    return DoEncodeScan<0, kOutputMode>(jpg, state);
+  }
+  // Progressive: Ah == 0 selects the first pass (1), otherwise refinement (2).
+  return si.Ah == 0 ? DoEncodeScan<1, kOutputMode>(jpg, state)
+                    : DoEncodeScan<2, kOutputMode>(jpg, state);
+}
+
+template <int kOutputMode>
+SerializationStatus SerializeSection(uint8_t marker, SerializationState* state,
+                                     const JPEGData& jpg) {
+  // Dispatches a single marker_order entry to the matching Encode* routine.
+  // Routines that report success as bool are translated to DONE / ERROR;
+  // EncodeScan already returns a SerializationStatus. Marker bytes without a
+  // handler produce ERROR.
+  // TODO(eustas): add and use marker enum
+  bool ok = false;
+
+  if (marker >= 0xD0 && marker <= 0xD7) {
+    // Restart markers are emitted as bare two-byte markers.
+    ok = EncodeRestart(marker, state);
+  } else if (marker >= 0xE0 && marker <= 0xEF) {
+    // Application segments.
+    ok = EncodeAPP(jpg, marker, state);
+  } else {
+    switch (marker) {
+      case 0xC0:
+      case 0xC1:
+      case 0xC2:
+      case 0xC9:
+      case 0xCA:
+        // Frame headers.
+        ok = EncodeSOF(jpg, marker, state);
+        break;
+
+      case 0xC4:
+        // Huffman tables; histogram mode skips them and counts as success.
+        ok = (kOutputMode == OutputModes::kModeHistogram) ||
+             EncodeDHT(jpg, state);
+        break;
+
+      case 0xD9:
+        // End of image.
+        ok = EncodeEOI(jpg, state);
+        break;
+
+      case 0xDA:
+        // Scans report their own status (may differ from DONE / ERROR).
+        return EncodeScan<kOutputMode>(jpg, state);
+
+      case 0xDB:
+        // Quantization tables.
+        ok = EncodeDQT(jpg, state);
+        break;
+
+      case 0xDD:
+        // Restart interval definition.
+        ok = EncodeDRI(jpg, state);
+        break;
+
+      case 0xFE:
+        // Comment segments.
+        ok = EncodeCOM(jpg, state);
+        break;
+
+      case 0xFF:
+        // Raw bytes found between markers.
+        ok = EncodeInterMarkerData(jpg, state);
+        break;
+
+      default:
+        return SerializationStatus::ERROR;
+    }
+  }
+
+  return ok ? SerializationStatus::DONE : SerializationStatus::ERROR;
+}
+
+// TODO(veluca): add streaming support again.
+template <int kOutputMode>
+Status WriteJpegInternal(const JPEGData& jpg, const JPEGOutput& out,
+                         SerializationState* ss) {
+  // Drives the state machine in |ss|: emits SOI, then one section per entry of
+  // jpg.marker_order, draining ss->output_queue into |out| after every step.
+  // Returns true on success and a failure Status otherwise.
+  const auto maybe_push_output = [&]() -> Status {
+    if (ss->stage != SerializationState::STAGE_ERROR) {
+      while (!ss->output_queue.empty()) {
+        auto& chunk = ss->output_queue.front();
+        size_t num_written = out(chunk.next, chunk.len);
+        if (num_written == 0 && chunk.len > 0) {
+          return StatusMessage(Status(StatusCode::kNotEnoughBytes),
+                               "Failed to write output");
+        }
+        chunk.next += num_written;  // resume partial writes at the unsent tail
+        chunk.len -= num_written;
+        if (chunk.len == 0) ss->output_queue.pop_front();
+      }
+    }
+    return true;
+  };
+
+  while (true) {
+    switch (ss->stage) {
+      case SerializationState::STAGE_INIT: {
+        // Valid Brunsli requires, at least, 0xD9 marker.
+        // This might happen on corrupted stream, or on unconditioned JPEGData.
+        // TODO(eustas): check D9 in the only one and is the last one.
+        if (jpg.marker_order.empty()) {
+          ss->stage = SerializationState::STAGE_ERROR;
+          break;
+        }
+        if (kOutputMode == OutputModes::kModeHistogram) {
+          size_t num_histo = NumHistograms(jpg);  // one table slot per (scan, component)
+          ss->dc_huff_table.resize(num_histo);
+          ss->ac_huff_table.resize(num_histo);
+          for (size_t i = 0; i < num_histo; ++i) {
+            ss->dc_huff_table[i].InitDepths();
+            ss->ac_huff_table[i].InitDepths();
+          }
+        } else {
+          ss->dc_huff_table.resize(kMaxHuffmanTables);
+          ss->ac_huff_table.resize(kMaxHuffmanTables);
+        }
+        if (jpg.has_zero_padding_bit) {
+          ss->pad_bits = jpg.padding_bits.data();
+          ss->pad_bits_end = ss->pad_bits + jpg.padding_bits.size();
+        }
+        EncodeSOI(ss);
+        JXL_QUIET_RETURN_IF_ERROR(maybe_push_output());
+        ss->stage = SerializationState::STAGE_SERIALIZE_SECTION;
+        break;
+      }
+
+      case SerializationState::STAGE_SERIALIZE_SECTION: {
+        if (ss->section_index >= jpg.marker_order.size()) {
+          ss->stage = SerializationState::STAGE_DONE;
+          break;
+        }
+        uint8_t marker = jpg.marker_order[ss->section_index];
+        auto status = SerializeSection<kOutputMode>(marker, ss, jpg);
+        if (status == SerializationStatus::ERROR) {
+          JXL_WARNING("Failed to encode marker 0x%.2x", marker);
+          ss->stage = SerializationState::STAGE_ERROR;
+          break;
+        }
+        JXL_QUIET_RETURN_IF_ERROR(maybe_push_output());
+        if (status == SerializationStatus::NEEDS_MORE_INPUT) {
+          return JXL_FAILURE("Incomplete serialization data");
+        } else if (status != SerializationStatus::DONE) {
+          JXL_DASSERT(false);
+          ss->stage = SerializationState::STAGE_ERROR;
+          break;
+        }
+        ++ss->section_index;
+        break;
+      }
+
+      case SerializationState::STAGE_DONE:
+        JXL_ASSERT(ss->output_queue.empty());
+        if (ss->pad_bits != nullptr && ss->pad_bits != ss->pad_bits_end) {  // unused pad bits remain
+          return JXL_FAILURE("Invalid number of padding bits.");
+        }
+        return true;
+
+      case SerializationState::STAGE_ERROR:
+        return JXL_FAILURE("JPEG serialization error");
+    }
+  }
+}
+
+}  // namespace
+
+Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) {
+  SerializationState fresh_state;  // one-shot: nothing is kept across calls
+  return WriteJpegInternal<OutputModes::kModeWrite>(jpg, out, &fresh_state);
+}
+
+Status ProcessJpeg(const JPEGData& jpg, SerializationState* ss) {
+  const JPEGOutput sink = [](const uint8_t*, size_t len) { return len; };  // accepts and drops all bytes
+  return WriteJpegInternal<OutputModes::kModeHistogram>(jpg, sink, ss);
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h
new file mode 100644
index 0000000000..9ccfb749a8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Functions for writing a JPEGData object into a jpeg byte stream.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_
+#define LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <functional>
+
+#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+// Callback that receives len bytes starting at buf. Returns the number of
+// bytes it consumed; 0 for a non-empty buffer makes the writer fail.
+using JPEGOutput = std::function<size_t(const uint8_t* buf, size_t len)>;
+
+Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out);  // serializes jpg through out
+
+// Same as WriteJpeg, but instead of writing to the output, collects statistics
+// about the bit-stream into `ss`.
+Status ProcessJpeg(const JPEGData& jpg, SerializationState* ss);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h
new file mode 100644
index 0000000000..e003c04952
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h
@@ -0,0 +1,72 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_
+#define LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <initializer_list>
+#include <memory>
+#include <vector>
+
+namespace jxl {
+namespace jpeg {
+
+/**
+ * A chunk of output data.
+ *
+ * The data producer creates OutputChunks and appends them to the end of the
+ * output queue. Once control flow leaves the producer code, the chunk's data
+ * is considered final and must not change; to underline this, |next| is a
+ * pointer to const.
+ *
+ * The data consumer removes OutputChunks from the beginning of the output
+ * queue. An OutputChunk may be consumed partially, by updating both |next|
+ * and |len|.
+ *
+ * There are 2 types of output chunks:
+ *  - owning: the data lives in the |buffer| field; the producer fills it in
+ *    after the instance is created; it is legal to reduce |len| to show that
+ *    not all of the capacity of |buffer| is used
+ *  - non-owning: represents data stored (owned) somewhere else
+ */
+struct OutputChunk {
+  // Non-owning view of any container exposing data() and size().
+  template <typename Bytes>
+  explicit OutputChunk(Bytes& bytes)
+      // Going through const void* copes with both constness and element type.
+      : next(static_cast<const uint8_t*>(
+            static_cast<const void*>(bytes.data()))),
+        len(bytes.size()) {}
+
+  // Non-owning view of |size| bytes starting at |data|.
+  OutputChunk(const uint8_t* data, size_t size) : next(data), len(size) {}
+
+  // Owning; allocates |size| zero-initialized bytes for the producer to fill.
+  explicit OutputChunk(size_t size = 0)
+      : len(size), buffer(new std::vector<uint8_t>(size)) {
+    // |next| precedes |buffer| in declaration order, so it is set afterwards.
+    next = buffer->data();
+  }
+
+  // Owning; stores a private copy of |bytes|.
+  OutputChunk(std::initializer_list<uint8_t> bytes)
+      : len(bytes.size()), buffer(new std::vector<uint8_t>(bytes)) {
+    // As above: |next| can only point into |buffer| once it exists.
+    next = buffer->data();
+  }
+
+  const uint8_t* next;
+  size_t len;
+  // TODO(veluca): consider removing the unique_ptr.
+  std::unique_ptr<std::vector<uint8_t>> buffer;
+};
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h
new file mode 100644
index 0000000000..40ce450a76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h
@@ -0,0 +1,96 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
+#define LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
+
+#include <deque>
+#include <vector>
+
+#include "lib/jxl/jpeg/dec_jpeg_output_chunk.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+struct HuffmanCodeTable {  // Two 256-entry tables; presumably indexed by symbol value - confirm in WriteSymbol.
+  int depth[256];  // reset to 0 by InitDepths(); filled elsewhere (e.g. BuildHuffmanCodeTable)
+  int code[256];  // not touched by InitDepths()
+  void InitDepths() { for (int& d : depth) d = 0; }  // plain loop: no reliance on <algorithm> being included
+};
+
+// Handles the packing of bits into output bytes.
+struct JpegBitWriter {  // set up by JpegBitWriterInit and closed by JpegBitWriterFinish (see DoEncodeScan)
+  bool healthy;  // the scan encoder reports an error when this is false
+  std::deque<OutputChunk>* output;  // presumably the queue passed to JpegBitWriterInit - confirm
+  OutputChunk chunk;  // presumably the chunk currently being filled - confirm
+  uint8_t* data;
+  size_t pos;
+  uint64_t put_buffer;  // presumably bits not yet written out as bytes - confirm
+  int put_bits;
+};
+
+// Holds data that is buffered between 8x8 blocks in progressive mode.
+struct DCTCodingState {  // reset by DCTCodingStateInit at the start of every scan
+  // The run length of end-of-band symbols in a progressive scan.
+  int eob_run_;
+  // The huffman table to be used when flushing the state.
+  HuffmanCodeTable* cur_ac_huff_;
+  // The sequence of currently buffered refinement bits for a successive
+  // approximation scan (one where Ah > 0).
+  std::vector<int> refinement_bits_;
+};
+
+struct EncodeScanState {  // Per-scan state of DoEncodeScan; re-initialized in the HEAD stage.
+  enum Stage { HEAD, BODY };  // HEAD: scan header not yet emitted; BODY: encoding MCUs
+
+  Stage stage = HEAD;
+
+  int mcu_y;  // next MCU row to encode
+  JpegBitWriter bw;
+  coeff_t last_dc_coeff[kMaxComponents] = {0};  // DC predictors by comp_idx; zeroed at scan start and restarts
+  int restarts_to_go;  // MCUs left before the next restart marker
+  int next_restart_marker;  // 0..7, added to 0xD0 when the marker is emitted
+  int block_scan_index;  // blocks encoded so far in this scan
+  DCTCodingState coding_state;
+  size_t extra_zero_runs_pos;  // cursor into scan_info.extra_zero_runs
+  int next_extra_zero_run_index;  // block index of the pending extra zero run, or -1
+  size_t next_reset_point_pos;  // cursor into scan_info.reset_points
+  int next_reset_point;  // block index of the pending reset point, or -1
+};
+
+struct SerializationState {  // State of the WriteJpegInternal state machine.
+  enum Stage {
+    STAGE_INIT,  // nothing emitted yet
+    STAGE_SERIALIZE_SECTION,  // emitting jpg.marker_order entries one at a time
+    STAGE_DONE,  // all sections emitted
+    STAGE_ERROR,  // a section failed; serialization is abandoned
+  };
+
+  Stage stage = STAGE_INIT;
+
+  std::deque<OutputChunk> output_queue;  // chunks produced but not yet handed to the output callback
+
+  size_t section_index = 0;  // next jpg.marker_order entry
+  int dht_index = 0;  // presumably the next jpg.huffman_code entry - confirm in EncodeDHT
+  int dqt_index = 0;  // next jpg.quant entry
+  int app_index = 0;  // next jpg.app_data entry
+  int com_index = 0;  // next jpg.com_data entry
+  int data_index = 0;  // next jpg.inter_marker_data entry
+  int scan_index = 0;  // next jpg.scan_info entry
+  std::vector<HuffmanCodeTable> dc_huff_table;  // kMaxHuffmanTables entries, or one per histogram
+  std::vector<HuffmanCodeTable> ac_huff_table;  // sized like dc_huff_table
+  const uint8_t* pad_bits = nullptr;  // cursor into jpg.padding_bits; stays null unless has_zero_padding_bit
+  const uint8_t* pad_bits_end = nullptr;  // end of jpg.padding_bits
+  bool seen_dri_marker = false;  // set by EncodeDRI; scans ignore restart_interval until then
+  bool is_progressive = false;
+
+  EncodeScanState scan_state;
+};
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc
new file mode 100644
index 0000000000..842612f4ab
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc
@@ -0,0 +1,384 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+
+#include <brotli/encode.h>
+#include <stdio.h>
+
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
+#include "lib/jxl/luminance.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace jpeg {
+
+namespace {
+
+constexpr int BITS_IN_JSAMPLE = 8;  // Passed to SetUintSamples() below.
+using ByteSpan = Span<const uint8_t>;  // Read-only view of bytes.
+
+// TODO(eustas): move to jpeg_data, to use from codec_jpg as well.
+// Marks APP2 segments that form a canonically chunked ICC profile (chunks in
+// order 1..N, same N everywhere) as AppMarkerType::kICC; fails if incomplete.
+Status DetectIccProfile(JPEGData& jpeg_data) {
+  JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size());
+  size_t num_icc = 0;       // chunks accepted so far
+  size_t num_icc_jpeg = 0;  // chunk total announced by the first ICC chunk
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    const auto& app = jpeg_data.app_data[i];
+    size_t pos = 0;
+    if (app.empty() || app[pos++] != 0xE2) continue;
+    // At least APPn + size; otherwise it should be intermarker-data.
+    JXL_DASSERT(app.size() >= 3);
+    size_t tag_length = (app[pos] << 8) + app[pos + 1];
+    pos += 2;
+    JXL_DASSERT(app.size() == tag_length + 1);
+    // Need the 2 length bytes, the signature and the 2 chunk-counter bytes
+    // read below; anything shorter cannot be an ICC chunk.
+    if (tag_length < 2 + sizeof kIccProfileTag + 2) continue;
+    if (memcmp(&app[pos], kIccProfileTag, sizeof kIccProfileTag) != 0) continue;
+    pos += sizeof kIccProfileTag;
+    uint8_t chunk_id = app[pos++];
+    uint8_t num_chunks = app[pos++];
+    if (chunk_id != num_icc + 1) continue;
+    if (num_icc_jpeg == 0) num_icc_jpeg = num_chunks;
+    if (num_icc_jpeg != num_chunks) continue;
+    num_icc++;
+    jpeg_data.app_marker_type[i] = AppMarkerType::kICC;
+  }
+  if (num_icc != num_icc_jpeg) {
+    return JXL_FAILURE("Invalid ICC chunks");
+  }
+  return true;
+}
+
+bool GetMarkerPayload(const uint8_t* data, size_t size, ByteSpan* payload) {
+  // A stored segment is: marker type byte, big-endian 16-bit length, body.
+  // On success *payload views the body only; on failure it is left untouched.
+  constexpr size_t kHeaderSize = 3;
+  if (size < kHeaderSize) return false;
+
+  const size_t declared = (static_cast<size_t>(data[1]) << 8u) | data[2];
+  // The declared length counts itself and the body, but not the type byte.
+  if (declared + 1 != size) return false;
+
+  // Drop the type byte and the length field.
+  ByteSpan body(data, size);
+  body.remove_prefix(kHeaderSize);
+  *payload = body;
+  return true;
+}
+
+Status DetectBlobs(jpeg::JPEGData& jpeg_data) {
+  JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size());
+  // Only the first Exif and the first XMP segment are tagged.
+  bool exif_found = false;
+  bool xmp_found = false;
+  for (size_t idx = 0; idx < jpeg_data.app_data.size(); ++idx) {
+    const auto& segment = jpeg_data.app_data[idx];
+    if (segment.empty() || segment[0] != kApp1) continue;
+    ByteSpan body;
+    // Segments with an inconsistent length field are ignored.
+    if (!GetMarkerPayload(segment.data(), segment.size(), &body)) continue;
+    const bool is_exif = body.size() >= sizeof kExifTag &&
+                         memcmp(body.data(), kExifTag, sizeof kExifTag) == 0;
+    if (is_exif && !exif_found) {
+      jpeg_data.app_marker_type[idx] = AppMarkerType::kExif;
+      exif_found = true;
+    }
+    const bool is_xmp = body.size() >= sizeof kXMPTag &&
+                        memcmp(body.data(), kXMPTag, sizeof kXMPTag) == 0;
+    if (is_xmp && !xmp_found) {
+      jpeg_data.app_marker_type[idx] = AppMarkerType::kXMP;
+      xmp_found = true;
+    }
+  }
+  return true;
+}
+
+Status ParseChunkedMarker(const jpeg::JPEGData& src, uint8_t marker_type,
+                          const ByteSpan& tag, PaddedBytes* output,
+                          bool allow_permutations = false) {
+  output->clear();
+  // Joins the bodies of all matching chunked segments, in index order.
+  std::vector<ByteSpan> chunks;
+  std::vector<bool> presence;
+  size_t expected_number_of_parts = 0;
+  bool is_first_chunk = true;
+  size_t ordinal = 0;  // number of matching chunks seen so far
+  for (const auto& marker : src.app_data) {
+    if (marker.empty() || marker[0] != marker_type) {
+      continue;
+    }
+    ByteSpan payload;
+    if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) {
+      // Malformed segment (bad length field): skip it.
+      continue;
+    }
+    if ((payload.size() < tag.size()) ||
+        memcmp(payload.data(), tag.data(), tag.size()) != 0) {
+      continue;
+    }
+    payload.remove_prefix(tag.size());
+    if (payload.size() < 2) {
+      return JXL_FAILURE("Chunk is too small.");
+    }
+    uint8_t index = payload[0];  // 1-based position of this chunk
+    uint8_t total = payload[1];  // chunk count announced by this chunk
+    ordinal++;
+    if (!allow_permutations) {
+      if (index != ordinal) return JXL_FAILURE("Invalid chunk order.");
+    }
+    // Skip the two counter bytes; the rest is chunk data.
+    payload.remove_prefix(2);
+    // Every chunk must announce the same non-zero total.
+    JXL_RETURN_IF_ERROR(total != 0);
+    if (is_first_chunk) {
+      is_first_chunk = false;
+      expected_number_of_parts = total;
+      // 1-based indices; 0-th element is added for convenience.
+      chunks.resize(total + 1);
+      presence.resize(total + 1);
+    } else {
+      JXL_RETURN_IF_ERROR(expected_number_of_parts == total);
+    }
+
+    if (index == 0 || index > total) {
+      return JXL_FAILURE("Invalid chunk index.");
+    }
+
+    if (presence[index]) {
+      return JXL_FAILURE("Duplicate chunk.");
+    }
+    presence[index] = true;
+    chunks[index] = payload;
+  }
+  // With no matching chunk at all this loop is skipped and *output is empty.
+  for (size_t i = 0; i < expected_number_of_parts; ++i) {
+    // 0-th element is not used.
+    size_t index = i + 1;
+    if (!presence[index]) {
+      return JXL_FAILURE("Missing chunk.");
+    }
+    output->append(chunks[index]);
+  }
+
+  return true;
+}
+
+Status SetBlobsFromJpegData(const jpeg::JPEGData& jpeg_data, Blobs* blobs) {
+  for (const auto& segment : jpeg_data.app_data) {
+    if (segment.empty() || segment[0] != kApp1) continue;
+    ByteSpan body;
+    // Segments with an inconsistent length field are ignored.
+    if (!GetMarkerPayload(segment.data(), segment.size(), &body)) continue;
+    const bool is_exif = body.size() >= sizeof kExifTag &&
+                         memcmp(body.data(), kExifTag, sizeof kExifTag) == 0;
+    if (is_exif) {
+      if (!blobs->exif.empty()) {
+        JXL_WARNING(
+            "ReJPEG: multiple Exif blobs, storing only first one in the JPEG "
+            "XL container\n");
+      } else {
+        // Keep everything that follows the Exif signature.
+        const size_t exif_size = body.size() - sizeof kExifTag;
+        blobs->exif.resize(exif_size);
+        memcpy(blobs->exif.data(), body.data() + sizeof kExifTag, exif_size);
+      }
+    }
+
+    const bool is_xmp = body.size() >= sizeof kXMPTag &&
+                        memcmp(body.data(), kXMPTag, sizeof kXMPTag) == 0;
+    if (is_xmp) {
+      if (!blobs->xmp.empty()) {
+        JXL_WARNING(
+            "ReJPEG: multiple XMP blobs, storing only first one in the JPEG "
+            "XL container\n");
+      } else {
+        // Keep everything that follows the XMP signature.
+        const size_t xmp_size = body.size() - sizeof kXMPTag;
+        blobs->xmp.resize(xmp_size);
+        memcpy(blobs->xmp.data(), body.data() + sizeof kXMPTag, xmp_size);
+      }
+    }
+  }
+  return true;
+}
+
+static inline bool IsJPG(const Span<const uint8_t> bytes) {  // FF D8 prefix?
+  return !(bytes.size() < 2 || bytes[0] != 0xFF || bytes[1] != 0xD8);
+}
+
+}  // namespace
+
+Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
+                                    ColorEncoding* color_encoding) {
+  // Prefer the ICC profile embedded in the APP2 chunks, if it parses.
+  PaddedBytes icc;
+  if (!ParseChunkedMarker(jpg, kApp2, ByteSpan(kIccProfileTag), &icc)) {
+    // A broken profile is not fatal: warn and use the fallback below.
+    JXL_WARNING("ReJPEG: corrupted ICC profile\n");
+    icc.clear();
+  }
+  if (!icc.empty()) return color_encoding->SetICC(std::move(icc));
+
+  // No usable embedded profile: use sRGB, gray for single-component JPEGs.
+  const bool is_gray = (jpg.components.size() == 1);
+  *color_encoding = ColorEncoding::SRGB(is_gray);
+  return true;
+}
+
+Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
+                      const CompressParams& cparams) {
+  jpeg_data.app_marker_type.resize(jpeg_data.app_data.size(),
+                                   AppMarkerType::kUnknown);
+  JXL_RETURN_IF_ERROR(DetectIccProfile(jpeg_data));
+  JXL_RETURN_IF_ERROR(DetectBlobs(jpeg_data));
+  BitWriter writer;
+  JXL_RETURN_IF_ERROR(Bundle::Write(jpeg_data, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  *bytes = std::move(writer).TakeBytes();  // first part: the written bundle
+  BrotliEncoderState* brotli_enc =
+      BrotliEncoderCreateInstance(nullptr, nullptr, nullptr);  // NOTE: unchecked
+  int effort = cparams.brotli_effort;
+  if (effort < 0) effort = 11 - static_cast<int>(cparams.speed_tier);
+  BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_QUALITY, effort);
+  size_t total_data = 0;  // byte count of everything fed to brotli below
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) {
+      continue;  // segments tagged by the Detect* calls are not counted
+    }
+    total_data += jpeg_data.app_data[i].size();
+  }
+  for (size_t i = 0; i < jpeg_data.com_data.size(); i++) {
+    total_data += jpeg_data.com_data[i].size();
+  }
+  for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) {
+    total_data += jpeg_data.inter_marker_data[i].size();
+  }
+  total_data += jpeg_data.tail_data.size();
+  size_t initial_size = bytes->size();  // brotli output starts at this offset
+  size_t brotli_capacity = BrotliEncoderMaxCompressedSize(total_data);
+  BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_SIZE_HINT, total_data);
+  bytes->resize(bytes->size() + brotli_capacity);  // room for the worst case
+  size_t enc_size = 0;  // passed as brotli's total_out; used as write offset
+  auto br_append = [&](const std::vector<uint8_t>& data, bool last) {
+    size_t available_in = data.size();
+    const uint8_t* in = data.data();
+    uint8_t* out = &(*bytes)[initial_size + enc_size];
+    do {
+      uint8_t* out_before = out;
+      msan::MemoryIsInitialized(in, available_in);
+      JXL_CHECK(BrotliEncoderCompressStream(
+          brotli_enc, last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS,
+          &available_in, &in, &brotli_capacity, &out, &enc_size));
+      msan::UnpoisonMemory(out_before, out - out_before);
+    } while (BrotliEncoderHasMoreOutput(brotli_enc) || available_in > 0);
+  };
+  // Feed the inputs in the same order in which they were counted above.
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) {
+      continue;
+    }
+    br_append(jpeg_data.app_data[i], /*last=*/false);
+  }
+  for (size_t i = 0; i < jpeg_data.com_data.size(); i++) {
+    br_append(jpeg_data.com_data[i], /*last=*/false);
+  }
+  for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) {
+    br_append(jpeg_data.inter_marker_data[i], /*last=*/false);
+  }
+  br_append(jpeg_data.tail_data, /*last=*/true);
+  BrotliEncoderDestroyInstance(brotli_enc);
+  bytes->resize(initial_size + enc_size);  // drop the unused reserve
+  return true;
+}
+
+Status DecodeImageJPG(const Span<const uint8_t> bytes, CodecInOut* io) {
+  if (!IsJPG(bytes)) return false;  // not a JPEG: fail without a message
+  io->frames.clear();  // start over with a single fresh frame
+  io->frames.reserve(1);
+  io->frames.emplace_back(&io->metadata.m);
+  io->Main().jpeg_data = make_unique<jpeg::JPEGData>();
+  jpeg::JPEGData* jpeg_data = io->Main().jpeg_data.get();
+  if (!jpeg::ReadJpeg(bytes.data(), bytes.size(), jpeg::JpegReadMode::kReadAll,
+                      jpeg_data)) {
+    return JXL_FAILURE("Error reading JPEG");
+  }
+  JXL_RETURN_IF_ERROR(
+      SetColorEncodingFromJpegData(*jpeg_data, &io->metadata.m.color_encoding));
+  JXL_RETURN_IF_ERROR(SetBlobsFromJpegData(*jpeg_data, &io->blobs));
+  size_t nbcomp = jpeg_data->components.size();
+  if (nbcomp != 1 && nbcomp != 3) {
+    return JXL_FAILURE("Cannot recompress JPEGs with neither 1 nor 3 channels");
+  }
+  YCbCrChromaSubsampling cs;
+  if (nbcomp == 3) {
+    uint8_t hsample[3], vsample[3];
+    for (size_t i = 0; i < nbcomp; i++) {
+      hsample[i] = jpeg_data->components[i].h_samp_factor;
+      vsample[i] = jpeg_data->components[i].v_samp_factor;
+    }
+    JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample));
+  } else if (nbcomp == 1) {
+    uint8_t hsample[3], vsample[3];
+    for (size_t i = 0; i < 3; i++) {  // replicate the only component 3 times
+      hsample[i] = jpeg_data->components[0].h_samp_factor;
+      vsample[i] = jpeg_data->components[0].v_samp_factor;
+    }
+    JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample));
+  }
+  bool is_rgb = false;
+  {
+    const auto& markers = jpeg_data->marker_order;
+    // If there is a JFIF marker, this is YCbCr. Otherwise...
+    if (std::find(markers.begin(), markers.end(), 0xE0) == markers.end()) {
+      // Try to find an 'Adobe' marker.
+      size_t app_markers = 0;  // index into app_data of the current APP marker
+      size_t i = 0;
+      for (; i < markers.size(); i++) {
+        // This is an APP marker.
+        if ((markers[i] & 0xF0) == 0xE0) {
+          JXL_CHECK(app_markers < jpeg_data->app_data.size());
+          // APP14 marker
+          if (markers[i] == 0xEE) {
+            const auto& data = jpeg_data->app_data[app_markers];
+            if (data.size() == 15 && data[3] == 'A' && data[4] == 'd' &&
+                data[5] == 'o' && data[6] == 'b' && data[7] == 'e') {
+              // 'Adobe' marker.
+              is_rgb = data[14] == 0;  // last byte 0 is taken to mean RGB
+              break;
+            }
+          }
+          app_markers++;
+        }
+      }
+
+      if (i == markers.size()) {  // the loop ended without hitting `break`
+        // No 'Adobe' marker, guess from component IDs.
+        is_rgb = nbcomp == 3 && jpeg_data->components[0].id == 'R' &&
+                 jpeg_data->components[1].id == 'G' &&
+                 jpeg_data->components[2].id == 'B';
+      }
+    }
+  }
+
+  io->Main().chroma_subsampling = cs;
+  io->Main().color_transform =  // single-component input always gets kYCbCr
+      (!is_rgb || nbcomp == 1) ? ColorTransform::kYCbCr : ColorTransform::kNone;
+
+  io->metadata.m.SetIntensityTarget(kDefaultIntensityTarget);
+  io->metadata.m.SetUintSamples(BITS_IN_JSAMPLE);
+  // NOTE(review): presumably a placeholder image of the JPEG's size - confirm.
+  io->SetFromImage(Image3F(jpeg_data->width, jpeg_data->height),
+                   io->metadata.m.color_encoding);
+  SetIntensityTarget(&io->metadata.m);  // free function, not the member above
+  return true;
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h
new file mode 100644
index 0000000000..806128c465
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_H_
+#define LIB_JXL_JPEG_ENC_JPEG_DATA_H_
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
+                      const CompressParams& cparams);  // overwrites *bytes
+// Sets *color_encoding from the ICC profile in jpg's APP2 chunks, else sRGB.
+Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
+                                    ColorEncoding* color_encoding);
+
+/**
+ * Decodes bytes containing JPEG codestream into a CodecInOut as coefficients
+ * only, for lossless JPEG transcoding.
+ */
+Status DecodeImageJPG(Span<const uint8_t> bytes, CodecInOut* io);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_ENC_JPEG_DATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc
new file mode 100644
index 0000000000..f569b73363
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc
@@ -0,0 +1,1053 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
+
+#include <inttypes.h>
+#include <string.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+namespace {
+static const int kBrunsliMaxSampling = 15;  // max h/v sampling factor in SOF
+
+// Macros for commonly used error conditions. Each returns JXL_FAILURE from the
+// enclosing function and uses its locals (pos, len, data, start_pos, ...).
+#define JXL_JPEG_VERIFY_LEN(n)                                \
+  if (*pos + (n) > len) {                                     \
+    return JXL_FAILURE("Unexpected end of input: pos=%" PRIuS \
+                       " need=%d len=%" PRIuS,                \
+                       *pos, static_cast<int>(n), len);       \
+  }
+// The `code` argument is accepted but not used by the expansion.
+#define JXL_JPEG_VERIFY_INPUT(var, low, high, code)                    \
+  if ((var) < (low) || (var) > (high)) {                               \
+    return JXL_FAILURE("Invalid " #var ": %d", static_cast<int>(var)); \
+  }
+
+#define JXL_JPEG_VERIFY_MARKER_END()                             \
+  if (start_pos + marker_len != *pos) {                          \
+    return JXL_FAILURE("Invalid marker length: declared=%" PRIuS \
+                       " actual=%" PRIuS,                        \
+                       marker_len, (*pos - start_pos));          \
+  }
+// Unlike the macros above, this one expects `pos` to be a value, not a pointer.
+#define JXL_JPEG_EXPECT_MARKER()                                 \
+  if (pos + 2 > len || data[pos] != 0xff) {                      \
+    return JXL_FAILURE(                                          \
+        "Marker byte (0xff) expected, found: 0x%.2x pos=%" PRIuS \
+        " len=%" PRIuS,                                          \
+        (pos < len ? data[pos] : 0), pos, len);                  \
+  }
+
+// Reads one byte at *pos and advances *pos by one. No bounds check.
+inline int ReadUint8(const uint8_t* data, size_t* pos) {
+  return data[(*pos)++];
+}
+// Reads a big-endian 16-bit value at *pos and advances *pos by two.
+inline int ReadUint16(const uint8_t* data, size_t* pos) {
+  const int hi = ReadUint8(data, pos);
+  return (hi << 8) + ReadUint8(data, pos);
+}
+
+// Reads the Start of Frame (SOF) marker segment and fills in *jpg with the
+// parsed data.
+bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode,
+                size_t* pos, JPEGData* jpg) {
+  if (jpg->width != 0) {  // non-zero once a SOF has been parsed (set below)
+    return JXL_FAILURE("Duplicate SOF marker.");
+  }
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(8);  // 2 length + 1 precision + 2 + 2 size + 1 count
+  size_t marker_len = ReadUint16(data, pos);
+  int precision = ReadUint8(data, pos);
+  int height = ReadUint16(data, pos);
+  int width = ReadUint16(data, pos);
+  int num_components = ReadUint8(data, pos);
+  // 'jbrd' is hardcoded for 8bits:
+  JXL_JPEG_VERIFY_INPUT(precision, 8, 8, PRECISION);
+  JXL_JPEG_VERIFY_INPUT(height, 1, kMaxDimPixels, HEIGHT);
+  JXL_JPEG_VERIFY_INPUT(width, 1, kMaxDimPixels, WIDTH);
+  JXL_JPEG_VERIFY_INPUT(num_components, 1, kMaxComponents, NUMCOMP);
+  JXL_JPEG_VERIFY_LEN(3 * num_components);  // id, sampling byte, quant index
+  jpg->height = height;
+  jpg->width = width;
+  jpg->components.resize(num_components);
+
+  // Read sampling factors and quant table index for each component.
+  std::vector<bool> ids_seen(256, false);
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    const int id = ReadUint8(data, pos);
+    if (ids_seen[id]) {  // (cf. section B.2.2, syntax of Ci)
+      return JXL_FAILURE("Duplicate ID %d in SOF.", id);
+    }
+    ids_seen[id] = true;
+    jpg->components[i].id = id;
+    int factor = ReadUint8(data, pos);
+    int h_samp_factor = factor >> 4;   // high nibble
+    int v_samp_factor = factor & 0xf;  // low nibble
+    JXL_JPEG_VERIFY_INPUT(h_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR);
+    JXL_JPEG_VERIFY_INPUT(v_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR);
+    jpg->components[i].h_samp_factor = h_samp_factor;
+    jpg->components[i].v_samp_factor = v_samp_factor;
+    jpg->components[i].quant_idx = ReadUint8(data, pos);
+    max_h_samp_factor = std::max(max_h_samp_factor, h_samp_factor);
+    max_v_samp_factor = std::max(max_v_samp_factor, v_samp_factor);
+  }
+
+  // We have checked above that none of the sampling factors are 0, so the max
+  // sampling factors can not be 0.
+  int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8);
+  int MCU_cols = DivCeil(jpg->width, max_h_samp_factor * 8);
+  // Compute the block dimensions for each component.
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    JPEGComponent* c = &jpg->components[i];
+    if (max_h_samp_factor % c->h_samp_factor != 0 ||
+        max_v_samp_factor % c->v_samp_factor != 0) {
+      return JXL_FAILURE("Non-integral subsampling ratios.");
+    }
+    c->width_in_blocks = MCU_cols * c->h_samp_factor;
+    c->height_in_blocks = MCU_rows * c->v_samp_factor;
+    const uint64_t num_blocks =
+        static_cast<uint64_t>(c->width_in_blocks) * c->height_in_blocks;
+    if (mode == JpegReadMode::kReadAll) {  // other modes allocate no coeffs
+      c->coeffs.resize(num_blocks * kDCTBlockSize);
+    }
+  }
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the Start of Scan (SOS) marker segment and appends the parsed scan
+// description to jpg->scan_info.
+bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(3);  // 2 length bytes + component count
+  size_t marker_len = ReadUint16(data, pos);
+  size_t comps_in_scan = ReadUint8(data, pos);
+  JXL_JPEG_VERIFY_INPUT(comps_in_scan, 1, jpg->components.size(),
+                        COMPS_IN_SCAN);
+
+  JPEGScanInfo scan_info;
+  scan_info.num_components = comps_in_scan;
+  JXL_JPEG_VERIFY_LEN(2 * comps_in_scan);  // id + table byte per component
+  std::vector<bool> ids_seen(256, false);
+  for (size_t i = 0; i < comps_in_scan; ++i) {
+    uint32_t id = ReadUint8(data, pos);
+    if (ids_seen[id]) {  // (cf. section B.2.3, regarding CSj)
+      return JXL_FAILURE("Duplicate ID %d in SOS.", id);
+    }
+    ids_seen[id] = true;
+    bool found_index = false;
+    for (size_t j = 0; j < jpg->components.size(); ++j) {
+      if (jpg->components[j].id == id) {
+        scan_info.components[i].comp_idx = j;
+        found_index = true;
+      }
+    }
+    if (!found_index) {
+      return JXL_FAILURE("SOS marker: Could not find component with id %d", id);
+    }
+    int c = ReadUint8(data, pos);
+    int dc_tbl_idx = c >> 4;   // high nibble
+    int ac_tbl_idx = c & 0xf;  // low nibble
+    JXL_JPEG_VERIFY_INPUT(dc_tbl_idx, 0, 3, HUFFMAN_INDEX);
+    JXL_JPEG_VERIFY_INPUT(ac_tbl_idx, 0, 3, HUFFMAN_INDEX);
+    scan_info.components[i].dc_tbl_idx = dc_tbl_idx;
+    scan_info.components[i].ac_tbl_idx = ac_tbl_idx;
+  }
+  JXL_JPEG_VERIFY_LEN(3);  // Ss, Se and the combined Ah/Al byte
+  scan_info.Ss = ReadUint8(data, pos);
+  scan_info.Se = ReadUint8(data, pos);
+  JXL_JPEG_VERIFY_INPUT(static_cast<int>(scan_info.Ss), 0, 63, START_OF_SCAN);
+  JXL_JPEG_VERIFY_INPUT(scan_info.Se, scan_info.Ss, 63, END_OF_SCAN);
+  int c = ReadUint8(data, pos);
+  scan_info.Ah = c >> 4;
+  scan_info.Al = c & 0xf;
+  if (scan_info.Ah != 0 && scan_info.Al != scan_info.Ah - 1) {
+    // section G.1.1.1.2 : Successive approximation control only improves
+    // by one bit at a time. But it's not always respected, so we just issue
+    // a warning.
+    JXL_WARNING("Invalid progressive parameters: Al=%d Ah=%d", scan_info.Al,
+                scan_info.Ah);
+  }
+  // Check that all the Huffman tables needed for this scan are defined.
+  for (size_t i = 0; i < comps_in_scan; ++i) {
+    bool found_dc_table = false;
+    bool found_ac_table = false;
+    for (size_t j = 0; j < jpg->huffman_code.size(); ++j) {
+      uint32_t slot_id = jpg->huffman_code[j].slot_id;
+      if (slot_id == scan_info.components[i].dc_tbl_idx) {
+        found_dc_table = true;
+      } else if (slot_id == scan_info.components[i].ac_tbl_idx + 16) {
+        found_ac_table = true;  // AC slots are matched as index + 16
+      }
+    }
+    if (scan_info.Ss == 0 && !found_dc_table) {
+      return JXL_FAILURE(
+          "SOS marker: Could not find DC Huffman table with index %d",
+          scan_info.components[i].dc_tbl_idx);
+    }
+    if (scan_info.Se > 0 && !found_ac_table) {
+      return JXL_FAILURE(
+          "SOS marker: Could not find AC Huffman table with index %d",
+          scan_info.components[i].ac_tbl_idx);
+    }
+  }
+  jpg->scan_info.push_back(scan_info);  // appended before the length check
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the Define Huffman Table (DHT) marker segment and fills in *jpg with
+// the parsed data. Builds the Huffman decoding table in either dc_huff_lut or
+// ac_huff_lut, depending on the type and slot_id of Huffman code being read.
+bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode,
+                std::vector<HuffmanTableEntry>* dc_huff_lut,
+                std::vector<HuffmanTableEntry>* ac_huff_lut, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  if (marker_len == 2) {
+    return JXL_FAILURE("DHT marker: no Huffman table found");
+  }
+  while (*pos < start_pos + marker_len) {  // one iteration per table
+    JXL_JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);  // slot id + counts
+    JPEGHuffmanCode huff;
+    huff.slot_id = ReadUint8(data, pos);
+    int huffman_index = huff.slot_id;
+    int is_ac_table = (huff.slot_id & 0x10) != 0;  // bit 4 selects AC
+    HuffmanTableEntry* huff_lut;
+    if (is_ac_table) {
+      huffman_index -= 0x10;
+      JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX);
+      huff_lut = &(*ac_huff_lut)[huffman_index * kJpegHuffmanLutSize];
+    } else {
+      JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX);
+      huff_lut = &(*dc_huff_lut)[huffman_index * kJpegHuffmanLutSize];
+    }
+    huff.counts[0] = 0;
+    int total_count = 0;
+    int space = 1 << kJpegHuffmanMaxBitLength;  // code space not yet assigned
+    int max_depth = 1;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      int count = ReadUint8(data, pos);
+      if (count != 0) {
+        max_depth = i;
+      }
+      huff.counts[i] = count;
+      total_count += count;
+      space -= count * (1 << (kJpegHuffmanMaxBitLength - i));
+    }
+    if (is_ac_table) {
+      JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize,
+                            HUFFMAN_CODE);
+    } else {
+      JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegDCAlphabetSize, HUFFMAN_CODE);
+    }
+    JXL_JPEG_VERIFY_LEN(total_count);
+    std::vector<bool> values_seen(256, false);
+    for (int i = 0; i < total_count; ++i) {
+      int value = ReadUint8(data, pos);
+      if (!is_ac_table) {
+        JXL_JPEG_VERIFY_INPUT(value, 0, kJpegDCAlphabetSize - 1, HUFFMAN_CODE);
+      }
+      if (values_seen[value]) {
+        return JXL_FAILURE("Duplicate Huffman code value %d", value);
+      }
+      values_seen[value] = true;
+      huff.values[i] = value;
+    }
+    // Add an invalid symbol that will have the all 1 code.
+    ++huff.counts[max_depth];
+    huff.values[total_count] = kJpegHuffmanAlphabetSize;
+    space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
+    if (space < 0) {
+      return JXL_FAILURE("Invalid Huffman code lengths.");
+    } else if (space > 0 && huff_lut[0].value != 0xffff) {
+      // Re-initialize the values to an invalid symbol so that we can recognize
+      // it when reading the bit stream using a Huffman code with space > 0.
+      for (int i = 0; i < kJpegHuffmanLutSize; ++i) {
+        huff_lut[i].bits = 0;
+        huff_lut[i].value = 0xffff;
+      }
+    }
+    huff.is_last = (*pos == start_pos + marker_len);
+    if (mode == JpegReadMode::kReadAll) {  // the lookup table is built only here
+      BuildJpegHuffmanTable(&huff.counts[0], &huff.values[0], huff_lut);
+    }
+    jpg->huffman_code.push_back(huff);
+  }
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the Define Quantization Table (DQT) marker segment and fills in *jpg
+// with the parsed data.
+bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  if (marker_len == 2) {
+    return JXL_FAILURE("DQT marker: no quantization table found");
+  }
+  while (*pos < start_pos + marker_len && jpg->quant.size() < kMaxQuantTables) {
+    JXL_JPEG_VERIFY_LEN(1);
+    int quant_table_index = ReadUint8(data, pos);
+    int quant_table_precision = quant_table_index >> 4;  // 0: 1-byte, 1: 2-byte
+    JXL_JPEG_VERIFY_INPUT(quant_table_precision, 0, 1, QUANT_TBL_PRECISION);
+    quant_table_index &= 0xf;
+    JXL_JPEG_VERIFY_INPUT(quant_table_index, 0, 3, QUANT_TBL_INDEX);
+    JXL_JPEG_VERIFY_LEN((quant_table_precision + 1) * kDCTBlockSize);
+    JPEGQuantTable table;
+    table.index = quant_table_index;
+    table.precision = quant_table_precision;
+    for (size_t i = 0; i < kDCTBlockSize; ++i) {
+      int quant_val =
+          quant_table_precision ? ReadUint16(data, pos) : ReadUint8(data, pos);
+      JXL_JPEG_VERIFY_INPUT(quant_val, 1, 65535, QUANT_VAL);
+      table.values[kJPEGNaturalOrder[i]] = quant_val;
+    }
+    table.is_last = (*pos == start_pos + marker_len);
+    jpg->quant.push_back(table);
+  }
+  JXL_JPEG_VERIFY_MARKER_END();  // also fails if tables were left unread
+  return true;
+}
+
+// Parses a DRI (define restart interval) segment into jpg->restart_interval.
+// At most one DRI segment is accepted; *found_dri records that one was seen.
+bool ProcessDRI(const uint8_t* data, const size_t len, size_t* pos,
+                bool* found_dri, JPEGData* jpg) {
+  if (*found_dri) return JXL_FAILURE("Duplicate DRI marker.");
+  *found_dri = true;
+  // start_pos and marker_len are consumed by JXL_JPEG_VERIFY_MARKER_END.
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(4);
+  const size_t marker_len = ReadUint16(data, pos);
+  // The restart interval is stored even if the length check below fails.
+  jpg->restart_interval = ReadUint16(data, pos);
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Saves the APP segment (marker byte, 2-byte length, payload) to jpg->app_data.
+bool ProcessAPP(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN);
+  JXL_JPEG_VERIFY_LEN(marker_len - 2);
+  JXL_DASSERT(*pos >= 3);
+  // Back up over the length field to the marker type byte so that it is
+  // stored together with the app data; built in place to avoid a copy.
+  const uint8_t* app_str_start = data + *pos - 3;
+  jpg->app_data.emplace_back(app_str_start, app_str_start + marker_len + 1);
+  *pos += marker_len - 2;
+  return true;
+}
+
+// Saves the COM segment (marker byte, length, payload) to jpg->com_data.
+bool ProcessCOM(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN);
+  JXL_JPEG_VERIFY_LEN(marker_len - 2);
+  JXL_DASSERT(*pos >= 3);  // same precondition as in ProcessAPP
+  const uint8_t* com_str_start = data + *pos - 3;  // marker type byte
+  jpg->com_data.emplace_back(com_str_start, com_str_start + marker_len + 1);
+  *pos += marker_len - 2;  // skip the payload; built in place, no copy
+  return true;
+}
+
+// Helper structure to read bits from the entropy coded data segment.
+struct BitReaderState {
+  BitReaderState(const uint8_t* data, const size_t len, size_t pos)
+      : data_(data), len_(len) {
+    Reset(pos);
+  }
+
+  void Reset(size_t pos) {  // restart reading at `pos` with an empty buffer
+    pos_ = pos;
+    val_ = 0;
+    bits_left_ = 0;
+    next_marker_pos_ = len_ - 2;  // NOTE(review): wraps around if len_ < 2
+    FillBitWindow();
+  }
+
+  // Returns the next byte and skips the 0xff/0x00 escape sequences.
+  uint8_t GetNextByte() {
+    if (pos_ >= next_marker_pos_) {  // past the data: yield 0, keep counting
+      ++pos_;
+      return 0;
+    }
+    uint8_t c = data_[pos_++];
+    if (c == 0xff) {
+      uint8_t escape = data_[pos_];
+      if (escape == 0) {
+        ++pos_;
+      } else {
+        // 0xff was followed by a non-zero byte, which means that we found the
+        // start of the next marker segment.
+        next_marker_pos_ = pos_ - 1;
+      }
+    }
+    return c;
+  }
+
+  void FillBitWindow() {
+    if (bits_left_ <= 16) {  // refill only when 16 or fewer bits remain
+      while (bits_left_ <= 56) {  // there is room for one more whole byte
+        val_ <<= 8;
+        val_ |= (uint64_t)GetNextByte();
+        bits_left_ += 8;
+      }
+    }
+  }
+
+  int ReadBits(int nbits) {  // returns the next nbits bits as an integer
+    FillBitWindow();
+    uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1);
+    bits_left_ -= nbits;
+    return val;
+  }
+
+  // Sets *pos to the next stream position where parsing should continue.
+  // Enqueue the padding bits seen (0 or 1).
+  // Returns false if there is inconsistent or invalid padding or the stream
+  // ended too early.
+  bool FinishStream(JPEGData* jpg, size_t* pos) {
+    int npadbits = bits_left_ & 7;  // bits left over in the current byte
+    if (npadbits > 0) {
+      uint64_t padmask = (1ULL << npadbits) - 1;
+      uint64_t padbits = (val_ >> (bits_left_ - npadbits)) & padmask;
+      if (padbits != padmask) {  // at least one padding bit is 0
+        jpg->has_zero_padding_bit = true;
+      }
+      for (int i = npadbits - 1; i >= 0; --i) {
+        jpg->padding_bits.push_back((padbits >> i) & 1);
+      }
+    }
+    // Give back some bytes that we did not use.
+    int unused_bytes_left = bits_left_ >> 3;
+    while (unused_bytes_left-- > 0) {
+      --pos_;
+      // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape
+      // sequence, and if yes, we need to give back one more byte.
+      if (pos_ < next_marker_pos_ && data_[pos_] == 0 &&
+          data_[pos_ - 1] == 0xff) {
+        --pos_;
+      }
+    }
+    if (pos_ > next_marker_pos_) {
+      // Data ran out before the scan was complete.
+      return JXL_FAILURE("Unexpected end of scan.");
+    }
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* data_;
+  const size_t len_;
+  size_t pos_;  // index of the next byte to fetch from data_
+  uint64_t val_;  // bit buffer; the newest byte sits in the low bits
+  int bits_left_;  // number of not yet consumed bits in val_
+  size_t next_marker_pos_;  // GetNextByte() yields zeros from here on
+};
+
+// Decodes and returns the next Huffman symbol using the two-level LUT table.
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) {
+  int nbits;
+  br->FillBitWindow();
+  int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff;  // peek at 8 bits
+  table += val;
+  nbits = table->bits - 8;
+  if (nbits > 0) {  // entry is a link: the code is longer than 8 bits
+    br->bits_left_ -= 8;
+    table += table->value;  // value is the offset to the 2nd level table
+    val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1);
+    table += val;
+  }
+  br->bits_left_ -= table->bits;  // consume only the bits of the matched entry
+  return table->value;
+}
+
+/**
+ * Returns the DC diff or AC value for extra bits value x and prefix code s.
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.1 – Difference magnitude categories for DC coding
+ *  SSSS | DIFF values
+ * ------+--------------------------
+ *     0 | 0
+ *     1 | –1, 1
+ *     2 | –3, –2, 2, 3
+ *     3 | –7..–4, 4..7
+ * ......|..........................
+ *    11 | –2047..–1024, 1024..2047
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.2 – Categories assigned to coefficient values
+ * [ Same as Table F.1, but does not include SSSS equal to 0 and 11]
+ *
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * F.1.2.1.1 Structure of DC code table
+ * For each category,... additional bits... appended... to uniquely identify
+ * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF
+ * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are
+ * appended... Most significant bit... is 0 for negative differences and 1 for
+ * positive differences.
+ *
+ * In other words the upper half of extra bits range represents DIFF as is.
+ * The lower half represents the negative DIFFs with an offset.
+ */
+int HuffExtend(int x, int s) {
+  JXL_DASSERT(s >= 1);
+  const int limit = 1 << s;
+  const bool is_negative = x < (limit >> 1);
+  if (is_negative) {
+    // Lower half of the range: shift down so that 0 maps to -(2^s - 1).
+    return x - limit + 1;
+  }
+  JXL_DASSERT(x < limit);
+  return x;
+}
+
+// Decodes one 8x8 block of DCT coefficients from the bit stream.
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
+                    const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, bool* reset_state, int* num_zero_runs,
+                    BitReaderState* br, JPEGData* jpg, coeff_t* last_dc_coeff,
+                    coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;  // multi-block EOB runs need an AC-only band
+  if (Ss == 0) {
+    int s = ReadSymbol(dc_huff, br);
+    if (s >= kJpegDCAlphabetSize) {
+      return JXL_FAILURE("Invalid Huffman symbol %d  for DC coefficient.", s);
+    }
+    int diff = 0;
+    if (s > 0) {
+      int bits = br->ReadBits(s);
+      diff = HuffExtend(bits, s);
+    }
+    int coeff = diff + *last_dc_coeff;  // DC is coded relative to the last one
+    const int dc_coeff = coeff * Am;
+    coeffs[0] = dc_coeff;
+    // TODO(eustas): is there a more elegant / explicit way to check this?
+    if (dc_coeff != coeffs[0]) {  // the value did not survive the coeff_t store
+      return JXL_FAILURE("Invalid DC coefficient %d", dc_coeff);
+    }
+    *last_dc_coeff = coeff;
+    ++Ss;  // the AC part, if any, starts at index 1
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  if (*eobrun > 0) {  // pending EOB run: nothing is read for this block
+    --(*eobrun);
+    return true;
+  }
+  *num_zero_runs = 0;
+  for (int k = Ss; k <= Se; k++) {
+    int sr = ReadSymbol(ac_huff, br);
+    if (sr >= kJpegHuffmanAlphabetSize) {
+      return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", sr,
+                         k);
+    }
+    int r = sr >> 4;  // high nibble: zero run length or EOB run exponent
+    int s = sr & 15;  // low nibble: number of extra bits of the value
+    if (s > 0) {
+      k += r;  // skip r zero coefficients
+      if (k > Se) {
+        return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss,
+                           Se);
+      }
+      if (s + Al >= kJpegDCAlphabetSize) {
+        return JXL_FAILURE(
+            "Out of range AC coefficient value: s = %d Al = %d k = %d", s, Al,
+            k);
+      }
+      int bits = br->ReadBits(s);
+      int coeff = HuffExtend(bits, s);
+      coeffs[kJPEGNaturalOrder[k]] = coeff * Am;
+      *num_zero_runs = 0;
+    } else if (r == 15) {  // 16 zeros: 15 here plus the loop increment
+      k += 15;
+      ++(*num_zero_runs);
+    } else {  // end-of-block symbol, starts a run of at least 1 << r blocks
+      if (eobrun_allowed && k == Ss && *eobrun == 0) {
+        // We have two end-of-block runs right after each other, so we signal
+        // the jpeg encoder to force a state reset at this point.
+        *reset_state = true;
+      }
+      *eobrun = 1 << r;
+      if (r > 0) {
+        if (!eobrun_allowed) {
+          return JXL_FAILURE("End-of-block run crossing DC coeff.");
+        }
+        *eobrun += br->ReadBits(r);
+      }
+      break;
+    }
+  }
+  --(*eobrun);  // account for the block that was just decoded
+  return true;
+}
+
+bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, bool* reset_state, BitReaderState* br,
+                    JPEGData* jpg, coeff_t* coeffs) {  // adds bit Al to a block
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;  // multi-block EOB runs need an AC-only band
+  if (Ss == 0) {
+    int s = br->ReadBits(1);  // one refinement bit for the DC coefficient
+    coeff_t dc_coeff = coeffs[0];
+    dc_coeff |= s * Am;
+    coeffs[0] = dc_coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int p1 = Am;   // value given to a newly nonzero positive coefficient
+  int m1 = -Am;  // value given to a newly nonzero negative coefficient
+  int k = Ss;
+  int r;
+  int s;
+  bool in_zero_run = false;
+  if (*eobrun <= 0) {
+    for (; k <= Se; k++) {
+      s = ReadSymbol(ac_huff, br);
+      if (s >= kJpegHuffmanAlphabetSize) {
+        return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", s,
+                           k);
+      }
+      r = s >> 4;
+      s &= 15;
+      if (s) {
+        if (s != 1) {  // only size 1 is accepted in a refinement scan
+          return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d",
+                             s, k);
+        }
+        s = br->ReadBits(1) ? p1 : m1;  // the next bit selects the sign
+        in_zero_run = false;
+      } else {
+        if (r != 15) {
+          if (eobrun_allowed && k == Ss && *eobrun == 0) {
+            // We have two end-of-block runs right after each other, so we
+            // signal the jpeg encoder to force a state reset at this point.
+            *reset_state = true;
+          }
+          *eobrun = 1 << r;
+          if (r > 0) {
+            if (!eobrun_allowed) {
+              return JXL_FAILURE("End-of-block run crossing DC coeff.");
+            }
+            *eobrun += br->ReadBits(r);
+          }
+          break;
+        }
+        in_zero_run = true;
+      }
+      do {  // pass over r zeros; nonzero values met on the way get refined
+        coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+        if (thiscoef != 0) {
+          if (br->ReadBits(1)) {  // correction bit for an existing value
+            if ((thiscoef & p1) == 0) {
+              if (thiscoef >= 0) {
+                thiscoef += p1;
+              } else {
+                thiscoef += m1;
+              }
+            }
+          }
+          coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+        } else {
+          if (--r < 0) {
+            break;
+          }
+        }
+        k++;
+      } while (k <= Se);
+      if (s) {
+        if (k > Se) {
+          return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss,
+                             Se);
+        }
+        coeffs[kJPEGNaturalOrder[k]] = s;  // place the new +-Am coefficient
+      }
+    }
+  }
+  if (in_zero_run) {
+    return JXL_FAILURE("Extra zero run before end-of-block.");
+  }
+  if (*eobrun > 0) {  // inside an EOB run: only existing values are refined
+    for (; k <= Se; k++) {
+      coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+      if (thiscoef != 0) {
+        if (br->ReadBits(1)) {
+          if ((thiscoef & p1) == 0) {
+            if (thiscoef >= 0) {
+              thiscoef += p1;
+            } else {
+              thiscoef += m1;
+            }
+          }
+        }
+        coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+      }
+    }
+  }
+  --(*eobrun);  // account for the block that was just processed
+  return true;
+}
+
+bool ProcessRestart(const uint8_t* data, const size_t len,
+                    int* next_restart_marker, BitReaderState* br,
+                    JPEGData* jpg) {
+  // Close the current entropy coded segment; pos receives the resume offset.
+  size_t pos = 0;
+  const bool finished = br->FinishStream(jpg, &pos);
+  if (!finished) return JXL_FAILURE("Invalid scan");
+  JXL_JPEG_EXPECT_MARKER();
+  const int wanted = 0xd0 + *next_restart_marker;
+  const int found = data[pos + 1];
+  if (found != wanted) {
+    return JXL_FAILURE("Did not find expected restart marker %d actual %d",
+                       wanted, found);
+  }
+  // Resume behind the two marker bytes; the restart counter wraps at 8.
+  br->Reset(pos + 2);
+  *next_restart_marker = (*next_restart_marker + 1) & 0x7;
+  return true;
+}
+
+bool ProcessScan(const uint8_t* data, const size_t len,
+                 const std::vector<HuffmanTableEntry>& dc_huff_lut,
+                 const std::vector<HuffmanTableEntry>& ac_huff_lut,
+                 uint16_t scan_progression[kMaxComponents][kDCTBlockSize],
+                 bool is_progressive, size_t* pos, JPEGData* jpg) {
+  if (!ProcessSOS(data, len, pos, jpg)) {
+    return false;
+  }
+  JPEGScanInfo* scan_info = &jpg->scan_info.back();
+  bool is_interleaved = (scan_info->num_components > 1);
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    max_h_samp_factor =
+        std::max(max_h_samp_factor, jpg->components[i].h_samp_factor);
+    max_v_samp_factor =
+        std::max(max_v_samp_factor, jpg->components[i].v_samp_factor);
+  }
+
+  int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8);
+  int MCUs_per_row = DivCeil(jpg->width, max_h_samp_factor * 8);
+  if (!is_interleaved) {  // single component: one block per MCU
+    const JPEGComponent& c = jpg->components[scan_info->components[0].comp_idx];
+    MCUs_per_row = DivCeil(jpg->width * c.h_samp_factor, 8 * max_h_samp_factor);
+    MCU_rows = DivCeil(jpg->height * c.v_samp_factor, 8 * max_v_samp_factor);
+  }
+  coeff_t last_dc_coeff[kMaxComponents] = {0};  // DC predictors
+  BitReaderState br(data, len, *pos);
+  int restarts_to_go = jpg->restart_interval;
+  int next_restart_marker = 0;
+  int eobrun = -1;
+  int block_scan_index = 0;
+  const int Al = is_progressive ? scan_info->Al : 0;
+  const int Ah = is_progressive ? scan_info->Ah : 0;
+  const int Ss = is_progressive ? scan_info->Ss : 0;
+  const int Se = is_progressive ? scan_info->Se : 63;
+  const uint16_t scan_bitmask = Ah == 0 ? (0xffff << Al) : (1u << Al);
+  const uint16_t refinement_bitmask = (1 << Al) - 1;
+  for (size_t i = 0; i < scan_info->num_components; ++i) {
+    int comp_idx = scan_info->components[i].comp_idx;
+    for (int k = Ss; k <= Se; ++k) {
+      if (scan_progression[comp_idx][k] & scan_bitmask) {
+        return JXL_FAILURE(
+            "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u",
+            comp_idx, k, scan_progression[comp_idx][k], scan_bitmask);
+      }
+      if (scan_progression[comp_idx][k] & refinement_bitmask) {
+        return JXL_FAILURE(
+            "Invalid scan order, a more refined scan was already done: "
+            "component=%d k=%d prev_mask=%u cur_mask=%u",
+            comp_idx, k, scan_progression[comp_idx][k], scan_bitmask);
+      }
+      scan_progression[comp_idx][k] |= scan_bitmask;  // mark bits as coded
+    }
+  }
+  if (Al > 10) {
+    return JXL_FAILURE("Scan parameter Al=%d is not supported.", Al);
+  }
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Handle the restart intervals.
+      if (jpg->restart_interval > 0) {
+        if (restarts_to_go == 0) {
+          if (ProcessRestart(data, len, &next_restart_marker, &br, jpg)) {
+            restarts_to_go = jpg->restart_interval;
+            memset(static_cast<void*>(last_dc_coeff), 0, sizeof(last_dc_coeff));
+            if (eobrun > 0) {
+              return JXL_FAILURE("End-of-block run too long.");
+            }
+            eobrun = -1;  // fresh start
+          } else {
+            return JXL_FAILURE("Could not process restart.");
+          }
+        }
+        --restarts_to_go;
+      }
+      // Decode one MCU.
+      for (size_t i = 0; i < scan_info->num_components; ++i) {
+        JPEGComponentScanInfo* si = &scan_info->components[i];
+        JPEGComponent* c = &jpg->components[si->comp_idx];
+        const HuffmanTableEntry* dc_lut =
+            &dc_huff_lut[si->dc_tbl_idx * kJpegHuffmanLutSize];
+        const HuffmanTableEntry* ac_lut =
+            &ac_huff_lut[si->ac_tbl_idx * kJpegHuffmanLutSize];
+        int nblocks_y = is_interleaved ? c->v_samp_factor : 1;
+        int nblocks_x = is_interleaved ? c->h_samp_factor : 1;
+        for (int iy = 0; iy < nblocks_y; ++iy) {
+          for (int ix = 0; ix < nblocks_x; ++ix) {
+            int block_y = mcu_y * nblocks_y + iy;
+            int block_x = mcu_x * nblocks_x + ix;
+            int block_idx = block_y * c->width_in_blocks + block_x;
+            bool reset_state = false;
+            int num_zero_runs = 0;
+            coeff_t* coeffs = &c->coeffs[block_idx * kDCTBlockSize];
+            if (Ah == 0) {  // first pass over these coefficient bits
+              if (!DecodeDCTBlock(dc_lut, ac_lut, Ss, Se, Al, &eobrun,
+                                  &reset_state, &num_zero_runs, &br, jpg,
+                                  &last_dc_coeff[si->comp_idx], coeffs)) {
+                return false;
+              }
+            } else {  // refinement pass: one more bit per coefficient
+              if (!RefineDCTBlock(ac_lut, Ss, Se, Al, &eobrun, &reset_state,
+                                  &br, jpg, coeffs)) {
+                return false;
+              }
+            }
+            if (reset_state) {
+              scan_info->reset_points.emplace_back(block_scan_index);
+            }
+            if (num_zero_runs > 0) {
+              JPEGScanInfo::ExtraZeroRunInfo info;
+              info.block_idx = block_scan_index;
+              info.num_extra_zero_runs = num_zero_runs;
+              scan_info->extra_zero_runs.push_back(info);
+            }
+            ++block_scan_index;
+          }
+        }
+      }
+    }
+  }
+  if (eobrun > 0) {
+    return JXL_FAILURE("End-of-block run too long.");
+  }
+  if (!br.FinishStream(jpg, pos)) {
+    return JXL_FAILURE("Invalid scan.");
+  }
+  if (*pos > len) {
+    return JXL_FAILURE("Unexpected end of file during scan. pos=%" PRIuS
+                       " len=%" PRIuS,
+                       *pos, len);
+  }
+  return true;
+}
+
+// Rewrites each component's quant_idx from a quantization table id into the
+// position of the table with that id inside the jpg->quant array.
+bool FixupIndexes(JPEGData* jpg) {
+  for (auto& component : jpg->components) {
+    const size_t num_tables = jpg->quant.size();
+    size_t slot = 0;
+    // Linear search for the first table whose id matches.
+    while (slot < num_tables &&
+           !(jpg->quant[slot].index == component.quant_idx)) {
+      ++slot;
+    }
+    if (slot == num_tables) {
+      return JXL_FAILURE("Quantization table with index %u not found",
+                         component.quant_idx);
+    }
+    // Store the array position in place of the id.
+    component.quant_idx = slot;
+  }
+  return true;
+}
+
+size_t FindNextMarker(const uint8_t* data, const size_t len, size_t pos) {
+  // kAccepted[i] != 0 means that (0xc0 + i) is treated as a marker code.
+  static const uint8_t kAccepted[] = {
+      1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+      1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+  };
+  const size_t start = pos;
+  for (; pos + 1 < len; ++pos) {
+    const uint8_t code = data[pos + 1];
+    // A marker is 0xff followed by an accepted code; stop right on the 0xff.
+    if (data[pos] == 0xff && code >= 0xc0 && kAccepted[code - 0xc0]) break;
+  }
+  return pos - start;
+}
+
+}  // namespace
+
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
+              JPEGData* jpg) {
+  size_t pos = 0;  // parsing always starts at the first byte of data
+  // Check SOI marker.
+  JXL_JPEG_EXPECT_MARKER();
+  int marker = data[pos + 1];
+  pos += 2;
+  if (marker != 0xd8) {
+    return JXL_FAILURE("Did not find expected SOI marker, actual=%d", marker);
+  }
+  int lut_size = kMaxHuffmanTables * kJpegHuffmanLutSize;  // all table slots
+  std::vector<HuffmanTableEntry> dc_huff_lut(lut_size);
+  std::vector<HuffmanTableEntry> ac_huff_lut(lut_size);
+  bool found_sof = false;
+  bool found_dri = false;
+  uint16_t scan_progression[kMaxComponents][kDCTBlockSize] = {{0}};
+
+  jpg->padding_bits.resize(0);
+  bool is_progressive = false;  // default
+  do {
+    // Read next marker.
+    size_t num_skipped = FindNextMarker(data, len, pos);
+    if (num_skipped > 0) {
+      // Add a fake marker to indicate arbitrary in-between-markers data.
+      jpg->marker_order.push_back(0xff);
+      jpg->inter_marker_data.emplace_back(data + pos, data + pos + num_skipped);
+      pos += num_skipped;
+    }
+    JXL_JPEG_EXPECT_MARKER();
+    marker = data[pos + 1];
+    pos += 2;
+    bool ok = true;
+    switch (marker) {
+      case 0xc0:
+      case 0xc1:
+      case 0xc2:
+        is_progressive = (marker == 0xc2);
+        ok = ProcessSOF(data, len, mode, &pos, jpg);
+        found_sof = true;
+        break;
+      case 0xc4:
+        ok = ProcessDHT(data, len, mode, &dc_huff_lut, &ac_huff_lut, &pos, jpg);
+        break;
+      case 0xd0:
+      case 0xd1:
+      case 0xd2:
+      case 0xd3:
+      case 0xd4:
+      case 0xd5:
+      case 0xd6:
+      case 0xd7:
+        // RST markers do not have any data.
+        break;
+      case 0xd9:
+        // Found end marker.
+        break;
+      case 0xda:
+        if (mode == JpegReadMode::kReadAll) {  // other modes skip scan data
+          ok = ProcessScan(data, len, dc_huff_lut, ac_huff_lut,
+                           scan_progression, is_progressive, &pos, jpg);
+        }
+        break;
+      case 0xdb:
+        ok = ProcessDQT(data, len, &pos, jpg);
+        break;
+      case 0xdd:
+        ok = ProcessDRI(data, len, &pos, &found_dri, jpg);
+        break;
+      case 0xe0:
+      case 0xe1:
+      case 0xe2:
+      case 0xe3:
+      case 0xe4:
+      case 0xe5:
+      case 0xe6:
+      case 0xe7:
+      case 0xe8:
+      case 0xe9:
+      case 0xea:
+      case 0xeb:
+      case 0xec:
+      case 0xed:
+      case 0xee:
+      case 0xef:
+        if (mode != JpegReadMode::kReadTables) {  // not parsed in table mode
+          ok = ProcessAPP(data, len, &pos, jpg);
+        }
+        break;
+      case 0xfe:
+        if (mode != JpegReadMode::kReadTables) {  // not parsed in table mode
+          ok = ProcessCOM(data, len, &pos, jpg);
+        }
+        break;
+      default:
+        return JXL_FAILURE("Unsupported marker: %d pos=%" PRIuS " len=%" PRIuS,
+                           marker, pos, len);
+    }
+    if (!ok) {
+      return false;
+    }
+    jpg->marker_order.push_back(marker);  // remember the order of the markers
+    if (mode == JpegReadMode::kReadHeader && found_sof) {
+      break;  // header-only mode stops once a SOF has been parsed
+    }
+  } while (marker != 0xd9);  // 0xd9 (end marker) terminates the loop
+
+  if (!found_sof) {
+    return JXL_FAILURE("Missing SOF marker.");
+  }
+
+  // Supplemental checks.
+  if (mode == JpegReadMode::kReadAll) {
+    if (pos < len) {  // bytes that follow the end marker
+      jpg->tail_data = std::vector<uint8_t>(data + pos, data + len);
+    }
+    if (!FixupIndexes(jpg)) {
+      return false;
+    }
+    if (jpg->huffman_code.empty()) {
+      // Section B.2.4.2: "If a table has never been defined for a particular
+      // destination, then when this destination is specified in a scan header,
+      // the results are unpredictable."
+      return JXL_FAILURE("Need at least one Huffman code table.");
+    }
+    if (jpg->huffman_code.size() >= kMaxDHTMarkers) {
+      return JXL_FAILURE("Too many Huffman tables.");
+    }
+  }
+  return true;
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h
new file mode 100644
index 0000000000..3fad820e9d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Functions for reading a jpeg byte stream into a JPEGData object.
+
+#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_
+#define LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+enum class JpegReadMode {
+  kReadHeader,  // stop after the first SOF marker has been parsed
+  kReadTables,  // whole stream, but skip APP/COM segments and scan data
+  kReadAll,     // everything, including the entropy coded scan data
+};
+
+// Parses the JPEG stream contained in data[0 ... len) and fills in *jpg with
+// the parsed information.
+// If mode is kReadHeader, it fills in only the image dimensions in *jpg.
+// Returns false if the data is not valid JPEG, or if it contains an unsupported
+// JPEG feature.
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
+              JPEGData* jpg);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc
new file mode 100644
index 0000000000..38282e640a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc
@@ -0,0 +1,103 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h"
+
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+// Computes the index width (in bits) of the next 2nd level table. count holds
+// the number of not yet placed symbols per code length and len is the code
+// length of the symbol that is about to be placed.
+static inline int NextTableBitSize(const int* count, int len) {
+  const int max_len = static_cast<int>(kJpegHuffmanMaxBitLength);
+  int free_slots = 1 << (len - kJpegHuffmanRootTableBits);
+  for (; len < max_len; ++len, free_slots <<= 1) {
+    // Codes of this length take slots; stop once the sub-table is full.
+    free_slots -= count[len];
+    if (free_slots <= 0) break;
+  }
+  return len - kJpegHuffmanRootTableBits;
+}
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut) {  // fills lut in place
+  HuffmanTableEntry code;    // current table entry
+  HuffmanTableEntry* table;  // next available space in table
+  int len;                   // current code length
+  int idx;                   // symbol index
+  int key;                   // prefix code
+  int reps;                  // number of replicate key values in current table
+  int low;                   // low bits for current root entry
+  int table_bits;            // key length of current table
+  int table_size;            // size of current table
+
+  // Make a local copy of the input bit length histogram.
+  int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0};
+  int total_count = 0;
+  for (len = 1; len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    tmp_count[len] = count[len];
+    total_count += tmp_count[len];
+  }
+
+  table = lut;
+  table_bits = kJpegHuffmanRootTableBits;
+  table_size = 1 << table_bits;
+
+  // Special case code with only one value.
+  if (total_count == 1) {
+    code.bits = 0;  // the single symbol is decoded without consuming bits
+    code.value = symbols[0];
+    for (key = 0; key < table_size; ++key) {
+      table[key] = code;
+    }
+    return;
+  }
+
+  // Fill in root table.
+  key = 0;
+  idx = 0;
+  for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      code.bits = len;
+      code.value = symbols[idx++];
+      reps = 1 << (kJpegHuffmanRootTableBits - len);  // one per suffix
+      while (reps--) {
+        table[key++] = code;
+      }
+    }
+  }
+
+  // Fill in 2nd level tables and add pointers to root table.
+  table += table_size;  // sub-tables are stored right behind the root table
+  table_size = 0;
+  low = 0;
+  for (len = kJpegHuffmanRootTableBits + 1;
+       len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      // Start a new sub-table if the previous one is full.
+      if (low >= table_size) {
+        table += table_size;
+        table_bits = NextTableBitSize(tmp_count, len);
+        table_size = 1 << table_bits;
+        low = 0;
+        lut[key].bits = table_bits + kJpegHuffmanRootTableBits;  // link entry
+        lut[key].value = (table - lut) - key;  // offset from this root entry
+        ++key;
+      }
+      code.bits = len - kJpegHuffmanRootTableBits;  // bits beyond the root
+      code.value = symbols[idx++];
+      reps = 1 << (table_bits - code.bits);
+      while (reps--) {
+        table[low++] = code;
+      }
+    }
+  }
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h
new file mode 100644
index 0000000000..b8a60e4107
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Utility function for building a Huffman lookup table for the jpeg decoder.
+
+#ifndef LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_
+#define LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_
+
+#include <stdint.h>
+
+namespace jxl {
+namespace jpeg {
+
+constexpr int kJpegHuffmanRootTableBits = 8;  // log2 of the root table size
+// Maximum huffman lookup table size.
+// According to zlib/examples/enough.c, 758 entries are always enough for
+// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and
+// max bit length 16 if the root table has 8 bits.
+constexpr int kJpegHuffmanLutSize = 758;
+
+struct HuffmanTableEntry {
+  // Initialize the value to an invalid symbol so that we can recognize it
+  // when reading the bit stream using a Huffman code with space > 0.
+  HuffmanTableEntry() : bits(0), value(0xffff) {}
+
+  uint8_t bits;    // bits used at this level; a root entry > 8 is a link
+  uint16_t value;  // symbol value, or for a link the offset to its sub-table
+};
+
+// Builds jpeg-style Huffman lookup table from the given symbols.
+// The symbols are in order of increasing bit lengths. The number of symbols
+// with bit length n is given in counts[n] for each n >= 1.
+void BuildJpegHuffmanTable(const uint32_t* counts, const uint32_t* symbols,
+                           HuffmanTableEntry* lut);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc
new file mode 100644
index 0000000000..430707b9ed
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc
@@ -0,0 +1,451 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace jpeg {
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+namespace {
+enum JPEGComponentType : uint32_t {
+  kGray = 0,
+  kYCbCr = 1,
+  kRGB = 2,
+  kCustom = 3,
+};
+
+struct JPEGInfo {
+  size_t num_app_markers = 0;  // Markers 0xe0..0xef seen by VisitMarker.
+  size_t num_com_markers = 0;  // Markers 0xfe seen by VisitMarker.
+  size_t num_scans = 0;  // Markers 0xda seen by VisitMarker.
+  size_t num_intermarker = 0;  // Fake 0xff markers (intermarker data).
+  bool has_dri = false;  // Set once a 0xdd marker has been seen.
+};
+
+Status VisitMarker(uint8_t* marker, Visitor* visitor, JPEGInfo* info) {
+  uint32_t marker32 = *marker - 0xc0;  // Coded in 6 bits as offset from 0xc0.
+  JXL_RETURN_IF_ERROR(visitor->Bits(6, 0x00, &marker32));
+  *marker = marker32 + 0xc0;
+  if ((*marker & 0xf0) == 0xe0) {  // 0xe0..0xef: counted as APP markers.
+    info->num_app_markers++;
+  }
+  if (*marker == 0xfe) {  // Counted as a COM marker.
+    info->num_com_markers++;
+  }
+  if (*marker == 0xda) {  // Counted as one scan.
+    info->num_scans++;
+  }
+  // We use a fake 0xff marker to signal intermarker data.
+  if (*marker == 0xff) {
+    info->num_intermarker++;
+  }
+  if (*marker == 0xdd) {  // Restart interval is serialized only if seen.
+    info->has_dri = true;
+  }
+  return true;
+}
+
+}  // namespace
+
+Status JPEGData::VisitFields(Visitor* visitor) {
+  bool is_gray = components.size() == 1;
+  JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_gray));
+  if (visitor->IsReading()) {
+    components.resize(is_gray ? 1 : 3);
+  }
+  JPEGInfo info;
+  if (visitor->IsReading()) {
+    uint8_t marker = 0xc0;
+    do {
+      JXL_RETURN_IF_ERROR(VisitMarker(&marker, visitor, &info));
+      marker_order.push_back(marker);
+      if (marker_order.size() > 16384) {
+        return JXL_FAILURE("Too many markers: %" PRIuS "\n",
+                           marker_order.size());
+      }
+    } while (marker != 0xd9);  // 0xd9 (EOI) terminates the list.
+  } else {
+    if (marker_order.size() > 16384) {
+      return JXL_FAILURE("Too many markers: %" PRIuS "\n", marker_order.size());
+    }
+    for (size_t i = 0; i < marker_order.size(); i++) {
+      JXL_RETURN_IF_ERROR(VisitMarker(&marker_order[i], visitor, &info));
+    }
+    if (!marker_order.empty()) {
+      // Last marker should always be EOI marker.
+      JXL_CHECK(marker_order.back() == 0xd9);
+    }
+  }
+
+  // Number and size of the APP and COM markers (plus the number of scans).
+  if (visitor->IsReading()) {
+    app_data.resize(info.num_app_markers);
+    app_marker_type.resize(info.num_app_markers);
+    com_data.resize(info.num_com_markers);
+    scan_info.resize(info.num_scans);
+  }
+  JXL_ASSERT(app_data.size() == info.num_app_markers);
+  JXL_ASSERT(app_marker_type.size() == info.num_app_markers);
+  JXL_ASSERT(com_data.size() == info.num_com_markers);
+  JXL_ASSERT(scan_info.size() == info.num_scans);
+  for (size_t i = 0; i < app_data.size(); i++) {
+    auto& app = app_data[i];
+    // Encodes up to 8 different values.
+    JXL_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), BitsOffset(1, 2), BitsOffset(2, 4), 0,
+                     reinterpret_cast<uint32_t*>(&app_marker_type[i])));
+    if (app_marker_type[i] != AppMarkerType::kUnknown &&
+        app_marker_type[i] != AppMarkerType::kICC &&
+        app_marker_type[i] != AppMarkerType::kExif &&
+        app_marker_type[i] != AppMarkerType::kXMP) {
+      return JXL_FAILURE("Unknown app marker type %u",
+                         static_cast<uint32_t>(app_marker_type[i]));
+    }
+    uint32_t len = app.size() - 1;  // Coded as (size - 1) in 16 bits.
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len));
+    if (visitor->IsReading()) app.resize(len + 1);
+    if (app.size() < 3) {
+      return JXL_FAILURE("Invalid marker size: %" PRIuS "\n", app.size());
+    }
+  }
+  for (auto& com : com_data) {
+    uint32_t len = com.size() - 1;  // Coded as (size - 1) in 16 bits.
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len));
+    if (visitor->IsReading()) com.resize(len + 1);
+    if (com.size() < 3) {
+      return JXL_FAILURE("Invalid marker size: %" PRIuS "\n", com.size());
+    }
+  }
+
+  uint32_t num_quant_tables = quant.size();
+  JXL_RETURN_IF_ERROR(
+      visitor->U32(Val(1), Val(2), Val(3), Val(4), 2, &num_quant_tables));
+  if (num_quant_tables == 4) {
+    return JXL_FAILURE("Invalid number of quant tables");
+  }
+  if (visitor->IsReading()) {
+    quant.resize(num_quant_tables);
+  }
+  for (size_t i = 0; i < num_quant_tables; i++) {
+    if (quant[i].precision > 1) {
+      return JXL_FAILURE(
+          "Quant tables with more than 16 bits are not supported");
+    }
+    JXL_RETURN_IF_ERROR(visitor->Bits(1, 0, &quant[i].precision));
+    JXL_RETURN_IF_ERROR(visitor->Bits(2, i, &quant[i].index));
+    JXL_RETURN_IF_ERROR(visitor->Bool(true, &quant[i].is_last));
+  }
+
+  JPEGComponentType component_type =
+      components.size() == 1 && components[0].id == 1 ? JPEGComponentType::kGray
+      : components.size() == 3 && components[0].id == 1 &&
+              components[1].id == 2 && components[2].id == 3
+          ? JPEGComponentType::kYCbCr
+      : components.size() == 3 && components[0].id == 'R' &&
+              components[1].id == 'G' && components[2].id == 'B'
+          ? JPEGComponentType::kRGB
+          : JPEGComponentType::kCustom;
+  JXL_RETURN_IF_ERROR(
+      visitor->Bits(2, JPEGComponentType::kYCbCr,
+                    reinterpret_cast<uint32_t*>(&component_type)));
+  uint32_t num_components;
+  if (component_type == JPEGComponentType::kGray) {
+    num_components = 1;
+  } else if (component_type != JPEGComponentType::kCustom) {
+    num_components = 3;
+  } else {
+    num_components = components.size();
+    JXL_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), Val(4), 3, &num_components));
+    if (num_components != 1 && num_components != 3) {
+      return JXL_FAILURE("Invalid number of components: %u", num_components);
+    }
+  }
+  if (visitor->IsReading()) {
+    components.resize(num_components);
+  }
+  if (component_type == JPEGComponentType::kCustom) {
+    for (size_t i = 0; i < components.size(); i++) {
+      JXL_RETURN_IF_ERROR(visitor->Bits(8, 0, &components[i].id));
+    }
+  } else if (component_type == JPEGComponentType::kGray) {
+    components[0].id = 1;
+  } else if (component_type == JPEGComponentType::kRGB) {
+    components[0].id = 'R';
+    components[1].id = 'G';
+    components[2].id = 'B';
+  } else {
+    components[0].id = 1;
+    components[1].id = 2;
+    components[2].id = 3;
+  }
+  size_t used_tables = 0;
+  for (size_t i = 0; i < components.size(); i++) {
+    JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &components[i].quant_idx));
+    if (components[i].quant_idx >= quant.size()) {
+      return JXL_FAILURE("Invalid quant table for component %" PRIuS ": %u\n",
+                         i, components[i].quant_idx);
+    }
+    used_tables |= 1U << components[i].quant_idx;
+  }
+  for (size_t i = 0; i < quant.size(); i++) {
+    if (used_tables & (1 << i)) continue;
+    if (i == 0) return JXL_FAILURE("First quant table unused.");
+    // Unused quant table has to be set to copy of previous quant table
+    for (size_t j = 0; j < 64; j++) {
+      if (quant[i].values[j] != quant[i - 1].values[j]) {
+        return JXL_FAILURE("Non-trivial unused quant table");
+      }
+    }
+  }
+
+  uint32_t num_huff = huffman_code.size();
+  JXL_RETURN_IF_ERROR(visitor->U32(Val(4), BitsOffset(3, 2), BitsOffset(4, 10),
+                                   BitsOffset(6, 26), 4, &num_huff));
+  if (visitor->IsReading()) {
+    huffman_code.resize(num_huff);
+  }
+  for (JPEGHuffmanCode& hc : huffman_code) {
+    bool is_ac = hc.slot_id >> 4;
+    uint32_t id = hc.slot_id & 0xF;
+    JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_ac));
+    JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &id));
+    hc.slot_id = (static_cast<uint32_t>(is_ac) << 4) | id;
+    JXL_RETURN_IF_ERROR(visitor->Bool(true, &hc.is_last));
+    size_t num_symbols = 0;
+    for (size_t i = 0; i <= 16; i++) {
+      JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(3, 2),
+                                       Bits(8), 0, &hc.counts[i]));
+      num_symbols += hc.counts[i];
+    }
+    if (num_symbols < 1) {
+      // Actually, at least 2 symbols are required, since one of them is EOI.
+      return JXL_FAILURE("Empty Huffman table");
+    }
+    if (num_symbols > hc.values.size()) {
+      return JXL_FAILURE("Huffman code too large (%" PRIuS ")", num_symbols);
+    }
+    // Presence flags for 4 * 64 + 1 values.
+    uint64_t value_slots[5] = {};
+    for (size_t i = 0; i < num_symbols; i++) {
+      // Goes up to 256, included. Might have the same symbol appear twice...
+      JXL_RETURN_IF_ERROR(visitor->U32(Bits(2), BitsOffset(2, 4),
+                                       BitsOffset(4, 8), BitsOffset(8, 1), 0,
+                                       &hc.values[i]));
+      value_slots[hc.values[i] >> 6] |= (uint64_t)1 << (hc.values[i] & 0x3F);
+    }
+    if (hc.values[num_symbols - 1] != kJpegHuffmanAlphabetSize) {
+      return JXL_FAILURE("Missing EOI symbol");
+    }
+    // The last element, denoting EOI, has to be 1 after the loop.
+    JXL_ASSERT(value_slots[4] == 1);
+    size_t num_values = 1;
+    for (size_t i = 0; i < 4; ++i) num_values += hwy::PopCount(value_slots[i]);
+    if (num_values != num_symbols) {
+      return JXL_FAILURE("Duplicate Huffman symbols");
+    }
+    if (!is_ac) {
+      bool only_dc = ((value_slots[0] >> kJpegDCAlphabetSize) | value_slots[1] |
+                      value_slots[2] | value_slots[3]) == 0;
+      if (!only_dc) return JXL_FAILURE("Huffman symbols out of DC range");
+    }
+  }
+
+  for (auto& scan : scan_info) {
+    JXL_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &scan.num_components));
+    if (scan.num_components >= 4) {
+      return JXL_FAILURE("Invalid number of components in SOS marker");
+    }
+    JXL_RETURN_IF_ERROR(visitor->Bits(6, 0, &scan.Ss));
+    JXL_RETURN_IF_ERROR(visitor->Bits(6, 63, &scan.Se));
+    JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Al));
+    JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Ah));
+    for (size_t i = 0; i < scan.num_components; i++) {
+      JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].comp_idx));
+      if (scan.components[i].comp_idx >= components.size()) {
+        return JXL_FAILURE("Invalid component idx in SOS marker");
+      }
+      JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].ac_tbl_idx));
+      JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].dc_tbl_idx));
+    }
+    // TODO(veluca): actually set and use this value.
+    JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), Val(2), BitsOffset(3, 3),
+                                     kMaxNumPasses - 1,
+                                     &scan.last_needed_pass));
+  }
+
+  // From here on, this is data that is not strictly necessary to get a valid
+  // JPEG, but necessary for bit-exact JPEG reconstruction.
+  if (info.has_dri) {
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &restart_interval));
+  }
+
+  for (auto& scan : scan_info) {
+    uint32_t num_reset_points = scan.reset_points.size();
+    JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4),
+                                     BitsOffset(16, 20), 0, &num_reset_points));
+    if (visitor->IsReading()) {
+      scan.reset_points.resize(num_reset_points);
+    }
+    int last_block_idx = -1;  // Indices are coded as deltas from last + 1.
+    for (auto& block_idx : scan.reset_points) {
+      block_idx -= last_block_idx + 1;
+      JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1),
+                                       BitsOffset(5, 9), BitsOffset(28, 41), 0,
+                                       &block_idx));
+      block_idx += last_block_idx + 1;
+      if (static_cast<int>(block_idx) < last_block_idx + 1) {
+        return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx,
+                           last_block_idx);
+      }
+      // TODO(eustas): better upper boundary could be given at this point; also
+      //               it could be applied during reset_points reading.
+      if (block_idx > (1u << 30)) {
+        // At most 8K x 8K x num_channels blocks are expected. That is,
+        // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane
+        // image.
+        return JXL_FAILURE("Invalid block ID: %u", block_idx);
+      }
+      last_block_idx = block_idx;
+    }
+
+    uint32_t num_extra_zero_runs = scan.extra_zero_runs.size();
+    JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4),
+                                     BitsOffset(16, 20), 0,
+                                     &num_extra_zero_runs));
+    if (visitor->IsReading()) {
+      scan.extra_zero_runs.resize(num_extra_zero_runs);
+    }
+    last_block_idx = -1;
+    for (size_t i = 0; i < scan.extra_zero_runs.size(); ++i) {
+      uint32_t& block_idx = scan.extra_zero_runs[i].block_idx;
+      JXL_RETURN_IF_ERROR(visitor->U32(
+          Val(1), BitsOffset(2, 2), BitsOffset(4, 5), BitsOffset(8, 20), 1,
+          &scan.extra_zero_runs[i].num_extra_zero_runs));
+      block_idx -= last_block_idx + 1;
+      JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1),
+                                       BitsOffset(5, 9), BitsOffset(28, 41), 0,
+                                       &block_idx));
+      block_idx += last_block_idx + 1;
+      if (static_cast<int>(block_idx) < last_block_idx + 1) {
+        return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx,
+                           last_block_idx);
+      }
+      if (block_idx > (1u << 30)) {
+        // At most 8K x 8K x num_channels blocks are expected. That is,
+        // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane
+        // image.
+        return JXL_FAILURE("Invalid block ID: %u", block_idx);
+      }
+      last_block_idx = block_idx;
+    }
+  }
+  std::vector<uint32_t> inter_marker_data_sizes;  // Applied after parsing.
+  inter_marker_data_sizes.reserve(info.num_intermarker);
+  for (size_t i = 0; i < info.num_intermarker; ++i) {
+    uint32_t len = visitor->IsReading() ? 0 : inter_marker_data[i].size();
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len));
+    if (visitor->IsReading()) inter_marker_data_sizes.emplace_back(len);
+  }
+  uint32_t tail_data_len = tail_data.size();
+  if (!visitor->IsReading() && tail_data_len > 4260096) {
+    return JXL_FAILURE("Tail data too large (max size = 4260096, size = %u).",
+                       tail_data_len);
+  }
+  JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(8, 1),
+                                   BitsOffset(16, 257), BitsOffset(22, 65793),
+                                   0, &tail_data_len));
+
+  JXL_RETURN_IF_ERROR(visitor->Bool(false, &has_zero_padding_bit));
+  if (has_zero_padding_bit) {
+    uint32_t nbit = padding_bits.size();
+    JXL_RETURN_IF_ERROR(visitor->Bits(24, 0, &nbit));
+    if (visitor->IsReading()) {
+      padding_bits.reserve(std::min<uint32_t>(1024u, nbit));
+      for (uint32_t i = 0; i < nbit; i++) {
+        bool bbit = false;
+        JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
+        padding_bits.push_back(bbit);
+      }
+    } else {
+      for (uint8_t& bit : padding_bits) {
+        bool bbit = bit;
+        JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
+        bit = bbit;
+      }
+    }
+  }
+
+  // Apply postponed actions.
+  if (visitor->IsReading()) {
+    tail_data.resize(tail_data_len);
+    JXL_ASSERT(inter_marker_data_sizes.size() == info.num_intermarker);
+    inter_marker_data.reserve(info.num_intermarker);
+    for (size_t i = 0; i < info.num_intermarker; ++i) {
+      inter_marker_data.emplace_back(inter_marker_data_sizes[i]);
+    }
+  }
+
+  return true;
+}
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+void JPEGData::CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row,
+                                int* MCU_rows) const {
+  const bool is_interleaved = (scan.num_components > 1);
+  const JPEGComponent& base_component = components[scan.components[0].comp_idx];
+  // h_group / v_group act as numerators for converting number of blocks to
+  // number of MCUs. In interleaved mode they are 1, so an MCU is represented
+  // by max_*_samp_factor blocks. In non-interleaved mode the numerator is the
+  // sampling factor; consequently an MCU is always represented by a single
+  // block.
+  const int h_group = is_interleaved ? 1 : base_component.h_samp_factor;
+  const int v_group = is_interleaved ? 1 : base_component.v_samp_factor;
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (const auto& c : components) {
+    max_h_samp_factor = std::max(c.h_samp_factor, max_h_samp_factor);
+    max_v_samp_factor = std::max(c.v_samp_factor, max_v_samp_factor);
+  }
+  *MCUs_per_row = DivCeil(width * h_group, 8 * max_h_samp_factor);
+  *MCU_rows = DivCeil(height * v_group, 8 * max_v_samp_factor);
+}
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data) {
+  size_t icc_pos = 0;  // Number of bytes of `icc` consumed so far.
+  for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
+    if (jpeg_data->app_marker_type[i] != jpeg::AppMarkerType::kICC) continue;
+    auto& app = jpeg_data->app_data[i];
+    // Payload starts at offset 17; a shorter marker would underflow `len`.
+    if (app.size() < 17) return JXL_FAILURE("ICC marker is too small");
+    const size_t len = app.size() - 17;
+    if (len > icc.size() - icc_pos) {  // icc_pos <= icc.size() always holds.
+      return JXL_FAILURE(
+          "ICC length is less than APP markers: requested %" PRIuS
+          " more bytes, %" PRIuS " available",
+          len, icc.size() - icc_pos);
+    }
+    if (len != 0) memcpy(app.data() + 17, icc.data() + icc_pos, len);
+    icc_pos += len;
+  }
+  if (icc_pos != icc.size() && icc_pos != 0) {
+    return JXL_FAILURE("ICC length is more than APP markers");
+  }
+  return true;
+}
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h
new file mode 100644
index 0000000000..70ff4f8e05
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h
@@ -0,0 +1,216 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Data structures that represent the non-pixel contents of a jpeg file.
+
+#ifndef LIB_JXL_JPEG_JPEG_DATA_H_
+#define LIB_JXL_JPEG_JPEG_DATA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace jpeg {
+
+constexpr int kMaxComponents = 4;
+constexpr int kMaxQuantTables = 4;
+constexpr int kMaxHuffmanTables = 4;
+constexpr size_t kJpegHuffmanMaxBitLength = 16;
+constexpr int kJpegHuffmanAlphabetSize = 256;
+constexpr int kJpegDCAlphabetSize = 12;
+constexpr int kMaxDHTMarkers = 512;
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kApp1 = 0xE1;
+constexpr uint8_t kApp2 = 0xE2;
+const uint8_t kIccProfileTag[12] = "ICC_PROFILE";
+const uint8_t kExifTag[6] = "Exif\0";
+const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/";
+
+/* clang-format off */
+constexpr uint32_t kJPEGNaturalOrder[80] = {
+  0,   1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  // extra entries for safety in decoder
+  63, 63, 63, 63, 63, 63, 63, 63,
+  63, 63, 63, 63, 63, 63, 63, 63
+};
+
+constexpr uint32_t kJPEGZigZagOrder[64] = {
+  0,   1,  5,  6, 14, 15, 27, 28,
+  2,   4,  7, 13, 16, 26, 29, 42,
+  3,   8, 12, 17, 25, 30, 41, 43,
+  9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54,
+  20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61,
+  35, 36, 48, 49, 57, 58, 62, 63
+};
+/* clang-format on */
+
+// Quantization values for an 8x8 pixel block.
+struct JPEGQuantTable {
+  std::array<int32_t, kDCTBlockSize> values = {};  // Zero until filled in.
+  uint32_t precision = 0;  // Serialized as a single bit by JPEGData.
+  // The index of this quantization table as it was parsed from the input JPEG.
+  // Each DQT marker segment contains an 'index' field, and we save this index
+  // here. Valid values are 0 to 3.
+  uint32_t index = 0;
+  // Set to true if this table is the last one within its marker segment.
+  bool is_last = true;
+};
+
+// Huffman code and decoding lookup table used for DC and AC coefficients.
+struct JPEGHuffmanCode {
+  // Bit length histogram.
+  std::array<uint32_t, kJpegHuffmanMaxBitLength + 1> counts = {};
+  // Symbol values sorted by increasing bit lengths.
+  std::array<uint32_t, kJpegHuffmanAlphabetSize + 1> values = {};
+  // The index of the Huffman code in the current set of Huffman codes. For AC
+  // component Huffman codes, 0x10 is added to the index.
+  int slot_id = 0;
+  // Set to true if this Huffman code is the last one within its marker segment.
+  bool is_last = true;
+};
+
+// Huffman table indexes used for one component of one scan.
+struct JPEGComponentScanInfo {
+  uint32_t comp_idx;
+  uint32_t dc_tbl_idx;
+  uint32_t ac_tbl_idx;
+};
+
+// Contains information that is used in one scan.
+struct JPEGScanInfo {
+  // Parameters used for progressive scans (named the same way as in the spec):
+  //   Ss : Start of spectral band in zig-zag sequence.
+  //   Se : End of spectral band in zig-zag sequence.
+  //   Ah : Successive approximation bit position, high.
+  //   Al : Successive approximation bit position, low.
+  uint32_t Ss = 0;
+  uint32_t Se = 0;
+  uint32_t Ah = 0;
+  uint32_t Al = 0;
+  uint32_t num_components = 0;
+  std::array<JPEGComponentScanInfo, 4> components = {};
+  // Last codestream pass that is needed to write this scan.
+  uint32_t last_needed_pass = 0;
+
+  // Extra information required for bit-precise JPEG file reconstruction.
+
+  // Set of block indexes where the JPEG encoder has to flush the end-of-block
+  // runs and refinement bits.
+  std::vector<uint32_t> reset_points;
+  // The number of extra zero runs (Huffman symbol 0xf0) before the end of
+  // block (if nonzero), indexed by block index.
+  // All of these symbols can be omitted without changing the pixel values, but
+  // some jpeg encoders put these at the end of blocks.
+  typedef struct {
+    uint32_t block_idx;
+    uint32_t num_extra_zero_runs;
+  } ExtraZeroRunInfo;
+  std::vector<ExtraZeroRunInfo> extra_zero_runs;
+};
+
+typedef int16_t coeff_t;
+
+// Represents one component of a jpeg file.
+struct JPEGComponent {
+  JPEGComponent()
+      : id(0),
+        h_samp_factor(1),
+        v_samp_factor(1),
+        quant_idx(0),
+        width_in_blocks(0),
+        height_in_blocks(0) {}
+
+  // One-byte id of the component.
+  uint32_t id;
+  // Horizontal and vertical sampling factors.
+  // In interleaved mode, each minimal coded unit (MCU) has
+  // h_samp_factor x v_samp_factor DCT blocks from this component.
+  int h_samp_factor;
+  int v_samp_factor;
+  // The index of the quantization table used for this component.
+  uint32_t quant_idx;
+  // The dimensions of the component measured in 8x8 blocks.
+  uint32_t width_in_blocks;
+  uint32_t height_in_blocks;
+  // The DCT coefficients of this component, laid out block-by-block, divided
+  // through the quantization matrix values.
+  std::vector<coeff_t> coeffs;
+};
+
+enum class AppMarkerType : uint32_t {
+  kUnknown = 0,
+  kICC = 1,
+  kExif = 2,
+  kXMP = 3,
+};
+
+// Represents a parsed jpeg file.
+struct JPEGData : public Fields {
+  JPEGData()
+      : width(0), height(0), restart_interval(0), has_zero_padding_bit(false) {}
+
+  JXL_FIELDS_NAME(JPEGData)
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  // Doesn't serialize everything - skips brotli-encoded data and what is
+  // already encoded in the codestream.
+  Status VisitFields(Visitor* visitor) override;
+#else
+  Status VisitFields(Visitor* /* visitor */) override {
+    JXL_ABORT("JPEG transcoding support not enabled");
+  }
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+  void CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row,
+                        int* MCU_rows) const;
+
+  int width;
+  int height;
+  uint32_t restart_interval;
+  std::vector<std::vector<uint8_t>> app_data;
+  std::vector<AppMarkerType> app_marker_type;
+  std::vector<std::vector<uint8_t>> com_data;
+  std::vector<JPEGQuantTable> quant;
+  std::vector<JPEGHuffmanCode> huffman_code;
+  std::vector<JPEGComponent> components;
+  std::vector<JPEGScanInfo> scan_info;
+  std::vector<uint8_t> marker_order;
+  std::vector<std::vector<uint8_t>> inter_marker_data;
+  std::vector<uint8_t> tail_data;
+
+  // Extra information required for bit-precise JPEG file reconstruction.
+
+  bool has_zero_padding_bit;
+  std::vector<uint8_t> padding_bits;
+};
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+// Set ICC profile in jpeg_data.
+Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data);
+#else
+static JXL_INLINE Status SetJPEGDataFromICC(const PaddedBytes& /* icc */,
+                                            jpeg::JPEGData* /* jpeg_data */) {
+  JXL_ABORT("JPEG transcoding support not enabled");
+}
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_JPEG_DATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jxl.syms b/third_party/jpeg-xl/lib/jxl/jxl.syms
new file mode 100644
index 0000000000..0f398d7151
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl.syms
@@ -0,0 +1,5 @@
+{
+  extern "C" {
+    jpegxl_*;
+  };
+};
diff --git a/third_party/jpeg-xl/lib/jxl/jxl.version b/third_party/jpeg-xl/lib/jxl/jxl.version
new file mode 100644
index 0000000000..26b0e9e54d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl.version
@@ -0,0 +1,17 @@
+JXL_0 {
+  global:
+    Jxl*;
+
+  local:
+    # Hide all the std namespace symbols. std namespace is explicitly marked
+    # as visibility(default) and header-only functions or methods (such as those
+    # from templates) should be exposed in shared libraries as weak symbols but
+    # this is only needed when we expose those types in the shared library API
+    # in any way. We don't use C++ std types in the API and we also don't
+    # support exceptions in the library.
+    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
+    # about this.
+    extern "C++" {
+      *std::*;
+    };
+};
diff --git a/third_party/jpeg-xl/lib/jxl/jxl_inspection.h b/third_party/jpeg-xl/lib/jxl/jxl_inspection.h
new file mode 100644
index 0000000000..0b70a58523
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl_inspection.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JXL_INSPECTION_H_
+#define LIB_JXL_JXL_INSPECTION_H_
+
+#include <functional>
+
+#include "lib/jxl/image.h"
+
+namespace jxl {
+// Type of the inspection-callback which, if enabled, will be called on various
+// intermediate data during image processing, allowing inspection access.
+//
+// Returns false if processing can be stopped at that point, true otherwise.
+// This is only advisory - it is always OK to just continue processing.
+using InspectorImage3F = std::function<bool(const char*, const Image3F&)>;
+}  // namespace jxl
+
+#endif  // LIB_JXL_JXL_INSPECTION_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jxl_osx.syms b/third_party/jpeg-xl/lib/jxl/jxl_osx.syms
new file mode 100644
index 0000000000..96bc568025
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl_osx.syms
@@ -0,0 +1 @@
+_Jxl*
diff --git a/third_party/jpeg-xl/lib/jxl/jxl_test.cc b/third_party/jpeg-xl/lib/jxl/jxl_test.cc
new file mode 100644
index 0000000000..0a676802f6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl_test.cc
@@ -0,0 +1,1537 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/jxl.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <array>
+#include <future>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+#include "tools/box/box.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+using extras::JXLCompressParams;
+using extras::JXLDecompressParams;
+using extras::PackedPixelFile;
+using test::ButteraugliDistance;
+using test::ComputeDistance2;
+using test::Roundtrip;
+using test::TestImage;
+using test::ThreadPoolForTests;
+
+#define JXL_TEST_NL 0  // Disabled in code
+
+TEST(JxlTest, RoundtripSinglePixel) {
+  TestImage img;  // 1x1, zero-filled; default settings, no thread pool.
+  img.SetDimensions(1, 1).AddFrame().ZeroFill();
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), {}, {}, /*pool=*/nullptr, &out), 55);
+}
+
+TEST(JxlTest, RoundtripSinglePixelWithAlpha) {
+  TestImage img;  // 1x1, four channels, zero-filled; default settings.
+  img.SetDimensions(1, 1).SetChannels(4).AddFrame().ZeroFill();
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), {}, {}, /*pool=*/nullptr, &out), 59);
+}
+
+// Changing serialized signature causes Decode to fail.
+#ifndef JXL_CRASH_ON_ERROR
+TEST(JxlTest, RoundtripMarker) {
+  TestImage t;
+  t.SetDimensions(1, 1).AddFrame().ZeroFill();
+  for (size_t i = 0; i < 2; ++i) {  // Flip each of the first two bytes in turn.
+    std::vector<uint8_t> compressed;
+    ASSERT_TRUE(extras::EncodeImageJXL({}, t.ppf(), /*jpeg_bytes=*/nullptr,
+                                       &compressed));  // Indexed just below.
+    compressed[i] ^= 0xFF;
+    PackedPixelFile ppf_out;
+    EXPECT_FALSE(extras::DecodeImageJXL(compressed.data(), compressed.size(),
+                                        {}, /*decoded_bytes=*/nullptr,
+                                        &ppf_out));
+  }
+}
+#endif
+
+TEST(JxlTest, RoundtripTinyFast) {
+  // No thread pool; 32x32 image at effort 7, distance 4; only size is checked.
+  JXLCompressParams enc;
+  enc.distance = 4.0f;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata().SetDimensions(32, 32);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 192, 10);
+}
+
+TEST(JxlTest, RoundtripSmallD1) {
+  // Default settings, no thread pool; dimensions set to 1/8 of the source's.
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  const size_t small_w = img.ppf().info.xsize / 8;
+  const size_t small_h = img.ppf().info.ysize / 8;
+  img.SetDimensions(small_w, small_h);
+
+  {
+    PackedPixelFile out;
+    EXPECT_NEAR(Roundtrip(img.ppf(), {}, {}, nullptr, &out), 766, 40);
+    EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.2));
+  }
+
+  // An intensity target below the default should give a smaller bitrate; the
+  // decoded metadata must also report the same target.
+  img.ppf().info.intensity_target = 100.0f;
+
+  {
+    PackedPixelFile out;
+    EXPECT_NEAR(Roundtrip(img.ppf(), {}, {}, nullptr, &out), 659, 20);
+    EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.3));
+    EXPECT_EQ(out.info.intensity_target, img.ppf().info.intensity_target);
+  }
+}
+TEST(JxlTest, RoundtripResample2) {
+  // Resampling factor 2 at effort 3 (kFalcon), no thread pool.
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 18772, 200);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(90));
+}
+
+TEST(JxlTest, RoundtripResample2Slow) {
+  // Resampling factor 2 at effort 9 (kTortoise), distance 10, no thread pool.
+  JXLCompressParams enc;
+  enc.distance = 10.0;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 4088, 200);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(250));
+}
+
+TEST(JxlTest, RoundtripResample2MT) {
+  ThreadPoolForTests pool(4);
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  // Needs an image large enough to span several groups after downsampling.
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), 228283, 1000);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(340));
+}
+
+// Roundtrips through a fake parallel runner that executes single-threaded but
+// visits tasks in a seeded random order.
+TEST(JxlTest, RoundtripOutOfOrderProcessing) {
+  FakeParallelRunner shuffled_runner(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &shuffled_runner);
+  JXLCompressParams enc;
+  // EPF is forced on so that a lot of border is needed.
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3);
+
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  // Dimensions are picked so the required block border exceeds the number of
+  // pixels available on the next block.
+  img.SetDimensions(513, 515);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), 22584, 400);
+  EXPECT_LE(ButteraugliDistance(img.ppf(), out), 1.35);
+}
+
+TEST(JxlTest, RoundtripOutOfOrderProcessingBorder) {
+  FakeParallelRunner shuffled_runner(/*order_seed=*/47, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &shuffled_runner);
+  JXLCompressParams enc;
+  // EPF is forced on so that a lot of border is needed.
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  // Dimensions are picked so the required block border exceeds the number of
+  // pixels available on the next block.
+  img.SetDimensions(513, 515);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), 10907, 200);
+  EXPECT_LE(ButteraugliDistance(img.ppf(), out), 2.9);
+}
+
+TEST(JxlTest, RoundtripResample4) {
+  // Resampling factor 4, otherwise default settings; no thread pool.
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 4);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 5824, 100);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(22));
+}
+
+TEST(JxlTest, RoundtripResample8) {
+  // Resampling factor 8, otherwise default settings; no thread pool.
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 8);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 2036, 50);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(50));
+}
+
+TEST(JxlTest, RoundtripUnalignedD2) {
+  // Distance 2 on dimensions of source width / 12 by source height / 7.
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  const size_t odd_w = img.ppf().info.xsize / 12;
+  const size_t odd_h = img.ppf().info.ysize / 7;
+  img.SetDimensions(odd_w, odd_h);
+
+  JXLCompressParams enc;
+  enc.distance = 2.0;
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 506, 30);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.72));
+}
+
+TEST(JxlTest, RoundtripMultiGroup) {
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata().SetDimensions(600, 1024);
+
+  // Roundtrips `img` at the given speed tier and distance on its own pool of
+  // 4, then checks the compressed size (within 700) and the distance.
+  auto run_case = [&](jxl::SpeedTier tier, float distance, size_t want_size,
+                      float max_dist) {
+    JXLCompressParams enc;
+    enc.distance = distance;
+    const int64_t effort = 10 - static_cast<int>(tier);
+    enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, effort);
+    ThreadPoolForTests pool(4);
+    PackedPixelFile out;
+    EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), want_size, 700);
+    EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(max_dist));
+  };
+
+  // The futures returned by std::async block in their destructors.
+  auto kitten_job = std::async(std::launch::async, run_case,
+                               SpeedTier::kKitten, 1.0f, 54895u, 11.7);
+  auto wombat_job = std::async(std::launch::async, run_case,
+                               SpeedTier::kWombat, 2.0f, 33507u, 20.0);
+}
+
+TEST(JxlTest, RoundtripRGBToGrayscale) {
+  ThreadPoolForTests pool(4);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  io.ShrinkTo(600, 1024);
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 1.0f;
+  cparams.speed_tier = SpeedTier::kFalcon;
+
+  JXLDecompressParams dparams;
+  dparams.color_space = "Gra_D65_Rel_SRG";  // Ask for grayscale output.
+
+  CodecInOut io2;
+  EXPECT_FALSE(io.Main().IsGray());
+  size_t compressed_size = 0;  // Well-defined even if Roundtrip bails out.
+  JXL_EXPECT_OK(
+      Roundtrip(&io, cparams, dparams, &io2, _, &compressed_size, &pool));
+  EXPECT_LE(compressed_size, 65000u);
+  EXPECT_TRUE(io2.Main().IsGray());
+
+  // Convert the original to grayscale by hand (in linear sRGB), because
+  // TransformTo refuses to convert between grayscale and RGB.
+  ColorEncoding srgb_lin = ColorEncoding::LinearSRGB(/*is_gray=*/false);
+  ASSERT_TRUE(io.frames[0].TransformTo(srgb_lin, GetJxlCms()));
+  Image3F* color = io.Main().color();
+  for (size_t y = 0; y < color->ysize(); ++y) {
+    float* row_r = color->PlaneRow(0, y);
+    float* row_g = color->PlaneRow(1, y);
+    float* row_b = color->PlaneRow(2, y);
+    for (size_t x = 0; x < color->xsize(); ++x) {
+      float luma = 0.2126 * row_r[x] + 0.7152 * row_g[x] + 0.0722 * row_b[x];
+      row_r[x] = row_g[x] = row_b[x] = luma;  // Same value in all planes.
+    }
+  }
+  ColorEncoding srgb_gamma = ColorEncoding::SRGB(/*is_gray=*/false);
+  ASSERT_TRUE(io.frames[0].TransformTo(srgb_gamma, GetJxlCms()));
+  io.metadata.m.color_encoding = io2.Main().c_current();
+  io.Main().OverrideProfile(io2.Main().c_current());
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr, &pool),
+      IsSlightlyBelow(1.36));
+}
+
+TEST(JxlTest, RoundtripLargeFast) {
+  ThreadPoolForTests pool(8);  // Full flower image at effort 7.
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), 445684, 5000);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(100));
+}
+
+TEST(JxlTest, RoundtripDotsForceEpf) {
+  ThreadPoolForTests pool(8);
+  JXLCompressParams enc;  // Effort 7 with EPF set to 2 and dots set to 1.
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EPF, 2);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), 41472, 300);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(18));
+}
+
+// Runs the same distance-2 roundtrip twice in a row and requires identical
+// size and distance. Distance 2 involves additional processing, including
+// adaptive reconstruction; a mismatch may indicate races or invalid accesses.
+TEST(JxlTest, RoundtripD2Consistent) {
+  ThreadPoolForTests pool(8);
+  JXLCompressParams enc;
+  enc.distance = 2.0;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  // Widths 48 down to 41: each xsize mod kBlockDim, for right border handling.
+  for (size_t width = 48; width >= 41; --width) {
+    img.SetDimensions(width, 15);
+
+    PackedPixelFile out_a;
+    const size_t size_a = Roundtrip(img.ppf(), enc, {}, &pool, &out_a);
+
+    PackedPixelFile out_b;
+    const size_t size_b = Roundtrip(img.ppf(), enc, {}, &pool, &out_b);
+
+    // Compressed sizes must match exactly.
+    EXPECT_EQ(size_a, size_b);
+
+    // So must the distances to the original.
+    const float dist_a = ComputeDistance2(img.ppf(), out_a);
+    const float dist_b = ComputeDistance2(img.ppf(), out_b);
+    EXPECT_EQ(dist_a, dist_b);
+  }
+}
+
+// Same check as RoundtripD2Consistent, but on the full multi-group image.
+TEST(JxlTest, RoundtripLargeConsistent) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 2.0;
+
+  auto roundtrip_and_compare = [&]() {
+    ThreadPoolForTests pool(8);  // Each invocation gets its own pool.
+    PackedPixelFile ppf_out;
+    const size_t size = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out);
+    const double dist = ComputeDistance2(t.ppf(), ppf_out);
+    return std::tuple<size_t, double>(size, dist);
+  };
+
+  // Run the identical roundtrip twice, concurrently, via std::launch::async.
+  auto future2 = std::async(std::launch::async, roundtrip_and_compare);
+  auto future3 = std::async(std::launch::async, roundtrip_and_compare);
+
+  const auto result2 = future2.get();
+  const auto result3 = future3.get();
+
+  // Exact same compressed size.
+  EXPECT_EQ(std::get<0>(result2), std::get<0>(result3));
+
+  // Exact same distance.
+  EXPECT_EQ(std::get<1>(result2), std::get<1>(result3));
+}
+
+TEST(JxlTest, RoundtripSmallNL) {
+  // Default compression and decompression settings, no thread pool.
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  const size_t small_w = img.ppf().info.xsize / 8;
+  const size_t small_h = img.ppf().info.ysize / 8;
+  img.SetDimensions(small_w, small_h);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), {}, {}, nullptr, &out), 783, 25);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.1));
+}
+
+TEST(JxlTest, RoundtripNoGaborishNoAR) {
+  // Both the EPF and the gaborish frame settings are set to 0; no thread pool.
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EPF, 0);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);
+
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 38561, 200);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.8));
+}
+
+TEST(JxlTest, RoundtripSmallNoGaborish) {
+  // Gaborish set to 0 on dimensions of 1/8 of the source's; no thread pool.
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  const size_t small_w = img.ppf().info.xsize / 8;
+  const size_t small_h = img.ppf().info.ysize / 8;
+  img.SetDimensions(small_w, small_h);
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 811, 20);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.1));
+}
+
+TEST(JxlTest, RoundtripSmallPatchesAlpha) {
+  // Synthetic 256x256 four-channel image; no thread pool is used.
+  TestImage img;
+  img.SetDimensions(256, 256).SetChannels(4);
+  img.SetColorEncoding("RGB_D65_SRG_Rel_Lin");
+  TestImage::Frame frame = img.AddFrame();
+  frame.ZeroFill();
+  // Channel 3 is 1.0 everywhere; channel 1 carries a regular pattern that the
+  // patch detection heuristics should pick up.
+  for (size_t y = 0; y < img.ppf().info.ysize; ++y) {
+    const bool pattern_row = (y / 32) % 4 == 0;
+    for (size_t x = 0; x < img.ppf().info.xsize; ++x) {
+      if (pattern_row && x % 4 == 0) frame.SetValue(y, x, 1, 127.0f / 255.0f);
+      frame.SetValue(y, x, 3, 1.0f);
+    }
+  }
+
+  JXLCompressParams enc;
+  enc.distance = 0.1f;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 597, 100);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(0.012f));
+}
+
+TEST(JxlTest, RoundtripSmallPatches) {
+  // Synthetic 256x256 image with the default channel count; no thread pool.
+  TestImage img;
+  img.SetDimensions(256, 256);
+  img.SetColorEncoding("RGB_D65_SRG_Rel_Lin");
+  TestImage::Frame frame = img.AddFrame();
+  frame.ZeroFill();
+  // Channel 1 carries a regular pattern (every 4th column, on rows where
+  // (y / 32) % 4 == 0) that the patch detection heuristics should pick up.
+  for (size_t y = 0; y < img.ppf().info.ysize; ++y) {
+    const bool pattern_row = (y / 32) % 4 == 0;
+    for (size_t x = 0; x < img.ppf().info.xsize; ++x) {
+      if (pattern_row && x % 4 == 0) frame.SetValue(y, x, 1, 127.0f / 255.0f);
+    }
+  }
+
+  JXLCompressParams enc;
+  enc.distance = 0.1f;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, nullptr, &out), 486, 100);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(0.012f));
+}
+
+// TODO(szabadka) Add encoder and decoder API functions that accept frame
+// buffers in arbitrary unsigned and floating point formats, and then roundtrip
+// test the lossless codepath to make sure the exact binary representations
+// are preserved.
+#if 0  // Disabled; see the TODO above.
+TEST(JxlTest, RoundtripImageBundleOriginalBits) {
+  // Image does not matter, only io.metadata.m and io2.metadata.m are tested.
+  Image3F image(1, 1);
+  ZeroFillImage(&image);
+  CodecInOut io;
+  io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
+  io.SetFromImage(std::move(image), ColorEncoding::LinearSRGB());
+
+  CompressParams cparams;
+
+  // Test unsigned integers from 1 to 32 bits
+  for (uint32_t bit_depth = 1; bit_depth <= 32; bit_depth++) {
+    if (bit_depth == 32) {
+      // TODO(lode): allow testing 32, however the code below ends up in
+      // enc_modular which does not support 32. We only want to test the header
+      // encoding though, so try without modular.
+      break;
+    }
+
+    io.metadata.m.SetUintSamples(bit_depth);
+    CodecInOut io2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+
+    EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
+    EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample);
+    EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits());
+  }
+
+  // Test various existing and non-existing floating point formats
+  for (uint32_t bit_depth = 8; bit_depth <= 32; bit_depth++) {
+    if (bit_depth != 32) {
+      // TODO: test other float types once they work
+      break;  // NOTE(review): hit on the first pass (8 != 32); `continue`?
+    }
+
+    uint32_t exponent_bit_depth;
+    if (bit_depth < 10) {
+      exponent_bit_depth = 2;
+    } else if (bit_depth < 12) {
+      exponent_bit_depth = 3;
+    } else if (bit_depth < 16) {
+      exponent_bit_depth = 4;
+    } else if (bit_depth < 20) {
+      exponent_bit_depth = 5;
+    } else if (bit_depth < 24) {
+      exponent_bit_depth = 6;
+    } else if (bit_depth < 28) {
+      exponent_bit_depth = 7;
+    } else {
+      exponent_bit_depth = 8;
+    }
+
+    io.metadata.m.bit_depth.bits_per_sample = bit_depth;
+    io.metadata.m.bit_depth.floating_point_sample = true;
+    io.metadata.m.bit_depth.exponent_bits_per_sample = exponent_bit_depth;
+
+    CodecInOut io2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2));
+
+    EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_TRUE(io2.metadata.m.bit_depth.floating_point_sample);
+    EXPECT_EQ(exponent_bit_depth,
+              io2.metadata.m.bit_depth.exponent_bits_per_sample);
+    EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits());
+  }
+}
+#endif  // 0
+
+TEST(JxlTest, RoundtripGrayscale) {
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
+  CodecInOut src;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(png), &src));
+  ASSERT_NE(src.xsize(), 0u);
+  src.ShrinkTo(128, 128);
+  EXPECT_TRUE(src.Main().IsGray());
+  EXPECT_EQ(8u, src.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_FALSE(src.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(0u, src.metadata.m.bit_depth.exponent_bits_per_sample);
+  EXPECT_TRUE(src.metadata.m.color_encoding.tf.IsSRGB());
+
+  PassesEncoderState state;  // Reused by all three encodes below.
+  AuxOut* aux_out = nullptr;
+
+  {  // Distance 1, default decode parameters: the output must stay gray.
+    CompressParams params;
+    params.butteraugli_distance = 1.0;
+
+    PaddedBytes bytes;
+    EXPECT_TRUE(
+        EncodeFile(params, &src, &state, &bytes, GetJxlCms(), aux_out));
+    EXPECT_LE(bytes.size(), 7000u);
+
+    CodecInOut dec;
+    EXPECT_TRUE(test::DecodeFile({}, Span<const uint8_t>(bytes), &dec));
+    EXPECT_TRUE(dec.Main().IsGray());
+    EXPECT_THAT(ButteraugliDistance(src.frames, dec.frames, params.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(1.6));
+  }
+
+  // Repeat with a larger butteraugli distance so that different jxl codepaths
+  // trigger.
+  {
+    CompressParams params;
+    params.butteraugli_distance = 8.0;
+
+    PaddedBytes bytes;
+    EXPECT_TRUE(
+        EncodeFile(params, &src, &state, &bytes, GetJxlCms(), aux_out));
+    EXPECT_LE(bytes.size(), 1300u);
+
+    CodecInOut dec;
+    EXPECT_TRUE(test::DecodeFile({}, Span<const uint8_t>(bytes), &dec));
+    EXPECT_TRUE(dec.Main().IsGray());
+    EXPECT_THAT(ButteraugliDistance(src.frames, dec.frames, params.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(6.0));
+  }
+
+  {  // Distance 1 again, but decoding into an RGB color space.
+    CompressParams params;
+    params.butteraugli_distance = 1.0;
+
+    PaddedBytes bytes;
+    EXPECT_TRUE(
+        EncodeFile(params, &src, &state, &bytes, GetJxlCms(), aux_out));
+    EXPECT_LE(bytes.size(), 7000u);
+
+    JXLDecompressParams dparams;
+    dparams.color_space = "RGB_D65_SRG_Rel_SRG";
+    CodecInOut dec;
+    EXPECT_TRUE(
+        test::DecodeFile(dparams, Span<const uint8_t>(bytes), &dec));
+    EXPECT_FALSE(dec.Main().IsGray());
+
+    EXPECT_THAT(ButteraugliDistance(src.frames, dec.frames, params.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(1.6));
+  }
+}
+
+TEST(JxlTest, RoundtripAlpha) {
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  CodecInOut src;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(png), &src));
+
+  ASSERT_NE(src.xsize(), 0u);
+  ASSERT_TRUE(src.metadata.m.HasAlpha());
+  ASSERT_TRUE(src.Main().HasAlpha());
+  src.ShrinkTo(300, 300);
+
+  EXPECT_EQ(8u, src.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_FALSE(src.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(0u, src.metadata.m.bit_depth.exponent_bits_per_sample);
+  EXPECT_TRUE(src.metadata.m.color_encoding.tf.IsSRGB());
+
+  CompressParams params;
+  params.butteraugli_distance = 1.0;
+  PassesEncoderState state;
+  AuxOut* aux_out = nullptr;
+  PaddedBytes bytes;
+  EXPECT_TRUE(
+      EncodeFile(params, &src, &state, &bytes, GetJxlCms(), aux_out));
+  EXPECT_LE(bytes.size(), 10077u);
+
+  // Decode once per combination of image-callback and unpremultiply flags.
+  for (bool via_callback : {false, true}) {
+    for (bool unpremultiply : {false, true}) {
+      JXLDecompressParams dparams;
+      dparams.use_image_callback = via_callback;
+      dparams.unpremultiply_alpha = unpremultiply;
+      CodecInOut dec;
+      EXPECT_TRUE(
+          test::DecodeFile(dparams, Span<const uint8_t>(bytes), &dec));
+      EXPECT_THAT(ButteraugliDistance(src.frames, dec.frames, params.ba_params,
+                                      GetJxlCms(),
+                                      /*distmap=*/nullptr),
+                  IsSlightlyBelow(1.15));
+    }
+  }
+}
+
+namespace {
+// Applies ::jxl::PremultiplyAlpha to the color planes of the preview (if any)
+// and of every frame, then marks the alpha channel as associated. Returns
+// false, leaving `io` untouched, if there is no alpha channel or it is already
+// associated.
+bool PremultiplyAlpha(CodecInOut& io) {
+  ExtraChannelInfo* alpha_info = io.metadata.m.Find(ExtraChannel::kAlpha);
+  if (alpha_info == nullptr || alpha_info->alpha_associated) return false;
+
+  const auto premultiply_bundle = [](ImageBundle& bundle) {
+    // Bundles lacking either alpha or color are skipped.
+    if (!bundle.HasAlpha() || !bundle.HasColor()) return;
+    auto* rgb = bundle.color();
+    const auto* a = bundle.alpha();
+    JXL_CHECK(rgb->ysize() == a->ysize());
+    JXL_CHECK(rgb->xsize() == a->xsize());
+    for (size_t y = 0; y < rgb->ysize(); ++y) {
+      ::jxl::PremultiplyAlpha(rgb->PlaneRow(0, y), rgb->PlaneRow(1, y),
+                              rgb->PlaneRow(2, y), a->Row(y), rgb->xsize());
+    }
+  };
+
+  if (io.metadata.m.have_preview) premultiply_bundle(io.preview_frame);
+  for (ImageBundle& frame : io.frames) premultiply_bundle(frame);
+  alpha_info->alpha_associated = true;
+  return true;
+}
+
+// Applies ::jxl::UnpremultiplyAlpha to the color planes of the preview (if
+// any) and of every frame, then clears the alpha_associated flag. Returns
+// false, leaving `io` untouched, if there is no associated alpha channel.
+bool UnpremultiplyAlpha(CodecInOut& io) {
+  ExtraChannelInfo* alpha_info = io.metadata.m.Find(ExtraChannel::kAlpha);
+  if (alpha_info == nullptr || !alpha_info->alpha_associated) return false;
+
+  const auto unpremultiply_bundle = [](ImageBundle& bundle) {
+    // Bundles lacking either alpha or color are skipped.
+    if (!bundle.HasAlpha() || !bundle.HasColor()) return;
+    auto* rgb = bundle.color();
+    const auto* a = bundle.alpha();
+    JXL_CHECK(rgb->ysize() == a->ysize());
+    JXL_CHECK(rgb->xsize() == a->xsize());
+    for (size_t y = 0; y < rgb->ysize(); ++y) {
+      ::jxl::UnpremultiplyAlpha(rgb->PlaneRow(0, y), rgb->PlaneRow(1, y),
+                                rgb->PlaneRow(2, y), a->Row(y), rgb->xsize());
+    }
+  };
+
+  if (io.metadata.m.have_preview) unpremultiply_bundle(io.preview_frame);
+  for (ImageBundle& frame : io.frames) unpremultiply_bundle(frame);
+  alpha_info->alpha_associated = false;
+  return true;
+}
+}  // namespace
+
+TEST(JxlTest, RoundtripAlphaPremultiplied) {
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  CodecInOut premul, straight;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(png), &premul));
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(png), &straight));
+
+  ASSERT_NE(premul.xsize(), 0u);
+  ASSERT_TRUE(premul.metadata.m.HasAlpha());
+  ASSERT_TRUE(premul.Main().HasAlpha());
+  premul.ShrinkTo(300, 300);
+  straight.ShrinkTo(300, 300);
+
+  // Only `premul` is converted; `straight` keeps non-premultiplied alpha and
+  // serves as the reference for the final comparison.
+  EXPECT_FALSE(straight.Main().AlphaIsPremultiplied());
+  EXPECT_FALSE(premul.Main().AlphaIsPremultiplied());
+  EXPECT_TRUE(PremultiplyAlpha(premul));
+  EXPECT_TRUE(premul.Main().AlphaIsPremultiplied());
+
+  CompressParams params;
+  params.butteraugli_distance = 1.0;
+  PassesEncoderState state;
+  AuxOut* aux_out = nullptr;
+  PaddedBytes bytes;
+  EXPECT_TRUE(
+      EncodeFile(params, &premul, &state, &bytes, GetJxlCms(), aux_out));
+  EXPECT_LE(bytes.size(), 10000u);
+
+  for (bool use_image_callback : {false, true}) {
+    for (bool unpremul_alpha : {false, true}) {
+      for (bool use_uint8 : {false, true}) {
+        printf(
+            "Testing premultiplied alpha using %s %s requesting "
+            "%spremultiplied output.\n",
+            use_uint8 ? "uint8" : "float",
+            use_image_callback ? "image callback" : "image_buffer",
+            unpremul_alpha ? "un" : "");
+        JXLDecompressParams dparams;
+        dparams.use_image_callback = use_image_callback;
+        dparams.unpremultiply_alpha = unpremul_alpha;
+        if (use_uint8) {
+          dparams.accepted_formats = {
+              {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}};
+        }
+        CodecInOut dec;
+        EXPECT_TRUE(
+            test::DecodeFile(dparams, Span<const uint8_t>(bytes), &dec));
+
+        EXPECT_EQ(unpremul_alpha, !dec.Main().AlphaIsPremultiplied());
+        if (!unpremul_alpha) {
+          EXPECT_THAT(ButteraugliDistance(premul.frames, dec.frames,
+                                          params.ba_params, GetJxlCms(),
+                                          /*distmap=*/nullptr),
+                      IsSlightlyBelow(1.2));
+          EXPECT_TRUE(UnpremultiplyAlpha(dec));
+          EXPECT_FALSE(dec.Main().AlphaIsPremultiplied());
+        }
+        EXPECT_THAT(ButteraugliDistance(straight.frames, dec.frames,
+                                        params.ba_params, GetJxlCms(),
+                                        /*distmap=*/nullptr),
+                    IsSlightlyBelow(1.47));
+      }
+    }
+  }
+}
+
+TEST(JxlTest, RoundtripAlphaResampling) {
+  // Resampling 2 for color and for extra channels, effort 5; no thread pool.
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0u);
+  ASSERT_GT(t.ppf().info.alpha_bits, 0u);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 5);  // kHare
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, nullptr, &ppf_out), 12803, 130);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(5.2));
+}
+
+TEST(JxlTest, RoundtripAlphaResamplingOnlyAlpha) {
+  // Only the extra-channel resampling is set to 2 (effort 3); no thread pool.
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0u);
+  ASSERT_GT(t.ppf().info.alpha_bits, 0u);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, nullptr, &ppf_out), 33571, 400);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.49));
+}
+
+TEST(JxlTest, RoundtripAlphaNonMultipleOf8) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata().SetDimensions(12, 12);
+  ASSERT_NE(img.ppf().info.xsize, 0);
+  ASSERT_TRUE(img.ppf().info.alpha_bits > 0);
+  EXPECT_EQ(img.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), {}, {}, pool, &out), 107, 10);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(0.95));
+}
+
+TEST(JxlTest, RoundtripAlpha16) {
+  ThreadPoolForTests pool(4);
+  // Width exceeds 512 pixels so that the image spans multiple groups.
+  size_t width = 1200, height = 160;
+  TestImage img;
+  img.SetDimensions(width, height).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = img.AddFrame();
+  // Fill with a 16-bit gradient covering many color and alpha values.
+  const float scale = 1.0f / 65535;
+  for (size_t y = 0; y < height; y++) {
+    uint16_t red = y * 65535 / height;  // depends on the row only
+    for (size_t x = 0; x < width; x++) {
+      uint16_t green = x * 65535 / width;
+      uint16_t blue = (y + x) * 65535 / (width + height);
+      frame.SetValue(y, x, 0, red * scale);
+      frame.SetValue(y, x, 1, green * scale);
+      frame.SetValue(y, x, 2, blue * scale);
+      frame.SetValue(y, x, 3, green * scale);  // channel 3 reuses green
+    }
+  }
+
+  ASSERT_NE(img.ppf().info.xsize, 0);
+  ASSERT_EQ(img.ppf().info.alpha_bits, 16);
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 6);  // kWombat
+  enc.distance = 0.5;
+
+  PackedPixelFile out;
+  // TODO(szabadka) Investigate big size difference on i686
+  // This still keeps happening (2023-04-18).
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &pool, &out), 3466, 120);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(0.65));
+}
+
+namespace {
+JXLCompressParams CompressParamsForLossless() {
+  JXLCompressParams lossless;
+  lossless.distance = 0;
+  lossless.AddOption(JXL_ENC_FRAME_SETTING_MODULAR, 1);
+  lossless.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, 1);
+  lossless.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 6);  // Weighted
+  return lossless;
+}
+}  // namespace
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  JXLCompressParams enc = CompressParamsForLossless();
+
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), enc, {}, &pool, &out), 222167);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+}
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8ThunderGradient)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  JXLCompressParams enc = CompressParamsForLossless();
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 2);             // kThunder
+  enc.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 5);  // Gradient
+
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), enc, {}, &pool, &out), 261684);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+}
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8LightningGradient)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  JXLCompressParams enc = CompressParamsForLossless();
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
+
+  PackedPixelFile out;
+  // Only an upper bound: the size varies with the SIMD target in use.
+  EXPECT_THAT(Roundtrip(img.ppf(), enc, {}, &pool, &out),
+              IsSlightlyBelow(286848u));
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+}
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8Falcon)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+
+  JXLCompressParams enc = CompressParamsForLossless();
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), enc, {}, &pool, &out), 230766);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+}
+
+TEST(JxlTest, RoundtripLossless8Alpha) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  ASSERT_EQ(img.ppf().info.alpha_bits, 8);
+  EXPECT_EQ(img.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8);
+
+  JXLCompressParams enc = CompressParamsForLossless();
+
+  JXLDecompressParams dec;
+  dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), enc, dec, pool, &out), 248817);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+  EXPECT_EQ(out.info.alpha_bits, 8);
+  EXPECT_TRUE(test::SameAlpha(img.ppf(), out));
+}
+
+TEST(JxlTest, RoundtripLossless16Alpha) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  size_t width = 1200, height = 160;
+  TestImage img;
+  img.SetDimensions(width, height).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = img.AddFrame();
+  // Fill with a 16-bit gradient covering many color and alpha values.
+  const float scale = 1.0f / 65535;
+  for (size_t y = 0; y < height; y++) {
+    uint16_t red = y * 65535 / height;  // depends on the row only
+    for (size_t x = 0; x < width; x++) {
+      uint16_t green = x * 65535 / width + 37;
+      uint16_t blue = (y + x) * 65535 / (width + height);
+      frame.SetValue(y, x, 0, red * scale);
+      frame.SetValue(y, x, 1, green * scale);
+      frame.SetValue(y, x, 2, blue * scale);
+      frame.SetValue(y, x, 3, green * scale);  // channel 3 reuses green
+    }
+  }
+  ASSERT_EQ(img.ppf().info.bits_per_sample, 16);
+  ASSERT_EQ(img.ppf().info.alpha_bits, 16);
+
+  JXLCompressParams enc = CompressParamsForLossless();
+
+  JXLDecompressParams dec;
+  dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+
+  PackedPixelFile out;
+  // TODO(szabadka) Investigate big size difference on i686
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, dec, pool, &out), 4849, 100);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+  EXPECT_EQ(out.info.alpha_bits, 16);
+  EXPECT_TRUE(test::SameAlpha(img.ppf(), out));
+}
+
+TEST(JxlTest, RoundtripLossless16AlphaNotMisdetectedAs8Bit) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  size_t width = 128, height = 128;
+  TestImage img;
+  img.SetDimensions(width, height).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = img.AddFrame();
+  // Every 16-bit sample, color and alpha alike, is tiny (at most 63 + 37 =
+  // 100). This allows testing if a code path wrongly concludes it's an 8-bit
+  // instead of a 16-bit image.
+  const float scale = 1.0f / 65535;
+  for (size_t y = 0; y < height; y++) {
+    uint16_t red = y * 64 / height;  // depends on the row only
+    for (size_t x = 0; x < width; x++) {
+      uint16_t green = x * 64 / width + 37;
+      uint16_t blue = (y + x) * 64 / (width + height);
+      frame.SetValue(y, x, 0, red * scale);
+      frame.SetValue(y, x, 1, green * scale);
+      frame.SetValue(y, x, 2, blue * scale);
+      frame.SetValue(y, x, 3, green * scale);  // channel 3 reuses green
+    }
+  }
+  ASSERT_EQ(img.ppf().info.bits_per_sample, 16);
+  ASSERT_EQ(img.ppf().info.alpha_bits, 16);
+
+  JXLCompressParams enc = CompressParamsForLossless();
+
+  JXLDecompressParams dec;
+  dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, dec, pool, &out), 543, 75);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+  EXPECT_EQ(out.info.bits_per_sample, 16);
+  EXPECT_EQ(out.info.alpha_bits, 16);
+  EXPECT_TRUE(test::SameAlpha(img.ppf(), out));
+}
+
+TEST(JxlTest, RoundtripDots) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  ASSERT_NE(img.ppf().info.xsize, 0);
+  EXPECT_EQ(img.ppf().info.bits_per_sample, 8);
+  EXPECT_EQ(img.ppf().color_encoding.transfer_function,
+            JXL_TRANSFER_FUNCTION_SRGB);
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  enc.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);
+  enc.distance = 0.04;
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, pool, &out), 284295, 3000);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(0.35));
+}
+
+TEST(JxlTest, RoundtripNoise) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata();
+  ASSERT_NE(img.ppf().info.xsize, 0);
+  EXPECT_EQ(img.ppf().info.bits_per_sample, 8);
+  EXPECT_EQ(img.ppf().color_encoding.transfer_function,
+            JXL_TRANSFER_FUNCTION_SRGB);
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  enc.AddOption(JXL_ENC_FRAME_SETTING_NOISE, 1);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, pool, &out), 41385, 750);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.48));
+}
+
+TEST(JxlTest, RoundtripLossless8Gray) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
+  TestImage img;
+  img.SetColorEncoding("Gra_D65_Rel_SRG").DecodeFromBytes(png).ClearMetadata();
+  EXPECT_EQ(img.ppf().color_encoding.color_space, JXL_COLOR_SPACE_GRAY);
+  EXPECT_EQ(img.ppf().info.bits_per_sample, 8);
+
+  JXLCompressParams enc = CompressParamsForLossless();
+
+  JXLDecompressParams dec;
+  dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), enc, dec, pool, &out), 92766);
+  EXPECT_EQ(ComputeDistance2(img.ppf(), out), 0.0);
+  EXPECT_EQ(out.color_encoding.color_space, JXL_COLOR_SPACE_GRAY);
+  EXPECT_EQ(out.info.bits_per_sample, 8);
+}
+
+#if JPEGXL_ENABLE_GIF
+
+TEST(JxlTest, RoundtripAnimation) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/traffic_light.gif");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(4, t.ppf().frames.size());
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+  PackedPixelFile ppf_out;
+  EXPECT_THAT(Roundtrip(t.ppf(), {}, dparams, pool, &ppf_out),
+              IsSlightlyBelow(2600));
+
+  t.CoalesceGIFAnimationWithAlpha();
+  ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  // Picked outside EXPECT_LE: directives inside macro arguments are undefined.
+#if JXL_HIGH_PRECISION
+  const double max_distance = 1.55;
+#else
+  const double max_distance = 1.75;
+#endif
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), max_distance);
+}
+
+TEST(JxlTest, RoundtripLosslessAnimation) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes gif = jxl::test::ReadTestData("jxl/traffic_light.gif");
+  TestImage img;
+  img.DecodeFromBytes(gif).ClearMetadata();
+  EXPECT_EQ(4, img.ppf().frames.size());
+
+  JXLCompressParams enc = CompressParamsForLossless();
+
+  JXLDecompressParams dec;
+  dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+
+  PackedPixelFile out;
+  EXPECT_EQ(Roundtrip(img.ppf(), enc, dec, pool, &out), 958);
+
+  img.CoalesceGIFAnimationWithAlpha();
+  ASSERT_EQ(out.frames.size(), img.ppf().frames.size());
+  EXPECT_LE(ButteraugliDistance(img.ppf(), out), 5e-4);
+}
+
+TEST(JxlTest, RoundtripAnimationPatches) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  const PaddedBytes gif = jxl::test::ReadTestData("jxl/animation_patches.gif");
+
+  TestImage img;
+  img.DecodeFromBytes(gif).ClearMetadata();
+  ASSERT_EQ(2u, img.ppf().frames.size());
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_PATCHES, 1);
+
+  JXLDecompressParams dec;
+  dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+
+  PackedPixelFile out;
+  // 40k with no patches, 27k with patch frames encoded multiple times.
+  EXPECT_THAT(Roundtrip(img.ppf(), enc, dec, pool, &out),
+              IsSlightlyBelow(16710));
+  EXPECT_EQ(out.frames.size(), img.ppf().frames.size());
+  // >10 with broken patches
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.05));
+}
+
+#endif  // JPEGXL_ENABLE_GIF
+
+// Expects jpeg_in to survive an encode/decode cycle; returns compressed size.
+size_t RoundtripJpeg(const PaddedBytes& jpeg_in, ThreadPool* pool) {
+  std::vector<uint8_t> jpeg_bytes(jpeg_in.data(),
+                                  jpeg_in.data() + jpeg_in.size());
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_bytes,
+                                     &compressed));
+
+  jxl::JXLDecompressParams dparams;
+  test::SetThreadParallelRunner(dparams, pool);
+  std::vector<uint8_t> out;
+  jxl::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &out));
+  EXPECT_EQ(out.size(), jpeg_in.size());
+  size_t failures = 0;
+  for (size_t i = 0; i < std::min(out.size(), jpeg_in.size()); i++) {
+    if (out[i] == jpeg_in[i]) continue;
+    // Cast to int: streaming uint8_t directly would emit raw characters.
+    EXPECT_EQ(out[i], jpeg_in[i])
+        << "byte mismatch " << i << " " << static_cast<int>(out[i])
+        << " != " << static_cast<int>(jpeg_in[i]);
+    if (++failures > 4) break;  // Report at most five mismatching bytes.
+  }
+  return compressed.size();
+}
+
+void RoundtripJpegToPixels(const PaddedBytes& jpeg_in,
+                           JXLDecompressParams dparams, ThreadPool* pool,
+                           PackedPixelFile* ppf_out) {
+  const auto* first = jpeg_in.data();
+  std::vector<uint8_t> source(first, first + jpeg_in.size());
+  std::vector<uint8_t> jxl_bytes;
+  EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &source,
+                                     &jxl_bytes));
+  // dparams is a by-value copy; changes made here do not reach the caller.
+  test::SetThreadParallelRunner(dparams, pool);
+  EXPECT_TRUE(DecodeImageJXL(jxl_bytes.data(), jxl_bytes.size(), dparams,
+                             nullptr, ppf_out, nullptr));
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
+  // The input JPEG is 696,659 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 568940u, 10);
+}
+
+#if JPEGXL_ENABLE_JPEG
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
+  TestImage img;
+  img.DecodeFromBytes(jpg);
+
+  PackedPixelFile out;
+  RoundtripJpegToPixels(jpg, {}, &workers, &out);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(12));
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  TestImage img;
+  img.DecodeFromBytes(jpg);
+
+  PackedPixelFile out;
+  RoundtripJpegToPixels(jpg, {}, &workers, &out);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(11));
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420EarlyFlush)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  TestImage img;
+  img.DecodeFromBytes(jpg);
+
+  JXLDecompressParams dec;
+  dec.max_downsampling = 8;
+
+  PackedPixelFile out;
+  RoundtripJpegToPixels(jpg, dec, &workers, &out);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(4410));
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420Mul16)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower_cropped.jpg");
+  TestImage img;
+  img.DecodeFromBytes(jpg);
+
+  PackedPixelFile out;
+  RoundtripJpegToPixels(jpg, {}, &workers, &out);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(4));
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels_asymmetric)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg");
+  TestImage img;
+  img.DecodeFromBytes(jpg);
+
+  PackedPixelFile out;
+  RoundtripJpegToPixels(jpg, {}, &workers, &out);
+  EXPECT_THAT(ComputeDistance2(img.ppf(), out), IsSlightlyBelow(10));
+}
+
+#endif
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionGray)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_gray.jpg");
+  // The input JPEG is 456,528 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 387496u, 200);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  // The input JPEG is 546,797 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 455560u, 10);
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_luma_subsample)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg = jxl::test::ReadTestData(
+      "jxl/flower/flower.png.im_q85_luma_subsample.jpg");
+  // The input JPEG is 400,724 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 325354u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444_12)) {
+  // 444 JPEG that has an interesting sampling-factor (1x2, 1x2, 1x2).
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444_1x2.jpg");
+  // The input JPEG is 703,874 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 569679u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression422)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_422.jpg");
+  // The input JPEG is 522,057 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 499282u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression440)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_440.jpg");
+  // The input JPEG is 603,623 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 501151u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_asymmetric)) {
+  // 2x vertical downsample of one chroma channel, 2x horizontal downsample of
+  // the other.
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg");
+  // The input JPEG is 604,601 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 500602u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420Progr)) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes jpg =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420_progr.jpg");
+  // JPEG size is 522,057 bytes.
+  EXPECT_NEAR(RoundtripJpeg(jpg, &workers), 455499u, 10);
+}
+
+TEST(JxlTest, RoundtripProgressive) {
+  ThreadPoolForTests workers(4);
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata().SetDimensions(600, 1024);
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 1);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &workers, &out), 61635, 750);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.4));
+}
+
+TEST(JxlTest, RoundtripProgressiveLevel2Slow) {
+  ThreadPoolForTests workers(8);
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage img;
+  img.DecodeFromBytes(png).ClearMetadata().SetDimensions(600, 1024);
+
+  JXLCompressParams enc;
+  enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);  // kTortoise
+  enc.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  enc.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
+
+  PackedPixelFile out;
+  EXPECT_NEAR(Roundtrip(img.ppf(), enc, {}, &workers, &out), 72841, 1000);
+  EXPECT_THAT(ButteraugliDistance(img.ppf(), out), IsSlightlyBelow(1.17));
+}
+
+TEST(JxlTest, RoundtripUnsignedCustomBitdepthLossless) {
+  ThreadPool* pool = nullptr;  // no thread pool is passed to Roundtrip
+  for (uint32_t num_channels = 1; num_channels < 6; ++num_channels) {
+    for (JxlEndianness endianness : {JXL_LITTLE_ENDIAN, JXL_BIG_ENDIAN}) {
+      for (uint32_t bitdepth = 3; bitdepth <= 16; ++bitdepth) {
+        const bool little = endianness == JXL_LITTLE_ENDIAN;
+        if (!little && bitdepth <= 8) continue;  // big endian: > 8 bits only
+        printf("Testing %u channel unsigned %u bit %s endian lossless.\n",
+               num_channels, bitdepth, little ? "little" : "big");
+        TestImage img;
+        img.SetDimensions(256, 256).SetChannels(num_channels);
+        img.SetAllBitDepths(bitdepth).SetEndianness(endianness);
+        TestImage::Frame frame = img.AddFrame();
+        frame.RandomFill();
+
+        JXLCompressParams enc = CompressParamsForLossless();
+        enc.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+        JXLDecompressParams dec;
+        dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+        dec.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+        PackedPixelFile out;
+        Roundtrip(img.ppf(), enc, dec, pool, &out);
+
+        ASSERT_TRUE(test::SamePixels(img.ppf(), out));
+      }
+    }
+  }
+}
+
+TEST(JxlTest, LosslessPNMRoundtrip) {
+  static const char* kChannels[] = {"", "g", "ga", "rgb", "rgba"};
+  static const char* kExtension[] = {"", ".pgm", ".pam", ".ppm", ".pam"};
+  for (size_t bit_depth = 1; bit_depth <= 16; ++bit_depth) {
+    for (size_t channels = 1; channels <= 4; ++channels) {
+      if (bit_depth == 1 && (channels == 2 || channels == 4)) continue;
+      const std::string extension = kExtension[channels];
+      std::string filename = "jxl/flower/flower_small.";
+      filename += kChannels[channels];
+      filename += ".depth" + std::to_string(bit_depth) + extension;
+      const PaddedBytes pnm = jxl::test::ReadTestData(filename);
+      test::TestImage img;
+      if (channels < 3) img.SetColorEncoding("Gra_D65_Rel_SRG");
+      img.DecodeFromBytes(pnm);
+
+      JXLCompressParams enc = CompressParamsForLossless();
+      enc.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
+      enc.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+      JXLDecompressParams dec;
+      dec.accepted_formats.push_back(img.ppf().frames[0].color.format);
+      dec.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+      PackedPixelFile out;
+      Roundtrip(img.ppf(), enc, dec, nullptr, &out);
+
+      // Re-encoding the decoded pixels must reproduce the input file exactly.
+      extras::EncodedImage reencoded;
+      auto encoder = extras::Encoder::FromExtension(extension);
+      ASSERT_TRUE(encoder.get());
+      ASSERT_TRUE(encoder->Encode(out, &reencoded, nullptr));
+      ASSERT_EQ(reencoded.bitstreams.size(), 1);
+      ASSERT_EQ(pnm.size(), reencoded.bitstreams[0].size());
+      EXPECT_EQ(0, memcmp(pnm.data(), reencoded.bitstreams[0].data(), pnm.size()));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/lehmer_code.h b/third_party/jpeg-xl/lib/jxl/lehmer_code.h
new file mode 100644
index 0000000000..dd1d21c6f7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/lehmer_code.h
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LEHMER_CODE_H_
+#define LIB_JXL_LEHMER_CODE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Permutation <=> factorial base representation (Lehmer code).
+
+using LehmerT = uint32_t;
+
+// Isolates the lowest set bit of t, e.g. 12 (0b1100) -> 4; 0 stays 0.
+// ~t + 1 is the two's-complement negation, so this equals t & -t.
+template <typename T>
+constexpr T ValueOfLowest1Bit(T t) { return t & (~t + 1); }
+
+// Computes the Lehmer (factorial basis) code of permutation, an array of n
+// unique indices in [0..n), and stores it in code[0..n). N*logN time.
+// temp must have n + 1 elements but need not be initialized.
+template <typename PermutationT>
+void ComputeLehmerCode(const PermutationT* JXL_RESTRICT permutation,
+                       uint32_t* JXL_RESTRICT temp, const size_t n,
+                       LehmerT* JXL_RESTRICT code) {
+  // temp[1..n] serves as a Fenwick tree counting the values seen so far.
+  for (size_t slot = 0; slot < n + 1; ++slot) temp[slot] = 0;
+
+  for (size_t pos = 0; pos < n; ++pos) {
+    const PermutationT value = permutation[pos];
+
+    // Prefix sum over the tree: the number of earlier entries below value
+    // (entries are unique and value itself has not been added yet).
+    uint32_t smaller = 0;
+    for (uint32_t node = value + 1; node != 0; node &= node - 1) {
+      smaller += temp[node];  // the step clears the lowest set bit of node
+    }
+    JXL_DASSERT(value >= smaller);
+    code[pos] = value - smaller;
+
+    // Point update: count value in every tree node that covers it.
+    for (uint32_t node = value + 1; node < n + 1;
+         node += ValueOfLowest1Bit(node)) {
+      temp[node] += 1;
+    }
+  }
+}
+
+// Decodes the Lehmer code in code[0..n) into permutation[0..n).
+// temp must have 1 << CeilLog2(n) elements but need not be initialized.
+template <typename PermutationT>
+void DecodeLehmerCode(const LehmerT* JXL_RESTRICT code,
+                      uint32_t* JXL_RESTRICT temp, size_t n,
+                      PermutationT* JXL_RESTRICT permutation) {
+  JXL_DASSERT(n != 0);
+  const size_t log2n = CeilLog2Nonzero(n);
+  const size_t padded_n = 1ull << log2n;
+
+  // Start with every element unused: slot i holds the lowest set bit of i + 1.
+  for (size_t i = 0; i < padded_n; i++) {
+    const int32_t i1 = static_cast<int32_t>(i + 1);
+    temp[i] = static_cast<uint32_t>(ValueOfLowest1Bit(i1));
+  }
+
+  for (size_t i = 0; i < n; i++) {
+    JXL_DASSERT(code[i] + i < n);
+    uint32_t rank = code[i] + 1;
+
+    // Find the rank-th unused element via implicit order-statistics tree.
+    size_t bit = padded_n;
+    size_t next = 0;
+    for (size_t level = 0; level <= log2n; level++) {
+      const size_t cand = next + bit;
+      JXL_DASSERT(cand >= 1);
+      bit >>= 1;
+      if (temp[cand - 1] < rank) {
+        next = cand;
+        rank -= temp[cand - 1];
+      }
+    }
+
+    permutation[i] = static_cast<PermutationT>(next);
+    // Mark as used
+    next += 1;
+    while (next <= padded_n) {
+      temp[next - 1] -= 1;
+      next += ValueOfLowest1Bit(next);
+    }
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LEHMER_CODE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc b/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc
new file mode 100644
index 0000000000..acda762545
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/lehmer_code.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+template <typename PermutationT>
+struct WorkingSet {  // Buffers reused across Roundtrip() calls.
+  explicit WorkingSet(size_t max_n)
+      : padded_n(1ull << CeilLog2Nonzero(max_n + 1)),
+        permutation(max_n),
+        temp(padded_n),
+        lehmer(max_n),
+        decoded(max_n) {}
+
+  size_t padded_n;  // 1 << CeilLog2Nonzero(max_n + 1); length of temp
+  std::vector<PermutationT> permutation;  // input permutation, max_n entries
+  std::vector<uint32_t> temp;             // scratch for encode and decode
+  std::vector<LehmerT> lehmer;            // output of ComputeLehmerCode
+  std::vector<PermutationT> decoded;      // output of DecodeLehmerCode
+};
+
+template <typename PermutationT>
+void Roundtrip(size_t n, WorkingSet<PermutationT>* ws) {
+  JXL_ASSERT(n != 0);
+  const size_t temp_bytes = (1ull << CeilLog2Nonzero(n)) * sizeof(uint32_t);
+  uint32_t* const temp = ws->temp.data();
+  PermutationT* const perm = ws->permutation.data();
+
+  // Every index in [0, n) must be representable as PermutationT.
+  EXPECT_LE(n, 1ULL << (sizeof(PermutationT) * 8));
+
+  // Start from the identity permutation; the seed depends only on n.
+  std::iota(ws->permutation.begin(), ws->permutation.begin() + n, 0);
+  Rng rng(n * 65537 + 13);
+
+  for (size_t trial = 0; trial < 3; ++trial) {
+    rng.Shuffle(perm, n);
+
+    // Encoding followed by decoding must reproduce the shuffled input.
+    ComputeLehmerCode(perm, temp, n, ws->lehmer.data());
+    memset(temp, 0, temp_bytes);
+    DecodeLehmerCode(ws->lehmer.data(), temp, n, ws->decoded.data());
+
+    for (size_t i = 0; i < n; ++i) {
+      EXPECT_EQ(ws->permutation[i], ws->decoded[i]);
+    }
+  }
+}
+
+// Preallocates arrays and tests n = [begin, end).
+template <typename PermutationT>
+void RoundtripSizeRange(ThreadPool* pool, uint32_t begin, uint32_t end) {
+  ASSERT_NE(0u, begin);  // n = 0 not allowed.
+  std::vector<WorkingSet<PermutationT>> scratch;
+
+  // One working set per worker thread, each sized for the largest n.
+  const auto allocate = [&scratch, end](const size_t num_threads) {
+    for (size_t t = 0; t < num_threads; t++) {
+      scratch.emplace_back(end - 1);
+    }
+    return true;
+  };
+  const auto run_one = [&scratch](const uint32_t n, const size_t thread) {
+    Roundtrip(n, &scratch[thread]);
+  };
+
+  JXL_CHECK(RunOnPool(pool, begin, end, allocate, run_one, "lehmer test"));
+}
+
+TEST(LehmerCodeTest, TestRoundtrips) {
+  test::ThreadPoolForTests workers(8);
+
+  // Sizes 1..1025 with 16-bit indices.
+  RoundtripSizeRange<uint16_t>(&workers, 1, 1026);
+  // Ensures PermutationT can fit > 16 bit values.
+  RoundtripSizeRange<uint32_t>(&workers, 65536, 65540);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/libjxl.pc.in b/third_party/jpeg-xl/lib/jxl/libjxl.pc.in
new file mode 100644
index 0000000000..4a7af65b7c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/libjxl.pc.in
@@ -0,0 +1,13 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=@PKGCONFIG_TARGET_LIBS@
+includedir=@PKGCONFIG_TARGET_INCLUDES@
+
+Name: libjxl
+Description: Loads and saves JPEG XL files
+Version: @JPEGXL_LIBRARY_VERSION@
+Requires.private: @JPEGXL_LIBRARY_REQUIRES@
+Libs: -L${libdir} -ljxl
+Libs.private: -lm
+Cflags: -I${includedir}
+Cflags.private: -DJXL_STATIC_DEFINE
diff --git a/third_party/jpeg-xl/lib/jxl/loop_filter.cc b/third_party/jpeg-xl/lib/jxl/loop_filter.cc
new file mode 100644
index 0000000000..5afe87617d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/loop_filter.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/loop_filter.h"
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+LoopFilter::LoopFilter() { Bundle::Init(this); }
+Status LoopFilter::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  // If the visitor reports that everything is default, stop after SetDefault.
+
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &gab));
+  if (visitor->Conditional(gab)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &gab_custom));
+    if (visitor->Conditional(gab_custom)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.104699568f, &gab_x_weight1));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.055680538f, &gab_x_weight2));
+      if (std::abs(1.0f + (gab_x_weight1 + gab_x_weight2) * 4) < 1e-8) {
+        return JXL_FAILURE(
+            "Gaborish x weights lead to near 0 unnormalized kernel");
+      }
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.104699568f, &gab_y_weight1));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.055680538f, &gab_y_weight2));
+      if (std::abs(1.0f + (gab_y_weight1 + gab_y_weight2) * 4) < 1e-8) {
+        return JXL_FAILURE(
+            "Gaborish y weights lead to near 0 unnormalized kernel");
+      }
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.104699568f, &gab_b_weight1));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.055680538f, &gab_b_weight2));
+      if (std::abs(1.0f + (gab_b_weight1 + gab_b_weight2) * 4) < 1e-8) {
+        return JXL_FAILURE(
+            "Gaborish b weights lead to near 0 unnormalized kernel");
+      }
+    }
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 2, &epf_iters));
+  if (visitor->Conditional(epf_iters > 0)) {
+    if (visitor->Conditional(!nonserialized_is_modular)) {  // Non-modular only.
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sharp_custom));
+      if (visitor->Conditional(epf_sharp_custom)) {
+        for (size_t i = 0; i < kEpfSharpEntries; ++i) {
+          JXL_QUIET_RETURN_IF_ERROR(visitor->F16(
+              float(i) / float(kEpfSharpEntries - 1), &epf_sharp_lut[i]));
+        }
+      }
+    }
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_weight_custom));
+    if (visitor->Conditional(epf_weight_custom)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(40.0f, &epf_channel_scale[0]));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(5.0f, &epf_channel_scale[1]));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(3.5f, &epf_channel_scale[2]));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.45f, &epf_pass1_zeroflush));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.6f, &epf_pass2_zeroflush));
+    }
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sigma_custom));
+    if (visitor->Conditional(epf_sigma_custom)) {
+      if (visitor->Conditional(!nonserialized_is_modular)) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.46f, &epf_quant_mul));
+      }
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.9f, &epf_pass0_sigma_scale));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(6.5f, &epf_pass2_sigma_scale));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(0.6666666666666666f, &epf_border_sad_mul));
+    }
+    if (visitor->Conditional(nonserialized_is_modular)) {  // Modular only.
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.0f, &epf_sigma_for_modular));
+      if (epf_sigma_for_modular < 1e-8) {
+        return JXL_FAILURE("EPF: sigma for modular is too small");
+      }
+    }
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+  // Extensions: in chronological order of being added to the format.
+  return visitor->EndExtensions();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/loop_filter.h b/third_party/jpeg-xl/lib/jxl/loop_filter.h
new file mode 100644
index 0000000000..e4b418ba2b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/loop_filter.h
@@ -0,0 +1,76 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LOOP_FILTER_H_
+#define LIB_JXL_LOOP_FILTER_H_
+
+// Parameters for loop filter(s), stored in each frame.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+struct LoopFilter : public Fields {
+  LoopFilter();
+  JXL_FIELDS_NAME(LoopFilter)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  size_t Padding() const {
+    static const size_t padding_per_epf_iter[4] = {0, 2, 3, 6};
+    return padding_per_epf_iter[epf_iters] + (gab ? 1 : 0);  // epf_iters < 4.
+  }
+
+  mutable bool all_default;  // Passed to visitor->AllDefault() in VisitFields.
+
+  // --- Gaborish convolution
+  bool gab;  // Gates the gab_* fields in VisitFields; adds 1 to Padding().
+
+  bool gab_custom;  // Gates visiting of the six weights below.
+  float gab_x_weight1;
+  float gab_x_weight2;
+  float gab_y_weight1;
+  float gab_y_weight2;
+  float gab_b_weight1;
+  float gab_b_weight2;
+
+  // --- Edge-preserving filter
+
+  // Number of EPF stages to apply. 0 means EPF disabled. 1 applies only the
+  // first stage, 2 applies both stages and 3 applies the first stage twice and
+  // the second stage once.
+  uint32_t epf_iters;
+
+  bool epf_sharp_custom;  // Gates visiting of epf_sharp_lut.
+  enum { kEpfSharpEntries = 8 };
+  float epf_sharp_lut[kEpfSharpEntries];
+
+  bool epf_weight_custom;      // Custom weight params
+  float epf_channel_scale[3];  // Relative weight of each channel
+  float epf_pass1_zeroflush;   // Minimum weight for first pass
+  float epf_pass2_zeroflush;   // Minimum weight for second pass
+
+  bool epf_sigma_custom;        // Custom sigma parameters
+  float epf_quant_mul;          // Sigma is ~ this * quant
+  float epf_pass0_sigma_scale;  // Multiplier for sigma in pass 0
+  float epf_pass2_sigma_scale;  // Multiplier for sigma in the second pass
+  float epf_border_sad_mul;     // (inverse) multiplier for sigma on borders
+
+  float epf_sigma_for_modular;  // Visited only if nonserialized_is_modular.
+
+  uint64_t extensions;  // Passed to visitor->BeginExtensions().
+
+  bool nonserialized_is_modular = false;  // Not visited as a field.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LOOP_FILTER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/luminance.cc b/third_party/jpeg-xl/lib/jxl/luminance.cc
new file mode 100644
index 0000000000..10151f4267
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/luminance.cc
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/luminance.h"
+
+#include "lib/jxl/image_metadata.h"
+
+namespace jxl {
+
+void SetIntensityTarget(ImageMetadata* m) {
+  if (m->color_encoding.tf.IsPQ()) {
+    // Peak luminance of PQ as defined by SMPTE ST 2084:2014.
+    m->SetIntensityTarget(10000);
+  } else if (m->color_encoding.tf.IsHLG()) {
+    // Nominal display peak luminance used as a reference by
+    // Rec. ITU-R BT.2100-2.
+    m->SetIntensityTarget(1000);
+  } else {
+    // Neither PQ nor HLG: treated as SDR.
+    m->SetIntensityTarget(kDefaultIntensityTarget);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/luminance.h b/third_party/jpeg-xl/lib/jxl/luminance.h
new file mode 100644
index 0000000000..3181576823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/luminance.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LUMINANCE_H_
+#define LIB_JXL_LUMINANCE_H_
+
+namespace jxl {
+
+// Sets a default intensity target on `m` based on the transfer function of
+// the image, if known. For SDR images or images not known to be HDR, this is
+// kDefaultIntensityTarget; for images known to have a PQ or HLG transfer
+// function it is a higher value.
+
+struct ImageMetadata;
+// TODO(eustas): rename
+void SetIntensityTarget(ImageMetadata* m);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LUMINANCE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/matrix_ops.h b/third_party/jpeg-xl/lib/jxl/matrix_ops.h
new file mode 100644
index 0000000000..1a969bd4f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/matrix_ops.h
@@ -0,0 +1,84 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MATRIX_OPS_H_
+#define LIB_JXL_MATRIX_OPS_H_
+
+// 3x3 matrix operations.
+
+#include <cmath>  // abs
+#include <cstddef>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Computes C = A * B for row-major 3x3 matrices; C must not alias A or B.
+template <typename T>
+void Mul3x3Matrix(const T* a, const T* b, T* c) {
+  alignas(16) T temp[3];  // Column x of B, gathered contiguously.
+  for (size_t x = 0; x < 3; x++) {
+    for (size_t z = 0; z < 3; z++) {
+      temp[z] = b[z * 3 + x];
+    }
+    for (size_t y = 0; y < 3; y++) {
+      double e = 0;  // The sum is accumulated in double regardless of T.
+      for (size_t z = 0; z < 3; z++) {
+        e += a[y * 3 + z] * temp[z];
+      }
+      c[y * 3 + x] = e;
+    }
+  }
+}
+
+// Computes C = A * B, where A is a row-major 3x3 matrix and B a 3-vector.
+template <typename T>
+void Mul3x3Vector(const T* a, const T* b, T* c) {
+  for (size_t y = 0; y < 3; y++) {
+    double e = 0;  // The sum is accumulated in double regardless of T.
+    for (size_t x = 0; x < 3; x++) {
+      e += a[y * 3 + x] * b[x];
+    }
+    c[y] = e;  // c must not alias b: b is re-read for the later rows.
+  }
+}
+
+// Inverts a 3x3 matrix in place; fails (matrix untouched) if |det| < 1e-10.
+template <typename T>
+Status Inv3x3Matrix(T* matrix) {
+  // temp = adjugate (transposed cofactors), computed in double precision.
+  double temp[9];
+  temp[0] = static_cast<double>(matrix[4]) * matrix[8] -
+            static_cast<double>(matrix[5]) * matrix[7];
+  temp[1] = static_cast<double>(matrix[2]) * matrix[7] -
+            static_cast<double>(matrix[1]) * matrix[8];
+  temp[2] = static_cast<double>(matrix[1]) * matrix[5] -
+            static_cast<double>(matrix[2]) * matrix[4];
+  temp[3] = static_cast<double>(matrix[5]) * matrix[6] -
+            static_cast<double>(matrix[3]) * matrix[8];
+  temp[4] = static_cast<double>(matrix[0]) * matrix[8] -
+            static_cast<double>(matrix[2]) * matrix[6];
+  temp[5] = static_cast<double>(matrix[2]) * matrix[3] -
+            static_cast<double>(matrix[0]) * matrix[5];
+  temp[6] = static_cast<double>(matrix[3]) * matrix[7] -
+            static_cast<double>(matrix[4]) * matrix[6];
+  temp[7] = static_cast<double>(matrix[1]) * matrix[6] -
+            static_cast<double>(matrix[0]) * matrix[7];
+  temp[8] = static_cast<double>(matrix[0]) * matrix[4] -
+            static_cast<double>(matrix[1]) * matrix[3];
+  double det = matrix[0] * temp[0] + matrix[1] * temp[3] + matrix[2] * temp[6];
+  if (std::abs(det) < 1e-10) {
+    return JXL_FAILURE("Matrix determinant is too close to 0");
+  }
+  double idet = 1.0 / det;
+  for (size_t i = 0; i < 9; i++) {
+    matrix[i] = temp[i] * idet;  // inverse = adjugate / det.
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MATRIX_OPS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc
new file mode 100644
index 0000000000..87727e75cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/memory_manager_internal.h"
+
+#include <stdlib.h>
+
+namespace jxl {
+
+void* MemoryManagerDefaultAlloc(void* opaque, size_t size) {
+  return malloc(size);  // opaque is ignored by the default allocator.
+}
+
+void MemoryManagerDefaultFree(void* opaque, void* address) { free(address); }
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
new file mode 100644
index 0000000000..f8a5cd8d59
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MEMORY_MANAGER_INTERNAL_H_
+#define LIB_JXL_MEMORY_MANAGER_INTERNAL_H_
+
+// Helpers for allocating and freeing through a JxlMemoryManager.
+
+#include <jxl/memory_manager.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>  // memset
+
+#include <atomic>
+#include <memory>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Default alloc and free functions.
+void* MemoryManagerDefaultAlloc(void* opaque, size_t size);
+void MemoryManagerDefaultFree(void* opaque, void* address);
+
+// Initializes the memory manager instance with the passed one. The
+// MemoryManager passed in |memory_manager| may be NULL or contain NULL
+// functions which will be initialized with the default ones. If either alloc
+// or free are NULL, then both must be NULL, otherwise this function returns an
+// error.
+static JXL_INLINE Status MemoryManagerInit(
+    JxlMemoryManager* self, const JxlMemoryManager* memory_manager) {
+  if (memory_manager) {
+    *self = *memory_manager;
+  } else {
+    memset(self, 0, sizeof(*self));  // Zeroed; defaults are filled in below.
+  }
+  if (!self->alloc != !self->free) {  // Exactly one of the two is NULL.
+    return false;
+  }
+  if (!self->alloc) self->alloc = jxl::MemoryManagerDefaultAlloc;
+  if (!self->free) self->free = jxl::MemoryManagerDefaultFree;
+
+  return true;
+}
+
+static JXL_INLINE void* MemoryManagerAlloc(
+    const JxlMemoryManager* memory_manager, size_t size) {
+  return memory_manager->alloc(memory_manager->opaque, size);  // Unchecked.
+}
+
+static JXL_INLINE void MemoryManagerFree(const JxlMemoryManager* memory_manager,
+                                         void* address) {
+  return memory_manager->free(memory_manager->opaque, address);  // void call.
+}
+
+// Deleter for unique_ptr<T> whose storage came from a JxlMemoryManager.
+class MemoryManagerDeleteHelper {
+ public:
+  explicit MemoryManagerDeleteHelper(const JxlMemoryManager* memory_manager)
+      : memory_manager_(memory_manager) {}
+
+  // Destroys *address, then frees its storage via memory_manager_.
+  template <typename T>
+  void operator()(T* address) const {
+    if (!address) {
+      return;
+    }
+    address->~T();
+    return memory_manager_->free(memory_manager_->opaque, address);
+  }
+
+ private:
+  const JxlMemoryManager* memory_manager_;  // Not owned; used at delete time.
+};
+
+template <typename T>
+using MemoryManagerUniquePtr = std::unique_ptr<T, MemoryManagerDeleteHelper>;
+
+// Creates a T in storage obtained from memory_manager and wraps it in a
+// unique_ptr; returns an empty unique_ptr (T not constructed) if alloc fails.
+template <typename T, typename... Args>
+JXL_INLINE MemoryManagerUniquePtr<T> MemoryManagerMakeUnique(
+    const JxlMemoryManager* memory_manager, Args&&... args) {
+  T* mem =
+      static_cast<T*>(memory_manager->alloc(memory_manager->opaque, sizeof(T)));
+  if (!mem) {
+    // Allocation error case.
+    return MemoryManagerUniquePtr<T>(nullptr,
+                                     MemoryManagerDeleteHelper(memory_manager));
+  }
+  return MemoryManagerUniquePtr<T>(new (mem) T(std::forward<Args>(args)...),
+                                   MemoryManagerDeleteHelper(memory_manager));
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MEMORY_MANAGER_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h
new file mode 100644
index 0000000000..914cd6a4e4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h
@@ -0,0 +1,626 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_
+#define LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+namespace weighted {
+constexpr static size_t kNumPredictors = 4;  // Sub-predictors in State.
+constexpr static int64_t kPredExtraBits = 3;  // Shift applied by AddBits().
+constexpr static int64_t kPredictionRound = ((1 << kPredExtraBits) >> 1) - 1;
+constexpr static size_t kNumProperties = 1;  // Written by State::Predict().
+
+struct Header : public Fields {
+  JXL_FIELDS_NAME(WeightedPredictorHeader)
+  // TODO(janwas): move to cc file, avoid including fields.h.
+  Header() { Bundle::Init(this); }
+
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    if (visitor->AllDefault(*this, &all_default)) {
+      // Overwrite all serialized fields, but not any nonserialized_*.
+      visitor->SetDefault(this);
+      return true;
+    }
+    auto visit_p = [visitor](pixel_type val, pixel_type *p) {  // Via uint32_t.
+      uint32_t up = *p;
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, val, &up));
+      *p = up;
+      return Status(true);
+    };
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(16, &p1C));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(10, &p2C));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Ca));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cb));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cc));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Cd));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Ce));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xd, &w[0]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[1]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[2]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[3]));
+    return true;
+  }
+
+  bool all_default;  // Passed to visitor->AllDefault() above.
+  pixel_type p1C = 0, p2C = 0, p3Ca = 0, p3Cb = 0, p3Cc = 0, p3Cd = 0, p3Ce = 0;
+  uint32_t w[kNumPredictors] = {};  // Max weights, see State::ErrorWeight().
+};
+
+struct State {
+  pixel_type_w prediction[kNumPredictors] = {};
+  pixel_type_w pred = 0;  // *before* removing the added bits.
+  std::vector<uint32_t> pred_errors[kNumPredictors];  // Two rows each.
+  std::vector<int32_t> error;  // Two rows of (pred - val), see UpdateErrors.
+  const Header header;
+
+  // Allows to approximate division by a number from 1 to 64.
+  uint32_t divlookup[64];  // divlookup[i] = 2^24 / (i + 1).
+
+  constexpr static pixel_type_w AddBits(pixel_type_w x) {
+    return uint64_t(x) << kPredExtraBits;
+  }
+
+  State(Header header, size_t xsize, size_t ysize) : header(header) {
+    // Extra margin to avoid out-of-bounds writes.
+    // All have space for two rows of data.
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      pred_errors[i].resize((xsize + 2) * 2);
+    }
+    error.resize((xsize + 2) * 2);
+    // Initialize division lookup table.
+    for (int i = 0; i < 64; i++) {
+      divlookup[i] = (1 << 24) / (i + 1);
+    }
+  }
+
+  // Approximates 4+(maxweight<<24)/(x+1), avoiding division
+  JXL_INLINE uint32_t ErrorWeight(uint64_t x, uint32_t maxweight) const {
+    int shift = static_cast<int>(FloorLog2Nonzero(x + 1)) - 5;
+    if (shift < 0) shift = 0;
+    return 4 + ((maxweight * divlookup[x >> shift]) >> shift);
+  }
+
+  // Approximates the weighted average of the input values with the given
+  // weights, avoiding division. Weights must sum to at least 16.
+  JXL_INLINE pixel_type_w
+  WeightedAverage(const pixel_type_w *JXL_RESTRICT p,
+                  std::array<uint32_t, kNumPredictors> w) const {
+    uint32_t weight_sum = 0;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      weight_sum += w[i];
+    }
+    JXL_DASSERT(weight_sum > 15);
+    uint32_t log_weight = FloorLog2Nonzero(weight_sum);  // at least 4.
+    weight_sum = 0;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      w[i] >>= log_weight - 4;
+      weight_sum += w[i];
+    }
+    // for rounding.
+    pixel_type_w sum = (weight_sum >> 1) - 1;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      sum += p[i] * w[i];
+    }
+    return (sum * divlookup[weight_sum - 1]) >> 24;
+  }
+
+  template <bool compute_properties>
+  JXL_INLINE pixel_type_w Predict(size_t x, size_t y, size_t xsize,
+                                  pixel_type_w N, pixel_type_w W,
+                                  pixel_type_w NE, pixel_type_w NW,
+                                  pixel_type_w NN, Properties *properties,
+                                  size_t offset) {
+    size_t cur_row = y & 1 ? 0 : (xsize + 2);  // Rows alternate by y parity.
+    size_t prev_row = y & 1 ? (xsize + 2) : 0;
+    size_t pos_N = prev_row + x;
+    size_t pos_NE = x < xsize - 1 ? pos_N + 1 : pos_N;
+    size_t pos_NW = x > 0 ? pos_N - 1 : pos_N;
+    std::array<uint32_t, kNumPredictors> weights;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      // pred_errors[pos_N] also contains the error of pixel W.
+      // pred_errors[pos_NW] also contains the error of pixel WW.
+      weights[i] = pred_errors[i][pos_N] + pred_errors[i][pos_NE] +
+                   pred_errors[i][pos_NW];
+      weights[i] = ErrorWeight(weights[i], header.w[i]);
+    }
+
+    N = AddBits(N);
+    W = AddBits(W);
+    NE = AddBits(NE);
+    NW = AddBits(NW);
+    NN = AddBits(NN);
+
+    pixel_type_w teW = x == 0 ? 0 : error[cur_row + x - 1];
+    pixel_type_w teN = error[pos_N];
+    pixel_type_w teNW = error[pos_NW];
+    pixel_type_w sumWN = teN + teW;
+    pixel_type_w teNE = error[pos_NE];
+
+    if (compute_properties) {
+      pixel_type_w p = teW;
+      if (std::abs(teN) > std::abs(p)) p = teN;
+      if (std::abs(teNW) > std::abs(p)) p = teNW;
+      if (std::abs(teNE) > std::abs(p)) p = teNE;
+      (*properties)[offset++] = p;
+    }
+
+    prediction[0] = W + NE - N;
+    prediction[1] = N - (((sumWN + teNE) * header.p1C) >> 5);
+    prediction[2] = W - (((sumWN + teNW) * header.p2C) >> 5);
+    prediction[3] =
+        N - ((teNW * header.p3Ca + teN * header.p3Cb + teNE * header.p3Cc +
+              (NN - N) * header.p3Cd + (NW - W) * header.p3Ce) >>
+             5);
+
+    pred = WeightedAverage(prediction, weights);
+
+    // If all three have the same sign (and are not all equal), skip clamping.
+    if (((teN ^ teW) | (teN ^ teNW)) > 0) {
+      return (pred + kPredictionRound) >> kPredExtraBits;
+    }
+
+    // Otherwise, clamp to min/max of neighbouring pixels (just W, NE, N).
+    pixel_type_w mx = std::max(W, std::max(NE, N));
+    pixel_type_w mn = std::min(W, std::min(NE, N));
+    pred = std::max(mn, std::min(mx, pred));
+    return (pred + kPredictionRound) >> kPredExtraBits;
+  }
+
+  JXL_INLINE void UpdateErrors(pixel_type_w val, size_t x, size_t y,
+                               size_t xsize) {
+    size_t cur_row = y & 1 ? 0 : (xsize + 2);
+    size_t prev_row = y & 1 ? (xsize + 2) : 0;
+    val = AddBits(val);
+    error[cur_row + x] = pred - val;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      pixel_type_w err =
+          (std::abs(prediction[i] - val) + kPredictionRound) >> kPredExtraBits;
+      // For predicting in the next row.
+      pred_errors[i][cur_row + x] = err;
+      // Add the error on this pixel to the error on the NE pixel. This has the
+      // effect of adding the error on this pixel to the E and EE pixels.
+      pred_errors[i][prev_row + x + 1] += err;
+    }
+  }
+};
+
+// Encoder helper: fills *header with preset i (0..3; any other i -> last one).
+inline void PredictorMode(int i, Header *header) {
+  switch (i) {
+    case 0:
+      // ~ lossless16 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xc;
+      header->w[3] = 0xc;
+      header->p1C = 16;
+      header->p2C = 10;
+      header->p3Ca = 7;
+      header->p3Cb = 7;
+      header->p3Cc = 7;
+      header->p3Cd = 0;
+      header->p3Ce = 0;
+      break;
+    case 1:
+      // ~ default lossless8 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xc;
+      header->w[3] = 0xb;
+      header->p1C = 8;
+      header->p2C = 8;
+      header->p3Ca = 4;
+      header->p3Cb = 0;
+      header->p3Cc = 3;
+      header->p3Cd = 23;
+      header->p3Ce = 2;
+      break;
+    case 2:
+      // ~ west lossless8 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xd;
+      header->w[3] = 0xc;
+      header->p1C = 10;
+      header->p2C = 9;
+      header->p3Ca = 7;
+      header->p3Cb = 0;
+      header->p3Cc = 0;
+      header->p3Cd = 16;
+      header->p3Ce = 9;
+      break;
+    case 3:
+      // ~ north lossless8 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xd;
+      header->w[2] = 0xc;
+      header->w[3] = 0xc;
+      header->p1C = 16;
+      header->p2C = 8;
+      header->p3Ca = 0;
+      header->p3Cb = 16;
+      header->p3Cc = 0;
+      header->p3Cd = 23;
+      header->p3Ce = 0;
+      break;
+    case 4:
+    default:
+      // Catch-all preset: used for 4 and for every value not listed above.
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xc;
+      header->w[3] = 0xc;
+      header->p1C = 10;
+      header->p2C = 10;
+      header->p3Ca = 5;
+      header->p3Cb = 5;
+      header->p3Cc = 5;
+      header->p3Cd = 12;
+      header->p3Ce = 4;
+      break;
+  }
+}
+}  // namespace weighted
+
+// Stores a node and its two children at the same time. This significantly
+// reduces the number of branches needed during decoding.
+struct FlatDecisionNode {
+  // Property + splitval of the top node.
+  int32_t property0;  // -1 if leaf.
+  union {
+    PropertyVal splitval0;
+    Predictor predictor;  // Read by MATreeLookup::Lookup for leaves.
+  };
+  uint32_t childID;  // childID is ctx id if leaf.
+  // Property+splitval of the two child nodes.
+  union {
+    PropertyVal splitvals[2];
+    int32_t multiplier;  // Read by MATreeLookup::Lookup for leaves.
+  };
+  union {
+    int32_t properties[2];
+    int64_t predictor_offset;  // Read by MATreeLookup::Lookup for leaves.
+  };
+};
+using FlatTree = std::vector<FlatDecisionNode>;
+
+class MATreeLookup {
+ public:
+  explicit MATreeLookup(const FlatTree &tree) : nodes_(tree) {}  // No copy.
+  struct LookupResult {
+    uint32_t context;
+    Predictor predictor;
+    int64_t offset;
+    int32_t multiplier;
+  };
+  JXL_INLINE LookupResult Lookup(const Properties &properties) const {
+    uint32_t pos = 0;
+    while (true) {
+      const FlatDecisionNode &node = nodes_[pos];
+      if (node.property0 < 0) {  // Leaf.
+        return {node.childID, node.predictor, node.predictor_offset,
+                node.multiplier};
+      }
+      bool p0 = properties[node.property0] <= node.splitval0;
+      uint32_t off0 = properties[node.properties[0]] <= node.splitvals[0];
+      uint32_t off1 =
+          2 | (properties[node.properties[1]] <= node.splitvals[1] ? 1 : 0);
+      pos = node.childID + (p0 ? off1 : off0);  // childID + 0..3.
+    }
+  }
+
+ private:
+  const FlatTree &nodes_;  // Reference only; the tree must outlive this.
+};
+
+static constexpr size_t kExtraPropsPerChannel = 4;  // Per reference channel.
+static constexpr size_t kNumNonrefProperties =
+    kNumStaticProperties + 13 + weighted::kNumProperties;
+
+constexpr size_t kWPProp = kNumNonrefProperties - weighted::kNumProperties;
+constexpr size_t kGradientProp = 9;  // Slot zeroed by InitPropsRow().
+
+// Returns n + w - l, clamped to [min(n, w), max(n, w)].
+static JXL_INLINE int32_t ClampedGradient(const int32_t n, const int32_t w,
+                                          const int32_t l) {
+  const int32_t m = std::min(n, w);
+  const int32_t M = std::max(n, w);
+  // The end result of this operation doesn't overflow or underflow if the
+  // result is between m and M, but the intermediate value may overflow, so we
+  // do the intermediate operations in uint32_t and check later if we had an
+  // overflow or underflow condition comparing m, M and l directly.
+  // grad = M + m - l = n + w - l
+  const int32_t grad =
+      static_cast<int32_t>(static_cast<uint32_t>(n) + static_cast<uint32_t>(w) -
+                           static_cast<uint32_t>(l));
+  // We use two sets of ternary operators to force the evaluation of them in
+  // any case, allowing the compiler to avoid branches and use cmovl/cmovg in
+  // x86.
+  const int32_t grad_clamp_M = (l < m) ? M : grad;
+  return (l > M) ? m : grad_clamp_M;
+}
+
+inline pixel_type_w Select(pixel_type_w a, pixel_type_w b, pixel_type_w c) {
+  pixel_type_w p = a + b - c;
+  pixel_type_w pa = std::abs(p - a);  // p - a == b - c.
+  pixel_type_w pb = std::abs(p - b);  // p - b == a - c.
+  return pa < pb ? a : b;  // Ties pick b.
+}
+
+inline void PrecomputeReferences(const Channel &ch, size_t y,
+                                 const Image &image, uint32_t i,
+                                 Channel *references) {
+  ZeroFillImage(&references->plane);  // Slots not filled below stay 0.
+  uint32_t offset = 0;
+  size_t num_extra_props = references->w;
+  intptr_t onerow = references->plane.PixelsPerRow();
+  for (int32_t j = static_cast<int32_t>(i) - 1;
+       j >= 0 && offset < num_extra_props; j--) {
+    if (image.channel[j].w != image.channel[i].w ||
+        image.channel[j].h != image.channel[i].h) {
+      continue;  // Skip channels whose size differs from channel i.
+    }
+    if (image.channel[j].hshift != image.channel[i].hshift) continue;
+    if (image.channel[j].vshift != image.channel[i].vshift) continue;
+    pixel_type *JXL_RESTRICT rp = references->Row(0) + offset;
+    const pixel_type *JXL_RESTRICT rpp = image.channel[j].Row(y);
+    const pixel_type *JXL_RESTRICT rpprev = image.channel[j].Row(y ? y - 1 : 0);
+    for (size_t x = 0; x < ch.w; x++, rp += onerow) {  // One row per x.
+      pixel_type_w v = rpp[x];
+      rp[0] = std::abs(v);
+      rp[1] = v;
+      pixel_type_w vleft = (x ? rpp[x - 1] : 0);
+      pixel_type_w vtop = (y ? rpprev[x] : vleft);
+      pixel_type_w vtopleft = (x && y ? rpprev[x - 1] : vleft);
+      pixel_type_w vpredicted = ClampedGradient(vleft, vtop, vtopleft);
+      rp[2] = std::abs(v - vpredicted);
+      rp[3] = v - vpredicted;
+    }
+
+    offset += kExtraPropsPerChannel;
+  }
+}
+
+struct PredictionResult {
+  int context = 0;
+  pixel_type_w guess = 0;
+  Predictor predictor;  // No default initializer.
+  int32_t multiplier;   // No default initializer.
+};
+
+inline void InitPropsRow(
+    Properties *p,
+    const std::array<pixel_type, kNumStaticProperties> &static_props,
+    const int y) {
+  for (size_t i = 0; i < kNumStaticProperties; i++) {  // Copy static props.
+    (*p)[i] = static_props[i];
+  }
+  (*p)[2] = y;  // Row index slot.
+  (*p)[kGradientProp] = 0;  // local gradient.
+}
+
+namespace detail {
+enum PredictorMode {
+  kUseTree = 1,  // Predict() looks up context/predictor in the MA tree.
+  kUseWP = 2,    // Predict() runs the weighted predictor.
+  kForceComputeProperties = 4,  // Fill properties even without kUseTree.
+  kAllPredictions = 8,  // Presumably fills predictions[] (confirm).
+  kNoEdgeCases = 16  // Neighbor reads skip the x/y bounds checks.
+};
+
+JXL_INLINE pixel_type_w PredictOne(Predictor p, pixel_type_w left,
+                                   pixel_type_w top, pixel_type_w toptop,
+                                   pixel_type_w topleft, pixel_type_w topright,
+                                   pixel_type_w leftleft,
+                                   pixel_type_w toprightright,
+                                   pixel_type_w wp_pred) {
+  switch (p) {
+    case Predictor::Zero:
+      return pixel_type_w{0};
+    case Predictor::Left:
+      return left;
+    case Predictor::Top:
+      return top;
+    case Predictor::Select:
+      return Select(left, top, topleft);
+    case Predictor::Weighted:
+      return wp_pred;  // Computed by the caller.
+    case Predictor::Gradient:
+      return pixel_type_w{ClampedGradient(left, top, topleft)};
+    case Predictor::TopLeft:
+      return topleft;
+    case Predictor::TopRight:
+      return topright;
+    case Predictor::LeftLeft:
+      return leftleft;
+    case Predictor::Average0:
+      return (left + top) / 2;  // Integer division truncates toward zero.
+    case Predictor::Average1:
+      return (left + topleft) / 2;
+    case Predictor::Average2:
+      return (topleft + top) / 2;
+    case Predictor::Average3:
+      return (top + topright) / 2;
+    case Predictor::Average4:
+      return (6 * top - 2 * toptop + 7 * left + 1 * leftleft +
+              1 * toprightright + 3 * topright + 8) /
+             16;
+    default:  // Any predictor value not listed above.
+      return pixel_type_w{0};
+  }
+}
+
+template <int mode>  // `mode` is a bitwise OR of PredictorMode flags.
+JXL_INLINE PredictionResult Predict(  // pp: current pixel; onerow: row stride.
+    Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp,
+    const intptr_t onerow, const size_t x, const size_t y, Predictor predictor,
+    const MATreeLookup *lookup, const Channel *references,
+    weighted::State *wp_state, pixel_type_w *predictions) {
+  // We start in position 3 because of 2 static properties + y.
+  size_t offset = 3;
+  constexpr bool compute_properties =
+      mode & kUseTree || mode & kForceComputeProperties;
+  constexpr bool nec = mode & kNoEdgeCases;  // true: no border checks below.
+  pixel_type_w left = (nec || x ? pp[-1] : (y ? pp[-onerow] : 0));
+  pixel_type_w top = (nec || y ? pp[-onerow] : left);
+  pixel_type_w topleft = (nec || (x && y) ? pp[-1 - onerow] : left);
+  pixel_type_w topright = (nec || (x + 1 < w && y) ? pp[1 - onerow] : top);
+  pixel_type_w leftleft = (nec || x > 1 ? pp[-2] : left);
+  pixel_type_w toptop = (nec || y > 1 ? pp[-onerow - onerow] : top);
+  pixel_type_w toprightright =
+      (nec || (x + 2 < w && y) ? pp[2 - onerow] : topright);
+
+  if (compute_properties) {
+    // location
+    (*p)[offset++] = x;
+    // neighbors
+    (*p)[offset++] = std::abs(top);
+    (*p)[offset++] = std::abs(left);
+    (*p)[offset++] = top;
+    (*p)[offset++] = left;
+
+    // left minus the next slot, which still holds the previous call's value
+    (*p)[offset] = left - (*p)[offset + 1];
+    offset++;
+    // local gradient
+    (*p)[offset++] = left + top - topleft;
+
+    // FFV1 context properties
+    (*p)[offset++] = left - topleft;
+    (*p)[offset++] = topleft - top;
+    (*p)[offset++] = top - topright;
+    (*p)[offset++] = top - toptop;
+    (*p)[offset++] = left - leftleft;
+  }
+
+  pixel_type_w wp_pred = 0;  // Stays 0 when kUseWP is not set.
+  if (mode & kUseWP) {
+    wp_pred = wp_state->Predict<compute_properties>(
+        x, y, w, top, left, topright, topleft, toptop, p, offset);
+  }
+  if (!nec && compute_properties) {  // NEC callers must pass empty references.
+    offset += weighted::kNumProperties;  // Step over the WP property slots.
+    // Extra properties.
+    const pixel_type *JXL_RESTRICT rp = references->Row(x);
+    for (size_t i = 0; i < references->w; i++) {
+      (*p)[offset++] = rp[i];
+    }
+  }
+  PredictionResult result;  // NOTE(review): guess must start at 0 - confirm.
+  if (mode & kUseTree) {
+    MATreeLookup::LookupResult lr = lookup->Lookup(*p);
+    result.context = lr.context;
+    result.guess = lr.offset;
+    result.multiplier = lr.multiplier;
+    predictor = lr.predictor;  // The tree's choice overrides the argument.
+  }
+  if (mode & kAllPredictions) {
+    for (size_t i = 0; i < kNumModularPredictors; i++) {
+      predictions[i] = PredictOne((Predictor)i, left, top, toptop, topleft,
+                                  topright, leftleft, toprightright, wp_pred);
+    }
+  }
+  result.guess += PredictOne(predictor, left, top, toptop, topleft, topright,
+                             leftleft, toprightright, wp_pred);
+  result.predictor = predictor;
+
+  return result;
+}
+}  // namespace detail
+
+inline PredictionResult PredictNoTreeNoWP(size_t w,
+                                          const pixel_type *JXL_RESTRICT pp,
+                                          const intptr_t onerow, const int x,
+                                          const int y, Predictor predictor) {
+  return detail::Predict</*mode=*/0>(  // Fixed predictor; no properties.
+      /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr,
+      /*references=*/nullptr, /*wp_state=*/nullptr, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictNoTreeWP(size_t w,
+                                        const pixel_type *JXL_RESTRICT pp,
+                                        const intptr_t onerow, const int x,
+                                        const int y, Predictor predictor,
+                                        weighted::State *wp_state) {
+  return detail::Predict<detail::kUseWP>(  // Fixed predictor plus WP state.
+      /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr,
+      /*references=*/nullptr, wp_state, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictTreeNoWP(Properties *p, size_t w,
+                                        const pixel_type *JXL_RESTRICT pp,
+                                        const intptr_t onerow, const int x,
+                                        const int y,
+                                        const MATreeLookup &tree_lookup,
+                                        const Channel &references) {
+  return detail::Predict<detail::kUseTree>(  // Tree picks the predictor.
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      /*wp_state=*/nullptr, /*predictions=*/nullptr);
+}
+// Unchecked variant: only for y > 1, x > 1, x < w-2 and empty references.
+JXL_INLINE PredictionResult
+PredictTreeNoWPNEC(Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp,
+                   const intptr_t onerow, const int x, const int y,
+                   const MATreeLookup &tree_lookup, const Channel &references) {
+  return detail::Predict<detail::kUseTree | detail::kNoEdgeCases>(
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      /*wp_state=*/nullptr, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictTreeWP(Properties *p, size_t w,
+                                      const pixel_type *JXL_RESTRICT pp,
+                                      const intptr_t onerow, const int x,
+                                      const int y,
+                                      const MATreeLookup &tree_lookup,
+                                      const Channel &references,
+                                      weighted::State *wp_state) {
+  return detail::Predict<detail::kUseTree | detail::kUseWP>(
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      wp_state, /*predictions=*/nullptr);  // Tree lookup with WP enabled.
+}
+
+inline PredictionResult PredictLearn(Properties *p, size_t w,
+                                     const pixel_type *JXL_RESTRICT pp,
+                                     const intptr_t onerow, const int x,
+                                     const int y, Predictor predictor,
+                                     const Channel &references,
+                                     weighted::State *wp_state) {
+  return detail::Predict<detail::kForceComputeProperties | detail::kUseWP>(
+      p, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, &references,
+      wp_state, /*predictions=*/nullptr);  // Properties filled; no tree.
+}
+
+inline void PredictLearnAll(Properties *p, size_t w,
+                            const pixel_type *JXL_RESTRICT pp,
+                            const intptr_t onerow, const int x, const int y,
+                            const Channel &references,
+                            weighted::State *wp_state,
+                            pixel_type_w *predictions) {  // Out: one per predictor.
+  detail::Predict<detail::kForceComputeProperties | detail::kUseWP |
+                  detail::kAllPredictions>(
+      p, w, pp, onerow, x, y, Predictor::Zero,
+      /*lookup=*/nullptr, &references, wp_state, predictions);
+}
+
+inline void PredictAllNoWP(size_t w, const pixel_type *JXL_RESTRICT pp,
+                           const intptr_t onerow, const int x, const int y,
+                           pixel_type_w *predictions) {  // Out: one per predictor.
+  detail::Predict<detail::kAllPredictions>(
+      /*p=*/nullptr, w, pp, onerow, x, y, Predictor::Zero,
+      /*lookup=*/nullptr,  // Without WP the Weighted entry is written as 0.
+      /*references=*/nullptr, /*wp_state=*/nullptr, predictions);
+}
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc
new file mode 100644
index 0000000000..66562f7dfd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc
@@ -0,0 +1,107 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/dec_ma.h"
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+namespace {
+
+Status ValidateTree(
+    const Tree &tree,
+    const std::vector<std::pair<pixel_type, pixel_type>> &prop_bounds,
+    size_t root) {  // prop_bounds[i]: (min, max) still reachable for i.
+  if (tree[root].property == -1) return true;  // Leaves are always valid.
+  size_t p = tree[root].property;
+  int val = tree[root].splitval;
+  if (prop_bounds[p].first > val) return JXL_FAILURE("Invalid tree");
+  // Splitting at max value makes no sense: the "<= val" range (rchild) would
+  // equal the parent's and the "> val" range (lchild) is empty (min > max).
+  if (prop_bounds[p].second <= val) return JXL_FAILURE("Invalid tree");
+  auto new_bounds = prop_bounds;  // Full copy at every inner node.
+  new_bounds[p].first = val + 1;  // lchild: property > val.
+  JXL_RETURN_IF_ERROR(ValidateTree(tree, new_bounds, tree[root].lchild));
+  new_bounds[p] = prop_bounds[p];
+  new_bounds[p].second = val;  // rchild: property <= val.
+  return ValidateTree(tree, new_bounds, tree[root].rchild);
+}
+
+Status DecodeTree(BitReader *br, ANSSymbolReader *reader,
+                  const std::vector<uint8_t> &context_map, Tree *tree,
+                  size_t tree_size_limit) {
+  size_t leaf_id = 0;    // Leaves are numbered in decoding order.
+  size_t to_decode = 1;  // Nodes announced but not yet read (the root).
+  tree->clear();
+  while (to_decode > 0) {
+    JXL_RETURN_IF_ERROR(br->AllReadsWithinBounds());
+    if (tree->size() > tree_size_limit) {
+      return JXL_FAILURE("Tree is too large: %" PRIuS " nodes vs %" PRIuS
+                         " max nodes",
+                         tree->size(), tree_size_limit);
+    }
+    to_decode--;
+    uint32_t prop1 = reader->ReadHybridUint(kPropertyContext, br, context_map);
+    if (prop1 > 256) return JXL_FAILURE("Invalid tree property value");
+    int property = prop1 - 1;  // prop1 == 0 encodes a leaf; at most 255.
+    if (property == -1) {
+      size_t predictor =
+          reader->ReadHybridUint(kPredictorContext, br, context_map);
+      if (predictor >= kNumModularPredictors) {
+        return JXL_FAILURE("Invalid predictor");
+      }
+      int64_t predictor_offset =
+          UnpackSigned(reader->ReadHybridUint(kOffsetContext, br, context_map));
+      uint32_t mul_log =
+          reader->ReadHybridUint(kMultiplierLogContext, br, context_map);
+      if (mul_log >= 31) {
+        return JXL_FAILURE("Invalid multiplier logarithm");
+      }
+      uint32_t mul_bits =
+          reader->ReadHybridUint(kMultiplierBitsContext, br, context_map);
+      if (mul_bits >= (1u << (31u - mul_log)) - 1u) {  // No +1: it can wrap.
+        return JXL_FAILURE("Invalid multiplier");
+      }
+      uint32_t multiplier = (mul_bits + 1U) << mul_log;  // In [1, 2^31).
+      tree->emplace_back(-1, 0, leaf_id++, 0, static_cast<Predictor>(predictor),
+                         predictor_offset, multiplier);
+      continue;
+    }
+    int splitval =
+        UnpackSigned(reader->ReadHybridUint(kSplitValContext, br, context_map));
+    tree->emplace_back(property, splitval, tree->size() + to_decode + 1,
+                       tree->size() + to_decode + 2, Predictor::Zero, 0, 1);
+    to_decode += 2;  // The two children come after all pending nodes.
+  }
+  std::vector<std::pair<pixel_type, pixel_type>> prop_bounds;
+  prop_bounds.resize(256, {std::numeric_limits<pixel_type>::min(),
+                           std::numeric_limits<pixel_type>::max()});
+  return ValidateTree(*tree, prop_bounds, 0);  // Index 0 is the root.
+}
+}  // namespace
+
+Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit) {
+  std::vector<uint8_t> tree_context_map;
+  ANSCode tree_code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kNumTreeContexts, &tree_code, &tree_context_map));
+  // TODO(eustas): investigate more infinite tree cases.
+  if (tree_code.degenerate_symbols[tree_context_map[kPropertyContext]] > 0) {
+    return JXL_FAILURE("Infinite tree");  // Presumably: only splits, no leaf.
+  }
+  ANSSymbolReader reader(&tree_code, br);
+  JXL_RETURN_IF_ERROR(DecodeTree(br, &reader, tree_context_map, tree,
+                                 std::min(tree_size_limit, kMaxTreeSize)));
+  if (!reader.CheckANSFinalState()) {
+    return JXL_FAILURE("ANS decode final state failed");
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h
new file mode 100644
index 0000000000..a910c4deb1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
+#define LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// One node of an MA tree: a split on a property, or a leaf (property == -1).
+struct PropertyDecisionNode {
+  PropertyVal splitval;
+  int16_t property;  // -1: leaf node; DecodeTree then stores a leaf id in lchild
+  uint32_t lchild;
+  uint32_t rchild;
+  Predictor predictor;
+  int64_t predictor_offset;
+  uint32_t multiplier;
+
+  PropertyDecisionNode(int p, int split_val, int lchild, int rchild,
+                       Predictor predictor, int64_t predictor_offset,
+                       uint32_t multiplier)
+      : splitval(split_val),
+        property(p),
+        lchild(lchild),
+        rchild(rchild),
+        predictor(predictor),
+        predictor_offset(predictor_offset),
+        multiplier(multiplier) {}
+  PropertyDecisionNode()  // Default: a leaf with Zero predictor, x1, no offset.
+      : splitval(0),
+        property(-1),
+        lchild(0),
+        rchild(0),
+        predictor(Predictor::Zero),
+        predictor_offset(0),
+        multiplier(1) {}
+  static PropertyDecisionNode Leaf(Predictor predictor, int64_t offset = 0,
+                                   uint32_t multiplier = 1) {
+    return PropertyDecisionNode(-1, 0, 0, 0, predictor, offset, multiplier);
+  }
+  static PropertyDecisionNode Split(int p, int split_val, int lchild,
+                                    int rchild = -1) {
+    if (rchild == -1) rchild = lchild + 1;  // Default: the node after lchild.
+    return PropertyDecisionNode(p, split_val, lchild, rchild, Predictor::Zero,
+                                0, 1);
+  }
+};
+
+using Tree = std::vector<PropertyDecisionNode>;  // Children are indices.
+
+Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
new file mode 100644
index 0000000000..f2a1705e4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
@@ -0,0 +1,124 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/enc_debug_tree.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jxl/base/os_macros.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/options.h"
+
+#if JXL_OS_IOS
+#define JXL_ENABLE_DOT 0  // iOS lacks C89 system()
+#else
+#define JXL_ENABLE_DOT 1
+#endif
+
+namespace jxl {
+
+const char *PredictorName(Predictor p) {  // Short label used in tree dumps.
+  switch (p) {
+    case Predictor::Zero:
+      return "Zero";
+    case Predictor::Left:
+      return "Left";
+    case Predictor::Top:
+      return "Top";
+    case Predictor::Average0:
+      return "Avg0";
+    case Predictor::Average1:
+      return "Avg1";
+    case Predictor::Average2:
+      return "Avg2";
+    case Predictor::Average3:
+      return "Avg3";
+    case Predictor::Average4:
+      return "Avg4";
+    case Predictor::Select:
+      return "Sel";
+    case Predictor::Gradient:
+      return "Grd";
+    case Predictor::Weighted:
+      return "Wgh";
+    case Predictor::TopLeft:
+      return "TopL";
+    case Predictor::TopRight:
+      return "TopR";
+    case Predictor::LeftLeft:
+      return "LL";
+    default:
+      return "INVALID";  // Any value not listed above.
+  };
+}
+
+std::string PropertyName(size_t i) {  // Human-readable name of property i.
+  static_assert(kNumNonrefProperties == 16, "Update this function");
+  switch (i) {
+    case 0:
+      return "c";
+    case 1:
+      return "g";
+    case 2:
+      return "y";
+    case 3:
+      return "x";
+    case 4:
+      return "|N|";
+    case 5:
+      return "|W|";
+    case 6:
+      return "N";
+    case 7:
+      return "W";
+    case 8:
+      return "W-WW-NW+NWW";
+    case 9:
+      return "W+N-NW";
+    case 10:
+      return "W-NW";
+    case 11:
+      return "NW-N";
+    case 12:
+      return "N-NE";
+    case 13:
+      return "N-NN";
+    case 14:
+      return "W-WW";
+    case 15:
+      return "WGH";
+    default:  // i >= 16: labelled with the negative number 15 - i.
+      return "ch[" + ToString(15 - (int)i) + "]";
+  }
+}
+
+void PrintTree(const Tree &tree, const std::string &path) {
+  FILE *f = fopen((path + ".dot").c_str(), "w");  // Graphviz description.
+  if (f == nullptr) return;  // Cannot create the file: skip the debug dump.
+  fprintf(f, "graph{\n");
+  for (size_t cur = 0; cur < tree.size(); cur++) {
+    if (tree[cur].property < 0) {  // Leaf: predictor, offset and multiplier.
+      fprintf(f, "n%05" PRIuS " [label=\"%s%+" PRId64 " (x%u)\"];\n", cur,
+              PredictorName(tree[cur].predictor), tree[cur].predictor_offset,
+              tree[cur].multiplier);
+    } else {  // Split: the test it performs plus an edge to each child.
+      fprintf(f, "n%05" PRIuS " [label=\"%s>%d\"];\n", cur,
+              PropertyName(tree[cur].property).c_str(), tree[cur].splitval);
+      fprintf(f, "n%05" PRIuS " -- n%05u;\n", cur, tree[cur].lchild);
+      fprintf(f, "n%05" PRIuS " -- n%05u;\n", cur, tree[cur].rchild);
+    }
+  }
+  fprintf(f, "}\n");
+  fclose(f);
+#if JXL_ENABLE_DOT
+  JXL_ASSERT(
+      system(("dot " + path + ".dot -T svg -o " + path + ".svg").c_str()) == 0);
+#endif
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h
new file mode 100644
index 0000000000..78deaab1b8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+const char *PredictorName(Predictor p);  // Short label, e.g. "Grd".
+std::string PropertyName(size_t i);      // Label of property index i.
+
+void PrintTree(const Tree &tree, const std::string &path);  // Dumps <path>.dot
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
new file mode 100644
index 0000000000..c8c183335e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
@@ -0,0 +1,562 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cinttypes>
+#include <limits>
+#include <numeric>
+#include <queue>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_debug_tree.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+
+namespace {
+// Plot tree (if enabled) and predictor usage map.
+constexpr bool kWantDebug = false;  // Gates allocating/dumping predictor_img.
+constexpr bool kPrintTree = false;
+
+inline std::array<uint8_t, 3> PredictorColor(Predictor p) {
+  switch (p) {  // One value per plane of the debug image.
+    case Predictor::Zero:
+      return {{0, 0, 0}};
+    case Predictor::Left:
+      return {{255, 0, 0}};
+    case Predictor::Top:
+      return {{0, 255, 0}};
+    case Predictor::Average0:
+      return {{0, 0, 255}};
+    case Predictor::Average4:
+      return {{192, 128, 128}};
+    case Predictor::Select:
+      return {{255, 255, 0}};
+    case Predictor::Gradient:
+      return {{255, 0, 255}};
+    case Predictor::Weighted:
+      return {{0, 255, 255}};
+      // TODO: give the remaining predictors their own colors.
+    default:
+      return {{255, 255, 255}};
+  };
+}
+
+}  // namespace
+
+void GatherTreeData(const Image &image, pixel_type chan, size_t group_id,
+                    const weighted::Header &wp_header,
+                    const ModularOptions &options, TreeSamples &tree_samples,
+                    size_t *total_pixels) {
+  const Channel &channel = image.channel[chan];
+
+  JXL_DEBUG_V(7, "Learning %" PRIuS "x%" PRIuS " channel %d", channel.w,
+              channel.h, chan);
+
+  std::array<pixel_type, kNumStaticProperties> static_props = {
+      {chan, (int)group_id}};
+  Properties properties(kNumNonrefProperties +
+                        kExtraPropsPerChannel * options.max_properties);
+  double pixel_fraction = std::min(1.0f, options.nb_repeats);
+  // a fraction of 0 is used to disable learning entirely.
+  if (pixel_fraction > 0) {  // Otherwise aim for at least ~1024 samples.
+    pixel_fraction = std::max(pixel_fraction,
+                              std::min(1.0, 1024.0 / (channel.w * channel.h)));
+  }
+  uint64_t threshold =
+      (std::numeric_limits<uint64_t>::max() >> 32) * pixel_fraction;
+  uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
+                   static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};  // Fixed seed.
+  // Xorshift128+ adapted from xorshift128+-inl.h
+  auto use_sample = [&]() {  // Advances `s`; true means "keep this pixel".
+    auto s1 = s[0];
+    const auto s0 = s[1];
+    const auto bits = s1 + s0;  // b, c
+    s[0] = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s[1] = s1;
+    return (bits >> 32) <= threshold;  // Top 32 bits against the threshold.
+  };
+
+  const intptr_t onerow = channel.plane.PixelsPerRow();
+  Channel references(properties.size() - kNumNonrefProperties, channel.w);
+  weighted::State wp_state(wp_header, channel.w, channel.h);
+  tree_samples.PrepareForSamples(pixel_fraction * channel.h * channel.w + 64);
+  for (size_t y = 0; y < channel.h; y++) {
+    const pixel_type *JXL_RESTRICT p = channel.Row(y);
+    PrecomputeReferences(channel, y, image, chan, &references);
+    InitPropsRow(&properties, static_props, y);
+    // TODO(veluca): avoid computing WP if we don't use its property or
+    // predictions.
+    for (size_t x = 0; x < channel.w; x++) {
+      pixel_type_w pred[kNumModularPredictors];  // Only partly set below.
+      if (tree_samples.NumPredictors() != 1) {
+        PredictLearnAll(&properties, channel.w, p + x, onerow, x, y, references,
+                        &wp_state, pred);
+      } else {  // Single candidate: fill only that predictor's entry.
+        pred[static_cast<int>(tree_samples.PredictorFromIndex(0))] =
+            PredictLearn(&properties, channel.w, p + x, onerow, x, y,
+                         tree_samples.PredictorFromIndex(0), references,
+                         &wp_state)
+                .guess;
+      }
+      (*total_pixels)++;  // Counts every pixel, sampled or not.
+      if (use_sample()) {
+        tree_samples.AddSample(p[x], properties, pred);
+      }
+      wp_state.UpdateErrors(p[x], x, y, channel.w);
+    }
+  }
+}
+
+Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
+               const ModularOptions &options,
+               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
+               StaticPropRange static_prop_range = {}) {
+  for (size_t i = 0; i < kNumStaticProperties; i++) {
+    if (static_prop_range[i][1] == 0) {  // 0 upper bound: treated as unset.
+      static_prop_range[i][1] = std::numeric_limits<uint32_t>::max();
+    }
+  }
+  if (!tree_samples.HasSamples()) {  // Nothing to learn: single-leaf tree.
+    Tree tree;
+    tree.emplace_back();
+    tree.back().predictor = tree_samples.PredictorFromIndex(0);
+    tree.back().property = -1;
+    tree.back().predictor_offset = 0;
+    tree.back().multiplier = 1;
+    return tree;
+  }
+  float pixel_fraction = tree_samples.NumSamples() * 1.0f / total_pixels;
+  float required_cost = pixel_fraction * 0.9 + 0.1;  // Scales the threshold.
+  tree_samples.AllSamplesDone();
+  Tree tree;
+  ComputeBestTree(tree_samples,
+                  options.splitting_heuristics_node_threshold * required_cost,
+                  multiplier_info, static_prop_range,
+                  options.fast_decode_multiplier, &tree);
+  return tree;
+}
+
+Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
+                                 const weighted::Header &wp_header,
+                                 const Tree &global_tree, Token **tokenpp,
+                                 AuxOut *aux_out, size_t group_id,
+                                 bool skip_encoder_fast_path) {
+  const Channel &channel = image.channel[chan];
+  Token *tokenp = *tokenpp;  // Output cursor; written back before returning.
+  JXL_ASSERT(channel.w != 0 && channel.h != 0);
+
+  Image3F predictor_img;
+  if (kWantDebug) predictor_img = Image3F(channel.w, channel.h);
+
+  JXL_DEBUG_V(6,
+              "Encoding %" PRIuS "x%" PRIuS
+              " channel %d, "
+              "(shift=%i,%i)",
+              channel.w, channel.h, chan, channel.hshift, channel.vshift);
+
+  std::array<pixel_type, kNumStaticProperties> static_props = {
+      {chan, (int)group_id}};
+  bool use_wp, is_wp_only;
+  bool is_gradient_only;
+  size_t num_props;
+  FlatTree tree = FilterTree(global_tree, static_props, &num_props, &use_wp,
+                             &is_wp_only, &is_gradient_only);
+  Properties properties(num_props);
+  MATreeLookup tree_lookup(tree);
+  JXL_DEBUG_V(3, "Encoding using a MA tree with %" PRIuS " nodes", tree.size());
+
+  // Check if this tree is a WP-only tree with a small enough property value
+  // range.
+  // Initialized to avoid clang-tidy complaining.
+  uint16_t context_lookup[2 * kPropRangeFast] = {};
+  int8_t offsets[2 * kPropRangeFast] = {};
+  if (is_wp_only) {
+    is_wp_only = TreeToLookupTable(tree, context_lookup, offsets);
+  }
+  if (is_gradient_only) {  // Fills the same two tables as the WP case above.
+    is_gradient_only = TreeToLookupTable(tree, context_lookup, offsets);
+  }
+
+  if (is_wp_only && !skip_encoder_fast_path) {  // Fast path: WP + table.
+    for (size_t c = 0; c < 3; c++) {  // NOTE(review): runs even if !kWantDebug.
+      FillImage(static_cast<float>(PredictorColor(Predictor::Weighted)[c]),
+                &predictor_img.Plane(c));
+    }
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    Properties properties(1);  // Shadows the outer `properties`.
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        size_t offset = 0;
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        pixel_type_w topright =
+            (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top);
+        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        int32_t guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, channel.w, top, left, topright, topleft, toptop, &properties,
+            offset);
+        uint32_t pos =  // properties[0] clamped into the table's index range.
+            kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
+                                      kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        int32_t residual = r[x] - guess - offsets[pos];
+        *tokenp++ = Token(ctx_id, PackSigned(residual));
+        wp_state.UpdateErrors(r[x], x, y, channel.w);
+      }
+    }
+  } else if (tree.size() == 1 && tree[0].predictor == Predictor::Gradient &&
+             tree[0].multiplier == 1 && tree[0].predictor_offset == 0 &&
+             !skip_encoder_fast_path) {  // Fast path: a lone Gradient leaf.
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Gradient)[c]),
+                &predictor_img.Plane(c));
+    }
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        int32_t guess = ClampedGradient(top, left, topleft);
+        int32_t residual = r[x] - guess;
+        *tokenp++ = Token(tree[0].childID, PackSigned(residual));
+      }
+    }
+  } else if (is_gradient_only && !skip_encoder_fast_path) {  // Gradient + table.
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Gradient)[c]),
+                &predictor_img.Plane(c));
+    }
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        int32_t guess = ClampedGradient(top, left, topleft);
+        uint32_t pos =  // top + left - topleft clamped into the table range.
+            kPropRangeFast +
+            std::min<pixel_type_w>(
+                std::max<pixel_type_w>(-kPropRangeFast, top + left - topleft),
+                kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        int32_t residual = r[x] - guess - offsets[pos];
+        *tokenp++ = Token(ctx_id, PackSigned(residual));
+      }
+    }
+  } else if (tree.size() == 1 && tree[0].predictor == Predictor::Zero &&
+             tree[0].multiplier == 1 && tree[0].predictor_offset == 0 &&
+             !skip_encoder_fast_path) {  // Fast path: pixels coded as they are.
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Zero)[c]),
+                &predictor_img.Plane(c));
+    }
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT p = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        *tokenp++ = Token(tree[0].childID, PackSigned(p[x]));
+      }
+    }
+  } else if (tree.size() == 1 && tree[0].predictor != Predictor::Weighted &&
+             (tree[0].multiplier & (tree[0].multiplier - 1)) == 0 &&
+             tree[0].predictor_offset == 0 && !skip_encoder_fast_path) {
+    // multiplier is a power of 2.
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(tree[0].predictor)[c]),
+                &predictor_img.Plane(c));
+    }
+    uint32_t mul_shift = FloorLog2Nonzero((uint32_t)tree[0].multiplier);
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult pred = PredictNoTreeNoWP(channel.w, r + x, onerow, x,
+                                                  y, tree[0].predictor);
+        pixel_type_w residual = r[x] - pred.guess;
+        JXL_DASSERT((residual >> mul_shift) * tree[0].multiplier == residual);
+        *tokenp++ = Token(tree[0].childID, PackSigned(residual >> mul_shift));
+      }
+    }
+
+  } else if (!use_wp && !skip_encoder_fast_path) {  // General tree, no WP.
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT p = channel.Row(y);
+      PrecomputeReferences(channel, y, image, chan, &references);
+      float *pred_img_row[3];
+      if (kWantDebug) {
+        for (size_t c = 0; c < 3; c++) {
+          pred_img_row[c] = predictor_img.PlaneRow(c, y);
+        }
+      }
+      InitPropsRow(&properties, static_props, y);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult res =
+            PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                            tree_lookup, references);
+        if (kWantDebug) {
+          for (size_t i = 0; i < 3; i++) {
+            pred_img_row[i][x] = PredictorColor(res.predictor)[i];
+          }
+        }
+        pixel_type_w residual = p[x] - res.guess;
+        JXL_ASSERT(residual % res.multiplier == 0);
+        *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier));
+      }
+    }
+  } else {  // General path: tree lookup with the weighted predictor state.
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT p = channel.Row(y);
+      PrecomputeReferences(channel, y, image, chan, &references);
+      float *pred_img_row[3];
+      if (kWantDebug) {
+        for (size_t c = 0; c < 3; c++) {
+          pred_img_row[c] = predictor_img.PlaneRow(c, y);
+        }
+      }
+      InitPropsRow(&properties, static_props, y);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult res =
+            PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                          tree_lookup, references, &wp_state);
+        if (kWantDebug) {
+          for (size_t i = 0; i < 3; i++) {
+            pred_img_row[i][x] = PredictorColor(res.predictor)[i];
+          }
+        }
+        pixel_type_w residual = p[x] - res.guess;
+        JXL_ASSERT(residual % res.multiplier == 0);
+        *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier));
+        wp_state.UpdateErrors(p[x], x, y, channel.w);
+      }
+    }
+  }
+  if (kWantDebug && WantDebugOutput(aux_out)) {
+    aux_out->DumpImage(
+        ("pred_" + ToString(group_id) + "_" + ToString(chan)).c_str(),
+        predictor_img);
+  }
+  *tokenpp = tokenp;  // Hand the advanced cursor back to the caller.
+  return true;
+}
+
+// Modular-encodes `image`. With `tree_samples` set (and no `tree`) it only
+// gathers samples; else it tokenizes with `tree` or a locally learned tree.
+Status ModularEncode(const Image &image, const ModularOptions &options,
+                     BitWriter *writer, AuxOut *aux_out, size_t layer,
+                     size_t group_id, TreeSamples *tree_samples,
+                     size_t *total_pixels, const Tree *tree,
+                     GroupHeader *header, std::vector<Token> *tokens,
+                     size_t *width) {
+  if (image.error) return JXL_FAILURE("Invalid image");
+  size_t nb_channels = image.channel.size();
+  JXL_DEBUG_V(
+      2, "Encoding %" PRIuS "-channel, %i-bit, %" PRIuS "x%" PRIuS " image.",
+      nb_channels, image.bitdepth, image.w, image.h);
+
+  if (nb_channels < 1) return true;  // Nothing to encode without channels.
+
+  // Set up the group header (local storage if the caller passed none).
+  GroupHeader header_storage;
+  if (header == nullptr) header = &header_storage;
+  Bundle::Init(header);
+  if (options.predictor == Predictor::Weighted) {
+    weighted::PredictorMode(options.wp_mode, &header->wp_header);
+  }
+  header->transforms = image.transform;
+  // This doesn't actually work
+  if (tree != nullptr) {
+    header->use_global_tree = true;
+  }
+  if (tree_samples == nullptr && tree == nullptr) {
+    JXL_RETURN_IF_ERROR(Bundle::Write(*header, writer, layer, aux_out));
+  }
+
+  TreeSamples tree_samples_storage;
+  size_t total_pixels_storage = 0;
+  if (!total_pixels) total_pixels = &total_pixels_storage;
+  // If there's no tree, compute one (or gather data to).
+  if (tree == nullptr) {
+    bool gather_data = tree_samples != nullptr;
+    if (tree_samples == nullptr) {
+      JXL_RETURN_IF_ERROR(tree_samples_storage.SetPredictor(
+          options.predictor, options.wp_tree_mode));
+      JXL_RETURN_IF_ERROR(tree_samples_storage.SetProperties(
+          options.splitting_heuristics_properties, options.wp_tree_mode));
+      std::vector<pixel_type> pixel_samples;
+      std::vector<pixel_type> diff_samples;
+      std::vector<uint32_t> group_pixel_count;
+      std::vector<uint32_t> channel_pixel_count;
+      CollectPixelSamples(image, options, 0, group_pixel_count,
+                          channel_pixel_count, pixel_samples, diff_samples);
+      std::vector<ModularMultiplierInfo> dummy_multiplier_info;
+      StaticPropRange range;
+      tree_samples_storage.PreQuantizeProperties(
+          range, dummy_multiplier_info, group_pixel_count, channel_pixel_count,
+          pixel_samples, diff_samples, options.max_property_values);
+    }
+    for (size_t i = 0; i < nb_channels; i++) {
+      if (!image.channel[i].w || !image.channel[i].h) {
+        continue;  // skip empty channels
+      }
+      if (i >= image.nb_meta_channels &&
+          (image.channel[i].w > options.max_chan_size ||
+           image.channel[i].h > options.max_chan_size)) {
+        break;
+      }
+      GatherTreeData(image, i, group_id, header->wp_header, options,
+                     gather_data ? *tree_samples : tree_samples_storage,
+                     total_pixels);
+    }
+    if (gather_data) return true;  // Caller only wanted the samples.
+  }
+
+  JXL_ASSERT((tree == nullptr) == (tokens == nullptr));
+
+  Tree tree_storage;
+  std::vector<std::vector<Token>> tokens_storage(1);
+  // Learn a local tree, then write its tokenized form to `writer`.
+  if (tree == nullptr) {
+    EntropyEncodingData code;
+    std::vector<uint8_t> context_map;
+
+    std::vector<std::vector<Token>> tree_tokens(1);
+    tree_storage =
+        LearnTree(std::move(tree_samples_storage), *total_pixels, options);
+    tree = &tree_storage;
+    tokens = &tokens_storage[0];
+
+    Tree decoded_tree;
+    TokenizeTree(*tree, &tree_tokens[0], &decoded_tree);
+    JXL_ASSERT(tree->size() == decoded_tree.size());
+    tree_storage = std::move(decoded_tree);  // Encode with TokenizeTree's tree.
+
+    if (kWantDebug && kPrintTree && WantDebugOutput(aux_out)) {
+      PrintTree(*tree, aux_out->debug_prefix + "/tree_" + ToString(group_id));
+    }
+    // Write tree
+    BuildAndEncodeHistograms(HistogramParams(), kNumTreeContexts, tree_tokens,
+                             &code, &context_map, writer, kLayerModularTree,
+                             aux_out);
+    WriteTokens(tree_tokens[0], code, context_map, writer, kLayerModularTree,
+                aux_out);
+  }
+
+  size_t image_width = 0;  // Widest channel among those that get encoded.
+  size_t total_tokens = 0;  // One token per pixel of every encoded channel.
+  for (size_t i = 0; i < nb_channels; i++) {
+    if (i >= image.nb_meta_channels &&
+        (image.channel[i].w > options.max_chan_size ||
+         image.channel[i].h > options.max_chan_size)) {
+      break;
+    }
+    if (image.channel[i].w > image_width) image_width = image.channel[i].w;
+    total_tokens += image.channel[i].w * image.channel[i].h;
+  }
+  if (options.zero_tokens) {
+    tokens->resize(tokens->size() + total_tokens, {0, 0});
+  } else {
+    // Do one big allocation for all the tokens we'll need,
+    // to avoid reallocs that might require copying.
+    size_t pos = tokens->size();
+    tokens->resize(pos + total_tokens);
+    Token *tokenp = tokens->data() + pos;
+    for (size_t i = 0; i < nb_channels; i++) {
+      if (!image.channel[i].w || !image.channel[i].h) {
+        continue;  // skip empty channels
+      }
+      if (i >= image.nb_meta_channels &&
+          (image.channel[i].w > options.max_chan_size ||
+           image.channel[i].h > options.max_chan_size)) {
+        break;
+      }
+      JXL_RETURN_IF_ERROR(EncodeModularChannelMAANS(
+          image, i, header->wp_header, *tree, &tokenp, aux_out, group_id,
+          options.skip_encoder_fast_path));
+    }
+    // Make sure we actually wrote all tokens
+    JXL_CHECK(tokenp == tokens->data() + tokens->size());
+  }
+
+  // Write data if not using a global tree/ANS stream.
+  if (!header->use_global_tree) {
+    EntropyEncodingData code;
+    std::vector<uint8_t> context_map;
+    HistogramParams histo_params;
+    histo_params.image_widths.push_back(image_width);
+    BuildAndEncodeHistograms(histo_params, (tree->size() + 1) / 2,
+                             tokens_storage, &code, &context_map, writer, layer,
+                             aux_out);
+    WriteTokens(tokens_storage[0], code, context_map, writer, layer, aux_out);
+  } else {
+    if (width != nullptr) *width = image_width;  // `width` is optional.
+  }
+  return true;
+}
+
+Status ModularGenericCompress(Image &image, const ModularOptions &opts,
+                              BitWriter *writer, AuxOut *aux_out, size_t layer,
+                              size_t group_id, TreeSamples *tree_samples,
+                              size_t *total_pixels, const Tree *tree,
+                              GroupHeader *header, std::vector<Token> *tokens,
+                              size_t *width) {
+  if (!image.w || !image.h) return true;  // Empty image: nothing to do.
+  ModularOptions options = opts;  // Local copy, so the predictor can be set.
+  // A predictor left at -1 is replaced by Gradient before encoding.
+  const bool predictor_unset = options.predictor == static_cast<Predictor>(-1);
+  if (predictor_unset) options.predictor = Predictor::Gradient;
+  // Remember the writer position to report the size of this encode call.
+  const size_t bits_before = (writer != nullptr) ? writer->BitsWritten() : 0;
+
+  JXL_RETURN_IF_ERROR(ModularEncode(image, options, writer, aux_out, layer,
+                                    group_id, tree_samples, total_pixels, tree,
+                                    header, tokens, width));
+  const size_t bits = writer ? writer->BitsWritten() - bits_before : 0;
+  if (writer != nullptr) {
+    JXL_DEBUG_V(4,
+                "Modular-encoded a %" PRIuS "x%" PRIuS
+                " bitdepth=%i nbchans=%" PRIuS " image in %" PRIuS " bytes",
+                image.w, image.h, image.bitdepth, image.channel.size(),
+                bits / 8);
+  }
+  (void)bits;
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
new file mode 100644
index 0000000000..04df504750
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
+               const ModularOptions &options,
+               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
+               StaticPropRange static_prop_range = {});
+
+// TODO(veluca): make cleaner interfaces.
+
+Status ModularGenericCompress(
+    Image &image, const ModularOptions &opts, BitWriter *writer,
+    AuxOut *aux_out = nullptr, size_t layer = 0, size_t group_id = 0,
+    // For gathering data for producing a global tree.
+    TreeSamples *tree_samples = nullptr, size_t *total_pixels = nullptr,
+    // For encoding with global tree; *width gets the widest encoded channel.
+    const Tree *tree = nullptr, GroupHeader *header = nullptr,
+    std::vector<Token> *tokens = nullptr, size_t *width = nullptr);
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc
new file mode 100644
index 0000000000..d0f6b47566
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc
@@ -0,0 +1,1023 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/enc_ma.h"
+
+#include <algorithm>
+#include <limits>
+#include <numeric>
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "lib/jxl/modular/encoding/ma_common.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/modular/encoding/enc_ma.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/options.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Lt;
+
+const HWY_FULL(float) df;
+const HWY_FULL(int32_t) di;
+size_t Padded(size_t x) { const size_t n = Lanes(df); return RoundUpTo(x, n); }
+
+float EstimateBits(const int32_t *counts, int32_t *rounded_counts,
+                   size_t num_symbols) {  // In-file callers pass Padded() sizes.
+  // Approximate rounding up nonzero probabilities: nonzero counts become >= min.
+  int32_t total = std::accumulate(counts, counts + num_symbols, 0);
+  const auto min = Set(di, (total + ANS_TAB_SIZE - 1) >> ANS_LOG_TAB_SIZE);
+  const auto zero_i = Zero(di);
+  for (size_t i = 0; i < num_symbols; i += Lanes(df)) {  // One vector per step.
+    auto counts_v = LoadU(di, &counts[i]);
+    counts_v = IfThenElse(Eq(counts_v, zero_i), zero_i,
+                          IfThenElse(Lt(counts_v, min), min, counts_v));
+    StoreU(counts_v, di, &rounded_counts[i]);  // Read back in the loop below.
+  }
+  // Compute entropy of the "rounded" probabilities: -sum(count * FastLog2f(p)).
+  const auto zero = Zero(df);
+  const size_t total_scalar =
+      std::accumulate(rounded_counts, rounded_counts + num_symbols, 0);
+  const auto inv_total = Set(df, 1.0f / total_scalar);
+  auto bits_lanes = Zero(df);
+  auto total_v = Set(di, total_scalar);  // A count equal to it costs 0 bits.
+  for (size_t i = 0; i < num_symbols; i += Lanes(df)) {
+    const auto counts_v = ConvertTo(df, LoadU(di, &counts[i]));
+    const auto round_counts_v = LoadU(di, &rounded_counts[i]);
+    const auto probs = Mul(ConvertTo(df, round_counts_v), inv_total);
+    const auto nbps = IfThenElse(Eq(round_counts_v, total_v), BitCast(di, zero),
+                                 BitCast(di, FastLog2f(df, probs)));
+    bits_lanes = Sub(bits_lanes, IfThenElse(Eq(counts_v, zero), zero,
+                                            Mul(counts_v, BitCast(df, nbps))));
+  }
+  return GetLane(SumOfLanes(df, bits_lanes));  // Total over all lanes.
+}
+
+void MakeSplitNode(size_t pos, int property, int splitval, Predictor lpred,
+                   int64_t loff, Predictor rpred, int64_t roff, Tree *tree) {
+  const auto append_leaf = [tree](Predictor pred, int64_t offset) {
+    tree->emplace_back();
+    auto &leaf = tree->back();
+    leaf.property = -1;
+    leaf.predictor = pred;
+    leaf.predictor_offset = offset;
+    leaf.multiplier = 1;
+  };
+  // Note that the tree splits on *strictly greater*.
+  (*tree)[pos].lchild = tree->size();
+  (*tree)[pos].rchild = tree->size() + 1;
+  (*tree)[pos].splitval = splitval;
+  (*tree)[pos].property = property;
+  append_leaf(rpred, roff);  // First new node, i.e. lchild.
+  append_leaf(lpred, loff);  // Second new node, i.e. rchild.
+}
+
+enum class IntersectionType { kNone, kPartial, kInside };
+IntersectionType BoxIntersects(StaticPropRange needle, StaticPropRange haystack,
+                               uint32_t &partial_axis, uint32_t &partial_val) {
+  // kInside means haystack covers needle on every axis; kNone means disjoint.
+  bool any_partial = false;
+  for (size_t i = 0; i < kNumStaticProperties; i++) {
+    const auto hay_lo = haystack[i][0], hay_hi = haystack[i][1];
+    const auto needle_lo = needle[i][0], needle_hi = needle[i][1];
+    if (hay_lo >= needle_hi || hay_hi <= needle_lo) {
+      return IntersectionType::kNone;
+    }
+    // Needle fully covered along this axis: nothing to record.
+    if (hay_lo <= needle_lo && hay_hi >= needle_hi) continue;
+    any_partial = true;
+    partial_axis = i;
+    // Report the value just below the haystack bound that cuts the needle.
+    if (hay_lo > needle_lo && hay_lo < needle_hi) {
+      partial_val = hay_lo - 1;
+    } else {
+      JXL_DASSERT(haystack[i][1] > needle[i][0] &&
+                  haystack[i][1] < needle[i][1]);
+      partial_val = hay_hi - 1;
+    }
+  }
+  return any_partial ? IntersectionType::kPartial : IntersectionType::kInside;
+}
+
+void SplitTreeSamples(TreeSamples &tree_samples, size_t begin, size_t pos,
+                      size_t end, size_t prop) {  // Reorders [begin, end).
+  auto cmp = [&](size_t a, size_t b) {  // <0, 0 or >0, by property `prop`.
+    return int32_t(tree_samples.Property(prop, a)) -
+           int32_t(tree_samples.Property(prop, b));
+  };
+  Rng rng(0);  // Constant constructor argument, identical on every call.
+  while (end > begin + 1) {
+    {
+      size_t pivot = rng.UniformU(begin, end);
+      tree_samples.Swap(begin, pivot);  // Bring the chosen pivot to the front.
+    }
+    size_t pivot_begin = begin;
+    size_t pivot_end = pivot_begin + 1;
+    for (size_t i = begin + 1; i < end; i++) {
+      JXL_DASSERT(i >= pivot_end);
+      JXL_DASSERT(pivot_end > pivot_begin);
+      int32_t cmp_result = cmp(i, pivot_begin);
+      if (cmp_result < 0) {  // i < pivot, move pivot forward and put i before
+                             // the pivot.
+        tree_samples.ThreeShuffle(pivot_begin, pivot_end, i);
+        pivot_begin++;
+        pivot_end++;
+      } else if (cmp_result == 0) {  // Equal to pivot: grow the equal range.
+        tree_samples.Swap(pivot_end, i);
+        pivot_end++;
+      }
+    }
+    JXL_DASSERT(pivot_begin >= begin);
+    JXL_DASSERT(pivot_end > pivot_begin);
+    JXL_DASSERT(pivot_end <= end);
+    for (size_t i = begin; i < pivot_begin; i++) {
+      JXL_DASSERT(cmp(i, pivot_begin) < 0);
+    }
+    for (size_t i = pivot_end; i < end; i++) {
+      JXL_DASSERT(cmp(i, pivot_begin) > 0);
+    }
+    for (size_t i = pivot_begin; i < pivot_end; i++) {
+      JXL_DASSERT(cmp(i, pivot_begin) == 0);
+    }
+    // We now have that [begin, pivot_begin) is < pivot, [pivot_begin,
+    // pivot_end) is = pivot, and [pivot_end, end) is > pivot.
+    // If pos falls in the first or the last interval, we continue in that
+    // interval; otherwise, we are done.
+    if (pivot_begin > pos) {
+      end = pivot_begin;
+    } else if (pivot_end < pos) {
+      begin = pivot_end;
+    } else {
+      break;
+    }
+  }
+}
+
+void FindBestSplit(TreeSamples &tree_samples, float threshold,
+                   const std::vector<ModularMultiplierInfo> &mul_info,
+                   StaticPropRange initial_static_prop_range,
+                   float fast_decode_multiplier, Tree *tree) {
+  struct NodeInfo {
+    size_t pos;
+    size_t begin;
+    size_t end;
+    uint64_t used_properties;
+    StaticPropRange static_prop_range;
+  };
+  std::vector<NodeInfo> nodes;
+  nodes.push_back(NodeInfo{0, 0, tree_samples.NumDistinctSamples(), 0,
+                           initial_static_prop_range});
+
+  size_t num_predictors = tree_samples.NumPredictors();
+  size_t num_properties = tree_samples.NumProperties();
+  // `nodes` is a LIFO work list: each entry is a tree node and its samples.
+  // TODO(veluca): consider parallelizing the search (processing multiple nodes
+  // at a time).
+  while (!nodes.empty()) {
+    size_t pos = nodes.back().pos;
+    size_t begin = nodes.back().begin;
+    size_t end = nodes.back().end;
+    uint64_t used_properties = nodes.back().used_properties;
+    StaticPropRange static_prop_range = nodes.back().static_prop_range;
+    nodes.pop_back();
+    if (begin == end) continue;
+
+    struct SplitInfo {
+      size_t prop = 0;
+      uint32_t val = 0;
+      size_t pos = 0;
+      float lcost = std::numeric_limits<float>::max();
+      float rcost = std::numeric_limits<float>::max();
+      Predictor lpred = Predictor::Zero;
+      Predictor rpred = Predictor::Zero;
+      float Cost() { return lcost + rcost; }
+    };
+
+    SplitInfo best_split_static_constant;
+    SplitInfo best_split_static;
+    SplitInfo best_split_nonstatic;
+    SplitInfo best_split_nowp;
+
+    JXL_DASSERT(begin <= end);
+    JXL_DASSERT(end <= tree_samples.NumDistinctSamples());
+
+    // Compute the symbol count (maximum token + 1) over all predictors.
+    size_t max_symbols = 0;
+    for (size_t pred = 0; pred < num_predictors; pred++) {
+      for (size_t i = begin; i < end; i++) {
+        uint32_t tok = tree_samples.Token(pred, i);
+        max_symbols = max_symbols > tok + 1 ? max_symbols : tok + 1;
+      }
+    }
+    max_symbols = Padded(max_symbols);
+    std::vector<int32_t> rounded_counts(max_symbols);
+    std::vector<int32_t> counts(max_symbols * num_predictors);
+    std::vector<uint32_t> tot_extra_bits(num_predictors);
+    for (size_t pred = 0; pred < num_predictors; pred++) {
+      for (size_t i = begin; i < end; i++) {
+        counts[pred * max_symbols + tree_samples.Token(pred, i)] +=
+            tree_samples.Count(i);
+        tot_extra_bits[pred] +=
+            tree_samples.NBits(pred, i) * tree_samples.Count(i);
+      }
+    }
+
+    float base_bits;  // Estimated cost of leaving this node unsplit.
+    {
+      size_t pred = tree_samples.PredictorIndex((*tree)[pos].predictor);
+      base_bits = EstimateBits(counts.data() + pred * max_symbols,
+                               rounded_counts.data(), max_symbols) +
+                  tot_extra_bits[pred];
+    }
+
+    SplitInfo *best = &best_split_nonstatic;
+
+    SplitInfo forced_split;
+    // The multiplier ranges cut halfway through the current ranges of static
+    // properties. We do this even if the current node is not a leaf, to
+    // minimize the number of nodes in the resulting tree.
+    for (size_t i = 0; i < mul_info.size(); i++) {
+      uint32_t axis, val;
+      IntersectionType t =
+          BoxIntersects(static_prop_range, mul_info[i].range, axis, val);
+      if (t == IntersectionType::kNone) continue;
+      if (t == IntersectionType::kInside) {
+        (*tree)[pos].multiplier = mul_info[i].multiplier;
+        break;
+      }
+      if (t == IntersectionType::kPartial) {
+        forced_split.val = tree_samples.QuantizeProperty(axis, val);
+        forced_split.prop = axis;
+        forced_split.lcost = forced_split.rcost = base_bits / 2 - threshold;
+        forced_split.lpred = forced_split.rpred = (*tree)[pos].predictor;
+        best = &forced_split;
+        best->pos = begin;
+        JXL_ASSERT(best->prop == tree_samples.PropertyFromIndex(best->prop));
+        for (size_t x = begin; x < end; x++) {
+          if (tree_samples.Property(best->prop, x) <= best->val) {
+            best->pos++;
+          }
+        }
+        break;
+      }
+    }
+
+    if (best != &forced_split) {
+      std::vector<int> prop_value_used_count;
+      std::vector<int> count_increase;
+      std::vector<size_t> extra_bits_increase;
+      // For each property, compute which of its values are used, and what
+      // tokens correspond to those usages. Then, iterate through the values,
+      // and compute the entropy of each side of the split (of the form `prop >
+      // threshold`). Finally, find the split that minimizes the cost.
+      struct CostInfo {
+        float cost = std::numeric_limits<float>::max();
+        float extra_cost = 0;
+        float Cost() const { return cost + extra_cost; }
+        Predictor pred;  // will be uninitialized in some cases, but never used.
+      };
+      std::vector<CostInfo> costs_l;
+      std::vector<CostInfo> costs_r;
+
+      std::vector<int32_t> counts_above(max_symbols);
+      std::vector<int32_t> counts_below(max_symbols);
+
+      // The lower the threshold, the higher the expected noisiness of the
+      // estimate. Thus, discourage changing predictors.
+      float change_pred_penalty = 800.0f / (100.0f + threshold);
+      for (size_t prop = 0; prop < num_properties && base_bits > threshold;
+           prop++) {
+        costs_l.clear();
+        costs_r.clear();
+        size_t prop_size = tree_samples.NumPropertyValues(prop);
+        if (extra_bits_increase.size() < prop_size) {
+          count_increase.resize(prop_size * max_symbols);
+          extra_bits_increase.resize(prop_size);
+        }
+        // Clear prop_value_used_count (which cannot be cleared "on the go")
+        prop_value_used_count.clear();
+        prop_value_used_count.resize(prop_size);
+
+        size_t first_used = prop_size;
+        size_t last_used = 0;
+
+        // TODO(veluca): consider finding multiple splits along a single
+        // property at the same time, possibly with a bottom-up approach.
+        for (size_t i = begin; i < end; i++) {
+          size_t p = tree_samples.Property(prop, i);
+          prop_value_used_count[p]++;
+          last_used = std::max(last_used, p);
+          first_used = std::min(first_used, p);
+        }
+        costs_l.resize(last_used - first_used);
+        costs_r.resize(last_used - first_used);
+        // For all predictors, compute the right and left costs of each split.
+        for (size_t pred = 0; pred < num_predictors; pred++) {
+          // Compute cost and histogram increments for each property value.
+          for (size_t i = begin; i < end; i++) {
+            size_t p = tree_samples.Property(prop, i);
+            size_t cnt = tree_samples.Count(i);
+            size_t sym = tree_samples.Token(pred, i);
+            count_increase[p * max_symbols + sym] += cnt;
+            extra_bits_increase[p] += tree_samples.NBits(pred, i) * cnt;
+          }
+          memcpy(counts_above.data(), counts.data() + pred * max_symbols,
+                 max_symbols * sizeof counts_above[0]);
+          memset(counts_below.data(), 0, max_symbols * sizeof counts_below[0]);
+          size_t extra_bits_below = 0;
+          // Exclude last used: this ensures neither counts_above nor
+          // counts_below is empty.
+          for (size_t i = first_used; i < last_used; i++) {
+            if (!prop_value_used_count[i]) continue;
+            extra_bits_below += extra_bits_increase[i];
+            // The increase for this property value has been used, and will not
+            // be used again: clear it. Also below.
+            extra_bits_increase[i] = 0;
+            for (size_t sym = 0; sym < max_symbols; sym++) {
+              counts_above[sym] -= count_increase[i * max_symbols + sym];
+              counts_below[sym] += count_increase[i * max_symbols + sym];
+              count_increase[i * max_symbols + sym] = 0;
+            }
+            float rcost = EstimateBits(counts_above.data(),
+                                       rounded_counts.data(), max_symbols) +
+                          tot_extra_bits[pred] - extra_bits_below;
+            float lcost = EstimateBits(counts_below.data(),
+                                       rounded_counts.data(), max_symbols) +
+                          extra_bits_below;
+            JXL_DASSERT(extra_bits_below <= tot_extra_bits[pred]);
+            float penalty = 0;
+            // Never discourage moving away from the Weighted predictor.
+            if (tree_samples.PredictorFromIndex(pred) !=
+                    (*tree)[pos].predictor &&
+                (*tree)[pos].predictor != Predictor::Weighted) {
+              penalty = change_pred_penalty;
+            }
+            // If everything else is equal, disfavour Weighted (slower) and
+            // favour Zero (faster if it's the only predictor used in a
+            // group+channel combination)
+            if (tree_samples.PredictorFromIndex(pred) == Predictor::Weighted) {
+              penalty += 1e-8;
+            }
+            if (tree_samples.PredictorFromIndex(pred) == Predictor::Zero) {
+              penalty -= 1e-8;
+            }
+            if (rcost + penalty < costs_r[i - first_used].Cost()) {
+              costs_r[i - first_used].cost = rcost;
+              costs_r[i - first_used].extra_cost = penalty;
+              costs_r[i - first_used].pred =
+                  tree_samples.PredictorFromIndex(pred);
+            }
+            if (lcost + penalty < costs_l[i - first_used].Cost()) {
+              costs_l[i - first_used].cost = lcost;
+              costs_l[i - first_used].extra_cost = penalty;
+              costs_l[i - first_used].pred =
+                  tree_samples.PredictorFromIndex(pred);
+            }
+          }
+        }
+        // Iterate through the possible splits and find the one with minimum sum
+        // of costs of the two sides.
+        size_t split = begin;
+        for (size_t i = first_used; i < last_used; i++) {
+          if (!prop_value_used_count[i]) continue;
+          split += prop_value_used_count[i];
+          float rcost = costs_r[i - first_used].cost;
+          float lcost = costs_l[i - first_used].cost;
+          // WP was not used + we would use the WP property or predictor
+          bool adds_wp =
+              (tree_samples.PropertyFromIndex(prop) == kWPProp &&
+               (used_properties & (uint64_t{1} << prop)) == 0) ||
+              ((costs_l[i - first_used].pred == Predictor::Weighted ||
+                costs_r[i - first_used].pred == Predictor::Weighted) &&
+               (*tree)[pos].predictor != Predictor::Weighted);
+          bool zero_entropy_side = rcost == 0 || lcost == 0;
+
+          SplitInfo &best =
+              prop < kNumStaticProperties
+                  ? (zero_entropy_side ? best_split_static_constant
+                                       : best_split_static)
+                  : (adds_wp ? best_split_nonstatic : best_split_nowp);
+          if (lcost + rcost < best.Cost()) {
+            best.prop = prop;
+            best.val = i;
+            best.pos = split;
+            best.lcost = lcost;
+            best.lpred = costs_l[i - first_used].pred;
+            best.rcost = rcost;
+            best.rpred = costs_r[i - first_used].pred;
+          }
+        }
+        // Clear extra_bits_increase and count_increase for last_used.
+        extra_bits_increase[last_used] = 0;
+        for (size_t sym = 0; sym < max_symbols; sym++) {
+          count_increase[last_used * max_symbols + sym] = 0;
+        }
+      }
+
+      // Try to avoid introducing WP.
+      if (best_split_nowp.Cost() + threshold < base_bits &&
+          best_split_nowp.Cost() <= fast_decode_multiplier * best->Cost()) {
+        best = &best_split_nowp;
+      }
+      // Split along static props if possible and not significantly more
+      // expensive.
+      if (best_split_static.Cost() + threshold < base_bits &&
+          best_split_static.Cost() <= fast_decode_multiplier * best->Cost()) {
+        best = &best_split_static;
+      }
+      // Split along static props to create constant nodes if possible.
+      if (best_split_static_constant.Cost() + threshold < base_bits) {
+        best = &best_split_static_constant;
+      }
+    }
+
+    if (best->Cost() + threshold < base_bits) {
+      uint32_t p = tree_samples.PropertyFromIndex(best->prop);
+      pixel_type dequant =
+          tree_samples.UnquantizeProperty(best->prop, best->val);
+      // Split node and try to split children.
+      MakeSplitNode(pos, p, dequant, best->lpred, 0, best->rpred, 0, tree);
+      // "Sort" according to winning property
+      SplitTreeSamples(tree_samples, begin, best->pos, end, best->prop);
+      if (p >= kNumStaticProperties) {
+        used_properties |= uint64_t{1} << best->prop;  // 64-bit mask bit.
+      }
+      auto new_sp_range = static_prop_range;
+      if (p < kNumStaticProperties) {
+        JXL_ASSERT(static_cast<uint32_t>(dequant + 1) <= new_sp_range[p][1]);
+        new_sp_range[p][1] = dequant + 1;
+        JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]);
+      }
+      nodes.push_back(NodeInfo{(*tree)[pos].rchild, begin, best->pos,
+                               used_properties, new_sp_range});
+      new_sp_range = static_prop_range;
+      if (p < kNumStaticProperties) {
+        JXL_ASSERT(new_sp_range[p][0] <= static_cast<uint32_t>(dequant + 1));
+        new_sp_range[p][0] = dequant + 1;
+        JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]);
+      }
+      nodes.push_back(NodeInfo{(*tree)[pos].lchild, best->pos, end,
+                               used_properties, new_sp_range});
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(FindBestSplit);  // Local function.
+
+void ComputeBestTree(TreeSamples &tree_samples, float threshold,
+                     const std::vector<ModularMultiplierInfo> &mul_info,
+                     StaticPropRange static_prop_range,
+                     float fast_decode_multiplier, Tree *tree) {
+  // TODO(veluca): take into account that different contexts can have different
+  // uint configs.
+  // Start from a single leaf using the predictor at index 0 of tree_samples.
+  tree->emplace_back();
+  auto &root = tree->back();
+  root.property = -1;
+  root.predictor = tree_samples.PredictorFromIndex(0);
+  root.predictor_offset = 0;
+  root.multiplier = 1;
+  JXL_ASSERT(tree_samples.NumProperties() < 64);
+
+  JXL_ASSERT(tree_samples.NumDistinctSamples() <=
+             std::numeric_limits<uint32_t>::max());
+  HWY_DYNAMIC_DISPATCH(FindBestSplit)
+  (tree_samples, threshold, mul_info, static_prop_range, fast_decode_multiplier,
+   tree);
+}
+
+constexpr int32_t TreeSamples::kPropertyRange;
+constexpr uint32_t TreeSamples::kDedupEntryUnused;
+
+Status TreeSamples::SetPredictor(Predictor predictor,
+                                 ModularOptions::TreeMode wp_tree_mode) {
+  if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) {
+    predictors = {Predictor::Weighted};  // `predictor` is ignored in this mode.
+    residuals.resize(1);
+    return true;
+  }
+  if (wp_tree_mode == ModularOptions::TreeMode::kNoWP &&
+      predictor == Predictor::Weighted) {
+    return JXL_FAILURE("Invalid predictor settings");
+  }
+  if (predictor == Predictor::Variable) {
+    // Reset first so a repeated call cannot append every predictor again.
+    predictors.clear();
+    for (size_t i = 0; i < kNumModularPredictors; i++) {
+      predictors.push_back(static_cast<Predictor>(i));
+    }
+    std::swap(predictors[0], predictors[static_cast<int>(Predictor::Weighted)]);
+    std::swap(predictors[1], predictors[static_cast<int>(Predictor::Gradient)]);
+  } else if (predictor == Predictor::Best) {
+    predictors = {Predictor::Weighted, Predictor::Gradient};
+  } else {
+    predictors = {predictor};
+  }
+  if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) {
+    auto wp_it =
+        std::find(predictors.begin(), predictors.end(), Predictor::Weighted);
+    if (wp_it != predictors.end()) predictors.erase(wp_it);
+  }
+  residuals.resize(predictors.size());  // One residual column per predictor.
+  return true;
+}
+
+Status TreeSamples::SetProperties(const std::vector<uint32_t> &properties,
+                                  ModularOptions::TreeMode wp_tree_mode) {
+  // Start from the caller's list, then let the tree mode override or trim it.
+  props_to_use = properties;
+  if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) {
+    props_to_use = {static_cast<uint32_t>(kWPProp)};
+  } else if (wp_tree_mode == ModularOptions::TreeMode::kGradientOnly) {
+    props_to_use = {static_cast<uint32_t>(kGradientProp)};
+  } else if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) {
+    // Only the first occurrence of kWPProp is removed.
+    const auto wp_pos =
+        std::find(props_to_use.begin(), props_to_use.end(), kWPProp);
+    if (wp_pos != props_to_use.end()) props_to_use.erase(wp_pos);
+  }
+  // Reject an empty property list.
+  if (props_to_use.empty()) {
+    return JXL_FAILURE("Invalid property set configuration");
+  }
+  props.resize(props_to_use.size());
+  return true;
+}
+
+void TreeSamples::InitTable(size_t size) {
+  JXL_DASSERT((size & (size - 1)) == 0);  // Hash1/Hash2 mask with size - 1.
+  if (dedup_table_.size() == size) return;
+  // Slot positions depend on the table size, so wipe every slot before
+  // re-inserting: resize() would keep entries placed for the old size.
+  dedup_table_.assign(size, kDedupEntryUnused);
+  for (size_t i = 0; i < NumDistinctSamples(); i++) {
+    if (sample_counts[i] != std::numeric_limits<uint16_t>::max()) AddToTable(i);
+  }
+}
+
+// Looks for a sample identical to `a` in the two slots that `a` hashes to.
+// On a hit the existing sample's count is incremented and true is returned;
+// AddSample then pops `a` again. Otherwise `a` is offered to the table via
+// AddToTable and false is returned.
+bool TreeSamples::AddToTableAndMerge(size_t a) {
+  // Both hashes are computed up front; the Hash1 slot is probed first.
+  const size_t candidates[2] = {Hash1(a), Hash2(a)};
+  for (const size_t slot : candidates) {
+    const uint32_t existing = dedup_table_[slot];
+    // Skip empty slots and slots holding a different sample.
+    if (existing == kDedupEntryUnused || !IsSameSample(a, existing)) {
+      continue;
+    }
+
+    // `a` is expected to be freshly added, i.e. still counted once.
+    JXL_DASSERT(sample_counts[a] == 1);
+    sample_counts[existing]++;
+    // Remove from hash table samples that are saturated.
+    if (sample_counts[existing] == std::numeric_limits<uint16_t>::max()) {
+      dedup_table_[slot] = kDedupEntryUnused;
+    }
+    return true;
+  }
+
+  // No identical sample was found in either slot.
+  AddToTable(a);
+  return false;
+}
+
+// Stores `a` in its first free candidate slot; no-op when both are taken.
+void TreeSamples::AddToTable(size_t a) {
+  const size_t candidates[2] = {Hash1(a), Hash2(a)};
+  for (const size_t slot : candidates) {
+    if (dedup_table_[slot] != kDedupEntryUnused) continue;
+    dedup_table_[slot] = a;
+    return;
+  }
+}
+
+void TreeSamples::PrepareForSamples(size_t num_samples) {
+  // Reserve room for `num_samples` more entries in every per-sample column.
+  for (auto &res : residuals) res.reserve(res.size() + num_samples);
+  for (auto &p : props) p.reserve(p.size() + num_samples);
+  const size_t total_num_samples = num_samples + sample_counts.size();
+  // The dedup table gets a power-of-two size of at least 1.5x the total
+  // sample count. CeilLog2Nonzero must not be given 0 (as its name says), so
+  // the argument is clamped to 1 when no samples exist or are expected.
+  const size_t wanted = std::max<size_t>(1, total_num_samples * 3 / 2);
+  InitTable(1LLU << CeilLog2Nonzero(wanted));
+}
+
+// First dedup-table slot for sample `a`: a multiply-add hash over its residual
+// tokens, then its properties, reduced with the (power-of-two) table mask.
+size_t TreeSamples::Hash1(size_t a) const {
+  constexpr uint64_t kMul = 0x1e35a7bd;
+  uint64_t acc = kMul;
+  for (const auto &column : residuals) {
+    const ResidualToken &res = column[a];
+    acc = (acc * kMul + res.tok) * kMul + res.nbits;
+  }
+  for (const auto &column : props) acc = acc * kMul + column[a];
+  return (acc >> 16) & (dedup_table_.size() - 1);
+}
+// Second dedup-table slot for sample `a`: a multiply-xor hash that visits the
+// properties first and the residual tokens second, with a 64-bit multiplier.
+size_t TreeSamples::Hash2(size_t a) const {
+  constexpr uint64_t kMul = 0x1e35a7bd1e35a7bd;
+  uint64_t acc = kMul;
+  for (const auto &column : props) acc = (acc * kMul) ^ column[a];
+  for (const auto &column : residuals) {
+    const ResidualToken &res = column[a];
+    acc = (((acc * kMul) ^ res.tok) * kMul) ^ res.nbits;
+  }
+  return (acc >> 16) & (dedup_table_.size() - 1);
+}
+
+// Returns true iff samples `a` and `b` agree on every residual token, every
+// residual bit count and every quantized property value.
+bool TreeSamples::IsSameSample(size_t a, size_t b) const {
+  for (const auto &column : residuals) {
+    const ResidualToken &lhs = column[a];
+    const ResidualToken &rhs = column[b];
+    if (lhs.tok != rhs.tok || lhs.nbits != rhs.nbits) {
+      return false;
+    }
+  }
+  for (const auto &column : props) {
+    if (column[a] != column[b]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void TreeSamples::AddSample(pixel_type_w pixel, const Properties &properties,
+                            const pixel_type_w *predictions) {
+  for (size_t pi = 0; pi < predictors.size(); pi++) {  // One token per predictor.
+    const pixel_type_w guess = predictions[static_cast<int>(predictors[pi])];
+    const pixel_type diff = pixel - guess;
+    uint32_t tok, nbits, bits;
+    HybridUintConfig(4, 1, 2).Encode(PackSigned(diff), &tok, &nbits, &bits);
+    JXL_DASSERT(tok < 256);
+    JXL_DASSERT(nbits < 256);
+    residuals[pi].emplace_back(
+        ResidualToken{static_cast<uint8_t>(tok), static_cast<uint8_t>(nbits)});
+  }
+  for (size_t k = 0; k < props_to_use.size(); k++) {
+    props[k].push_back(QuantizeProperty(k, properties[props_to_use[k]]));
+  }
+  sample_counts.push_back(1);
+  num_samples++;  // Counts merged duplicates too.
+  if (!AddToTableAndMerge(sample_counts.size() - 1)) return;  // New sample.
+  for (auto &column : residuals) column.pop_back();  // Merged: undo the pushes.
+  for (auto &column : props) column.pop_back();
+  sample_counts.pop_back();
+}
+
+// Exchanges samples `a` and `b` in every per-sample column (counts,
+// properties and residuals); a no-op when both indices are equal.
+void TreeSamples::Swap(size_t a, size_t b) {
+  if (a != b) {
+    std::swap(sample_counts[a], sample_counts[b]);
+    for (auto &column : props) std::swap(column[a], column[b]);
+    for (auto &column : residuals) std::swap(column[a], column[b]);
+  }
+}
+
+namespace {
+// Rotates three entries of `column`: slot a takes c's value, c takes b's and
+// b takes a's, with the same assignment order for every column type.
+template <typename Column>
+void RotateThree(Column &column, size_t a, size_t b, size_t c) {
+  const auto held = column[a];
+  column[a] = column[c];
+  column[c] = column[b];
+  column[b] = held;
+}
+}  // namespace
+
+void TreeSamples::ThreeShuffle(size_t a, size_t b, size_t c) {
+  if (b == c) return Swap(a, b);
+  // Apply the same rotation to every per-sample column.
+  for (auto &r : residuals) RotateThree(r, a, b, c);
+  for (auto &p : props) RotateThree(p, a, b, c);
+  RotateThree(sample_counts, a, b, c);
+}
+
+namespace {
+// Returns bucket indices that split the histogram's total count into about
+// `num_chunks` parts; empty for an empty histogram or `num_chunks` == 0.
+std::vector<int32_t> QuantizeHistogram(const std::vector<uint32_t> &histogram,
+                                       size_t num_chunks) {
+  if (histogram.empty() || num_chunks == 0) return {};  // Avoids dividing by 0.
+  // TODO(veluca): selecting distinct quantiles is likely not the best
+  // way to go about this.
+  std::vector<int32_t> thresholds;
+  size_t sum = std::accumulate(histogram.begin(), histogram.end(), size_t{0});
+  size_t cumsum = 0;
+  size_t threshold = 0;
+  for (size_t i = 0; i + 1 < histogram.size(); i++) {
+    cumsum += histogram[i];
+    if (cumsum > (threshold + 1) * sum / num_chunks) {
+      thresholds.push_back(i);
+      while (cumsum >= (threshold + 1) * sum / num_chunks) threshold++;
+    }
+  }
+  return thresholds;
+}
+
+// Clamps `samples` to [-512, 512], then maps QuantizeHistogram's buckets back.
+std::vector<int32_t> QuantizeSamples(const std::vector<int32_t> &samples,
+                                     size_t num_chunks) {
+  if (samples.empty()) return {};
+  constexpr int kRange = 512;
+  int lo = *std::min_element(samples.begin(), samples.end());
+  lo = std::min(std::max(lo, -kRange), kRange);
+  std::vector<uint32_t> counts(2 * kRange + 1);  // Bucket 0 holds value `lo`.
+  for (int s : samples) counts[std::min(std::max(s, -kRange), kRange) - lo]++;
+  std::vector<int32_t> thresholds = QuantizeHistogram(counts, num_chunks);
+  for (auto &v : thresholds) v += lo;
+  return thresholds;
+}
+}  // namespace
+
+void TreeSamples::PreQuantizeProperties(
+    const StaticPropRange &range,
+    const std::vector<ModularMultiplierInfo> &multiplier_info,
+    const std::vector<uint32_t> &group_pixel_count,
+    const std::vector<uint32_t> &channel_pixel_count,
+    std::vector<pixel_type> &pixel_samples,  // May be overwritten with abs().
+    std::vector<pixel_type> &diff_samples, size_t max_property_values) {
+  // If we have forced splits because of multipliers, choose channel and group
+  // thresholds accordingly.
+  std::vector<int32_t> group_multiplier_thresholds;
+  std::vector<int32_t> channel_multiplier_thresholds;
+  for (const auto &v : multiplier_info) {
+    if (v.range[0][0] != range[0][0]) {  // range[0] feeds channel thresholds.
+      channel_multiplier_thresholds.push_back(v.range[0][0] - 1);
+    }
+    if (v.range[0][1] != range[0][1]) {
+      channel_multiplier_thresholds.push_back(v.range[0][1] - 1);
+    }
+    if (v.range[1][0] != range[1][0]) {  // range[1] feeds group thresholds.
+      group_multiplier_thresholds.push_back(v.range[1][0] - 1);
+    }
+    if (v.range[1][1] != range[1][1]) {
+      group_multiplier_thresholds.push_back(v.range[1][1] - 1);
+    }
+  }
+  std::sort(channel_multiplier_thresholds.begin(),
+            channel_multiplier_thresholds.end());
+  channel_multiplier_thresholds.resize(  // Drop duplicates after sorting.
+      std::unique(channel_multiplier_thresholds.begin(),
+                  channel_multiplier_thresholds.end()) -
+      channel_multiplier_thresholds.begin());
+  std::sort(group_multiplier_thresholds.begin(),
+            group_multiplier_thresholds.end());
+  group_multiplier_thresholds.resize(  // Drop duplicates after sorting.
+      std::unique(group_multiplier_thresholds.begin(),
+                  group_multiplier_thresholds.end()) -
+      group_multiplier_thresholds.begin());
+
+  compact_properties.resize(props_to_use.size());  // One list per property.
+  auto quantize_channel = [&]() {  // Multiplier thresholds win if present.
+    if (!channel_multiplier_thresholds.empty()) {
+      return channel_multiplier_thresholds;
+    }
+    return QuantizeHistogram(channel_pixel_count, max_property_values);
+  };
+  auto quantize_group_id = [&]() {  // Same policy as quantize_channel.
+    if (!group_multiplier_thresholds.empty()) {
+      return group_multiplier_thresholds;
+    }
+    return QuantizeHistogram(group_pixel_count, max_property_values);
+  };
+  auto quantize_coordinate = [&]() {  // Evenly spaced steps below 256.
+    std::vector<int32_t> quantized;
+    quantized.reserve(max_property_values - 1);
+    for (size_t i = 0; i + 1 < max_property_values; i++) {
+      quantized.push_back((i + 1) * 256 / max_property_values - 1);
+    }
+    return quantized;
+  };
+  std::vector<int32_t> abs_pixel_thr;
+  std::vector<int32_t> pixel_thr;
+  auto quantize_pixel_property = [&]() {  // Cached in pixel_thr once non-empty.
+    if (pixel_thr.empty()) {
+      pixel_thr = QuantizeSamples(pixel_samples, max_property_values);
+    }
+    return pixel_thr;
+  };
+  auto quantize_abs_pixel_property = [&]() {
+    if (abs_pixel_thr.empty()) {
+      quantize_pixel_property();  // Compute the non-abs thresholds.
+      for (auto &v : pixel_samples) v = std::abs(v);  // In place, hence above.
+      abs_pixel_thr = QuantizeSamples(pixel_samples, max_property_values);
+    }
+    return abs_pixel_thr;
+  };
+  std::vector<int32_t> abs_diff_thr;
+  std::vector<int32_t> diff_thr;
+  auto quantize_diff_property = [&]() {  // Cached in diff_thr once non-empty.
+    if (diff_thr.empty()) {
+      diff_thr = QuantizeSamples(diff_samples, max_property_values);
+    }
+    return diff_thr;
+  };
+  auto quantize_abs_diff_property = [&]() {
+    if (abs_diff_thr.empty()) {
+      quantize_diff_property();  // Compute the non-abs thresholds.
+      for (auto &v : diff_samples) v = std::abs(v);  // In place, hence above.
+      abs_diff_thr = QuantizeSamples(diff_samples, max_property_values);
+    }
+    return abs_diff_thr;
+  };
+  auto quantize_wp = [&]() {  // Fixed tables, finer as max_property_values grows.
+    if (max_property_values < 32) {
+      return std::vector<int32_t>{-127, -63, -31, -15, -7, -3, -1, 0,
+                                  1,    3,   7,   15,  31, 63, 127};
+    }
+    if (max_property_values < 64) {
+      return std::vector<int32_t>{-255, -191, -127, -95, -63, -47, -31, -23,
+                                  -15,  -11,  -7,   -5,  -3,  -1,  0,   1,
+                                  3,    5,    7,    11,  15,  23,  31,  47,
+                                  63,   95,   127,  191, 255};
+    }
+    return std::vector<int32_t>{
+        -255, -223, -191, -159, -127, -111, -95, -79, -63, -55, -47,
+        -39,  -31,  -27,  -23,  -19,  -15,  -13, -11, -9,  -7,  -6,
+        -5,   -4,   -3,   -2,   -1,   0,    1,   2,   3,   4,   5,
+        6,    7,    9,    11,   13,   15,   19,  23,  27,  31,  39,
+        47,   55,   63,   79,   95,   111,  127, 159, 191, 223, 255};
+  };
+
+  property_mapping.resize(props_to_use.size());
+  for (size_t i = 0; i < props_to_use.size(); i++) {
+    if (props_to_use[i] == 0) {  // Pick the threshold source by property id.
+      compact_properties[i] = quantize_channel();
+    } else if (props_to_use[i] == 1) {
+      compact_properties[i] = quantize_group_id();
+    } else if (props_to_use[i] == 2 || props_to_use[i] == 3) {
+      compact_properties[i] = quantize_coordinate();
+    } else if (props_to_use[i] == 6 || props_to_use[i] == 7 ||
+               props_to_use[i] == 8 ||
+               (props_to_use[i] >= kNumNonrefProperties &&
+                (props_to_use[i] - kNumNonrefProperties) % 4 == 1)) {
+      compact_properties[i] = quantize_pixel_property();
+    } else if (props_to_use[i] == 4 || props_to_use[i] == 5 ||
+               (props_to_use[i] >= kNumNonrefProperties &&
+                (props_to_use[i] - kNumNonrefProperties) % 4 == 0)) {
+      compact_properties[i] = quantize_abs_pixel_property();
+    } else if (props_to_use[i] >= kNumNonrefProperties &&
+               (props_to_use[i] - kNumNonrefProperties) % 4 == 2) {
+      compact_properties[i] = quantize_abs_diff_property();
+    } else if (props_to_use[i] == kWPProp) {
+      compact_properties[i] = quantize_wp();
+    } else {  // Every remaining property uses the signed-difference thresholds.
+      compact_properties[i] = quantize_diff_property();
+    }
+    property_mapping[i].resize(kPropertyRange * 2 + 1);
+    size_t mapped = 0;
+    for (size_t j = 0; j < property_mapping[i].size(); j++) {
+      while (mapped < compact_properties[i].size() &&
+             static_cast<int>(j) - kPropertyRange >
+                 compact_properties[i][mapped]) {
+        mapped++;
+      }
+      // property_mapping[i] of a value V = j - kPropertyRange is `mapped` if
+      // compact_properties[i][mapped] >= V and
+      // compact_properties[i][mapped-1] < V (thresholds assumed ascending).
+      // This is because the decision node in the tree splits on (property) > T,
+      // hence everything that is not > of a threshold should be clustered
+      // together.
+      property_mapping[i][j] = mapped;
+    }
+  }
+}
+
+void CollectPixelSamples(const Image &image, const ModularOptions &options,
+                         size_t group_id,
+                         std::vector<uint32_t> &group_pixel_count,
+                         std::vector<uint32_t> &channel_pixel_count,
+                         std::vector<pixel_type> &pixel_samples,
+                         std::vector<pixel_type> &diff_samples) {
+  if (options.nb_repeats == 0) return;  // Nothing is sampled or counted.
+  if (group_pixel_count.size() <= group_id) {  // Grow the counters on demand.
+    group_pixel_count.resize(group_id + 1);
+  }
+  if (channel_pixel_count.size() < image.channel.size()) {
+    channel_pixel_count.resize(image.channel.size());
+  }
+  Rng rng(group_id);  // NOTE(review): group_id presumably acts as the seed.
+  // Sample 10% of the final number of samples for property quantization.
+  float fraction = std::min(options.nb_repeats * 0.1, 0.99);
+  Rng::GeometricDistribution dist(fraction);
+  size_t total_pixels = 0;
+  std::vector<size_t> channel_ids;  // Channels that take part in sampling.
+  for (size_t i = 0; i < image.channel.size(); i++) {
+    if (image.channel[i].w <= 1 || image.channel[i].h == 0) {
+      continue;  // skip empty or width-1 channels.
+    }
+    if (i >= image.nb_meta_channels &&
+        (image.channel[i].w > options.max_chan_size ||
+         image.channel[i].h > options.max_chan_size)) {
+      break;  // Later channels are not sampled either.
+    }
+    channel_ids.push_back(i);
+    group_pixel_count[group_id] += image.channel[i].w * image.channel[i].h;
+    channel_pixel_count[i] += image.channel[i].w * image.channel[i].h;
+    total_pixels += image.channel[i].w * image.channel[i].h;
+  }
+  if (channel_ids.empty()) return;
+  pixel_samples.reserve(pixel_samples.size() + fraction * total_pixels);
+  diff_samples.reserve(diff_samples.size() + fraction * total_pixels);
+  size_t i = 0;  // Index into channel_ids.
+  size_t y = 0;
+  size_t x = 0;
+  auto advance = [&](size_t amount) {  // Moves (i, y, x) on by `amount` pixels.
+    x += amount;
+    // Detect row overflow (rare).
+    while (x >= image.channel[channel_ids[i]].w) {
+      x -= image.channel[channel_ids[i]].w;
+      y++;
+      // Detect end-of-channel (even rarer).
+      if (y == image.channel[channel_ids[i]].h) {
+        i++;
+        y = 0;
+        if (i >= channel_ids.size()) {
+          return;  // Past the last channel; the loop below then stops.
+        }
+      }
+    }
+  };
+  advance(rng.Geometric(dist));  // Random offset of the first sample.
+  for (; i < channel_ids.size(); advance(rng.Geometric(dist) + 1)) {
+    const pixel_type *row = image.channel[channel_ids[i]].Row(y);
+    pixel_samples.push_back(row[x]);
+    size_t xp = x == 0 ? 1 : x - 1;  // Left neighbour; right one when x == 0.
+    diff_samples.push_back((int64_t)row[x] - row[xp]);
+  }
+}
+
+// TODO(veluca): very simple encoding scheme. This should be improved.
+void TokenizeTree(const Tree &tree, std::vector<Token> *tokens,
+                  Tree *decoder_tree) {
+  JXL_ASSERT(tree.size() <= kMaxTreeSize);
+  std::queue<int> q;  // Breadth-first traversal, starting at node 0.
+  q.push(0);
+  size_t leaf_id = 0;  // Leaves are numbered in visiting order.
+  decoder_tree->clear();
+  while (!q.empty()) {
+    int cur = q.front();
+    q.pop();
+    JXL_ASSERT(tree[cur].property >= -1);
+    tokens->emplace_back(kPropertyContext, tree[cur].property + 1);
+    if (tree[cur].property == -1) {  // Leaf: emit predictor, offset, multiplier.
+      tokens->emplace_back(kPredictorContext,
+                           static_cast<int>(tree[cur].predictor));
+      tokens->emplace_back(kOffsetContext,
+                           PackSigned(tree[cur].predictor_offset));
+      uint32_t mul_log = Num0BitsBelowLS1Bit_Nonzero(tree[cur].multiplier);
+      uint32_t mul_bits = (tree[cur].multiplier >> mul_log) - 1;
+      tokens->emplace_back(kMultiplierLogContext, mul_log);
+      tokens->emplace_back(kMultiplierBitsContext, mul_bits);
+      JXL_ASSERT(tree[cur].predictor < Predictor::Best);
+      decoder_tree->emplace_back(-1, 0, leaf_id++, 0, tree[cur].predictor,
+                                 tree[cur].predictor_offset,
+                                 tree[cur].multiplier);
+      continue;
+    }
+    // Child indices: nodes emitted so far + nodes still queued + this node.
+    decoder_tree->emplace_back(tree[cur].property, tree[cur].splitval,
+                               decoder_tree->size() + q.size() + 1,
+                               decoder_tree->size() + q.size() + 2,
+                               Predictor::Zero, 0, 1);
+    q.push(tree[cur].lchild);  // Left child first, matching the +1 index above.
+    q.push(tree[cur].rchild);
+    tokens->emplace_back(kSplitValContext, PackSigned(tree[cur].splitval));
+  }
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h
new file mode 100644
index 0000000000..ede37c8023
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_MA_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_MA_H_
+
+#include <numeric>
+
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// Struct to collect all the data needed to build a tree.
+struct TreeSamples {
+  bool HasSamples() const {  // True once the first predictor has a sample.
+    return !residuals.empty() && !residuals[0].empty();
+  }
+  size_t NumDistinctSamples() const { return sample_counts.size(); }
+  size_t NumSamples() const { return num_samples; }  // Includes merged ones.
+  // Set the predictor to use. Must be called before adding any samples.
+  Status SetPredictor(Predictor predictor,
+                      ModularOptions::TreeMode wp_tree_mode);
+  // Set the properties to use. Must be called before adding any samples.
+  Status SetProperties(const std::vector<uint32_t> &properties,
+                       ModularOptions::TreeMode wp_tree_mode);
+
+  size_t Token(size_t pred, size_t i) const { return residuals[pred][i].tok; }
+  size_t NBits(size_t pred, size_t i) const { return residuals[pred][i].nbits; }
+  size_t Count(size_t i) const { return sample_counts[i]; }
+  size_t PredictorIndex(Predictor predictor) const {  // Position in predictors.
+    const auto predictor_elem =
+        std::find(predictors.begin(), predictors.end(), predictor);
+    JXL_DASSERT(predictor_elem != predictors.end());
+    return predictor_elem - predictors.begin();
+  }
+  size_t PropertyIndex(size_t property) const {  // Position in props_to_use.
+    const auto property_elem =
+        std::find(props_to_use.begin(), props_to_use.end(), property);
+    JXL_DASSERT(property_elem != props_to_use.end());
+    return property_elem - props_to_use.begin();
+  }
+  size_t NumPropertyValues(size_t property_index) const {
+    return compact_properties[property_index].size() + 1;  // Thresholds + 1.
+  }
+  // Returns the *quantized* property value.
+  size_t Property(size_t property_index, size_t i) const {
+    return props[property_index][i];
+  }
+  int UnquantizeProperty(size_t property_index, uint32_t quant) const {
+    JXL_ASSERT(quant < compact_properties[property_index].size());
+    return compact_properties[property_index][quant];  // Threshold `quant`.
+  }
+
+  Predictor PredictorFromIndex(size_t index) const {
+    JXL_DASSERT(index < predictors.size());
+    return predictors[index];
+  }
+  size_t PropertyFromIndex(size_t index) const {
+    JXL_DASSERT(index < props_to_use.size());
+    return props_to_use[index];
+  }
+  size_t NumPredictors() const { return predictors.size(); }
+  size_t NumProperties() const { return props_to_use.size(); }
+
+  // Preallocate data for a given number of samples. MUST be called before
+  // adding any sample.
+  void PrepareForSamples(size_t num_samples);
+  // Add a sample.
+  void AddSample(pixel_type_w pixel, const Properties &properties,
+                 const pixel_type_w *predictions);
+  // Pre-cluster property values.
+  void PreQuantizeProperties(
+      const StaticPropRange &range,
+      const std::vector<ModularMultiplierInfo> &multiplier_info,
+      const std::vector<uint32_t> &group_pixel_count,
+      const std::vector<uint32_t> &channel_pixel_count,
+      std::vector<pixel_type> &pixel_samples,
+      std::vector<pixel_type> &diff_samples, size_t max_property_values);
+
+  void AllSamplesDone() { dedup_table_ = std::vector<uint32_t>(); }  // Frees it.
+
+  uint32_t QuantizeProperty(uint32_t prop, pixel_type v) const {
+    // Clamp to [-kPropertyRange, kPropertyRange], then shift to a table index.
+    v = std::min(std::max(v, -kPropertyRange), kPropertyRange) + kPropertyRange;
+    return property_mapping[prop][v];
+  }
+
+  // Swaps samples in position a and b. Does nothing if a == b.
+  void Swap(size_t a, size_t b);
+
+  // Cycles samples: a -> b -> c -> a. We assume a <= b <= c, so that we can
+  // just call Swap(a, b) if b==c.
+  void ThreeShuffle(size_t a, size_t b, size_t c);
+
+ private:
+  // TODO(veluca): as the total number of properties and predictors are known
+  // before adding any samples, it might be better to interleave predictors,
+  // properties and counts in a single vector to improve locality.
+  // A first attempt at doing this actually results in much slower encoding,
+  // possibly because of the more complex addressing.
+  struct ResidualToken {
+    uint8_t tok;
+    uint8_t nbits;
+  };
+  // Residual information: token and number of extra bits, per predictor.
+  std::vector<std::vector<ResidualToken>> residuals;
+  // Number of occurrences of each sample.
+  std::vector<uint16_t> sample_counts;
+  // Property values, quantized to at most 256 distinct values.
+  std::vector<std::vector<uint8_t>> props;
+  // Decompactification info for `props`.
+  std::vector<std::vector<int32_t>> compact_properties;
+  // List of properties to use.
+  std::vector<uint32_t> props_to_use;
+  // List of predictors to use.
+  std::vector<Predictor> predictors;
+  // Mapping property value -> quantized property value.
+  static constexpr int32_t kPropertyRange = 511;  // Clamp bound for values.
+  std::vector<std::vector<uint8_t>> property_mapping;
+  // Number of samples seen.
+  size_t num_samples = 0;
+  // Table for deduplication.
+  static constexpr uint32_t kDedupEntryUnused{static_cast<uint32_t>(-1)};
+  std::vector<uint32_t> dedup_table_;  // Holds sample indices or the marker.
+
+  // Functions for sample deduplication.
+  bool IsSameSample(size_t a, size_t b) const;
+  size_t Hash1(size_t a) const;
+  size_t Hash2(size_t a) const;
+  void InitTable(size_t size);  // `size` must be a power of two.
+  // Returns true if `a` was already present in the table.
+  bool AddToTableAndMerge(size_t a);
+  void AddToTable(size_t a);
+};
+
+void TokenizeTree(const Tree &tree, std::vector<Token> *tokens,
+                  Tree *decoder_tree);
+
+void CollectPixelSamples(const Image &image, const ModularOptions &options,
+                         size_t group_id,
+                         std::vector<uint32_t> &group_pixel_count,
+                         std::vector<uint32_t> &channel_pixel_count,
+                         std::vector<pixel_type> &pixel_samples,
+                         std::vector<pixel_type> &diff_samples);
+
+void ComputeBestTree(TreeSamples &tree_samples, float threshold,
+                     const std::vector<ModularMultiplierInfo> &mul_info,
+                     StaticPropRange static_prop_range,
+                     float fast_decode_multiplier, Tree *tree);
+
+}  // namespace jxl
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_MA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc
new file mode 100644
index 0000000000..9d2c3e5cf9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc
@@ -0,0 +1,622 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/encoding.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <queue>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/scope_guard.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// Specializes `global_tree` for the given `static_props`: every node testing a
+// static property (i.e. channel or group ID) is resolved and removed, and each
+// remaining node is collapsed with its two children to produce a flatter tree.
+// Also computes the property count and whether WP / only gradient is needed.
+FlatTree FilterTree(const Tree &global_tree,
+                    std::array<pixel_type, kNumStaticProperties> &static_props,
+                    size_t *num_props, bool *use_wp, bool *wp_only,
+                    bool *gradient_only) {
+  *num_props = 0;
+  bool has_wp = false;      // WP property or Weighted predictor seen.
+  bool has_non_wp = false;  // Any other non-static property/predictor seen.
+  *gradient_only = true;
+  const auto mark_property = [&](int32_t p) {
+    if (p == kWPProp) {
+      has_wp = true;
+    } else if (p >= kNumStaticProperties) {
+      has_non_wp = true;
+    }
+    if (p >= kNumStaticProperties && p != kGradientProp) {
+      *gradient_only = false;
+    }
+  };
+  FlatTree output;
+  std::queue<size_t> nodes;
+  nodes.push(0);
+  // Produces a trimmed and flattened tree by doing a BFS visit of the original
+  // tree, ignoring branches that are known to be false and proceeding two
+  // levels at a time to collapse nodes in a flatter tree; if an inner parent
+  // node has a leaf as a child, the leaf is duplicated and an implicit fake
+  // node is added. This allows to reduce the number of branches when traversing
+  // the resulting flat tree.
+  while (!nodes.empty()) {
+    size_t cur = nodes.front();
+    nodes.pop();
+    // Skip nodes that we can decide now, by jumping directly to their children.
+    while (global_tree[cur].property < kNumStaticProperties &&
+           global_tree[cur].property != -1) {
+      if (static_props[global_tree[cur].property] > global_tree[cur].splitval) {
+        cur = global_tree[cur].lchild;  // static property > splitval
+      } else {
+        cur = global_tree[cur].rchild;  // static property <= splitval
+      }
+    }
+    FlatDecisionNode flat;
+    if (global_tree[cur].property == -1) {  // Leaf: copy it over unchanged.
+      flat.property0 = -1;
+      flat.childID = global_tree[cur].lchild;  // Decoder maps via context_map.
+      flat.predictor = global_tree[cur].predictor;
+      flat.predictor_offset = global_tree[cur].predictor_offset;
+      flat.multiplier = global_tree[cur].multiplier;
+      *gradient_only &= flat.predictor == Predictor::Gradient;
+      has_wp |= flat.predictor == Predictor::Weighted;
+      has_non_wp |= flat.predictor != Predictor::Weighted;
+      output.push_back(flat);
+      continue;
+    }
+    flat.childID = output.size() + nodes.size() + 1;  // Slot of 1st successor.
+
+    flat.property0 = global_tree[cur].property;
+    *num_props = std::max<size_t>(flat.property0 + 1, *num_props);
+    flat.splitval0 = global_tree[cur].splitval;
+
+    for (size_t i = 0; i < 2; i++) {
+      size_t cur_child =
+          i == 0 ? global_tree[cur].lchild : global_tree[cur].rchild;
+      // Skip nodes that we can decide now.
+      while (global_tree[cur_child].property < kNumStaticProperties &&
+             global_tree[cur_child].property != -1) {
+        if (static_props[global_tree[cur_child].property] >
+            global_tree[cur_child].splitval) {
+          cur_child = global_tree[cur_child].lchild;
+        } else {
+          cur_child = global_tree[cur_child].rchild;
+        }
+      }
+      // We ended up in a leaf, add a dummy decision and two copies of the leaf.
+      if (global_tree[cur_child].property == -1) {
+        flat.properties[i] = 0;
+        flat.splitvals[i] = 0;
+        nodes.push(cur_child);
+        nodes.push(cur_child);
+      } else {
+        flat.properties[i] = global_tree[cur_child].property;
+        flat.splitvals[i] = global_tree[cur_child].splitval;
+        nodes.push(global_tree[cur_child].lchild);
+        nodes.push(global_tree[cur_child].rchild);
+        *num_props = std::max<size_t>(flat.properties[i] + 1, *num_props);
+      }
+    }
+
+    for (size_t j = 0; j < 2; j++) mark_property(flat.properties[j]);
+    mark_property(flat.property0);
+    output.push_back(flat);
+  }
+  if (*num_props > kNumNonrefProperties) {  // Round the extra props up.
+    *num_props =
+        DivCeil(*num_props - kNumNonrefProperties, kExtraPropsPerChannel) *
+            kExtraPropsPerChannel +
+        kNumNonrefProperties;
+  } else {
+    *num_props = kNumNonrefProperties;
+  }
+  *use_wp = has_wp;
+  *wp_only = has_wp && !has_non_wp;
+
+  return output;
+}
+
+Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
+                                 const std::vector<uint8_t> &context_map,
+                                 const Tree &global_tree,
+                                 const weighted::Header &wp_header,
+                                 pixel_type chan, size_t group_id,
+                                 Image *image) {
+  Channel &channel = image->channel[chan];
+
+  std::array<pixel_type, kNumStaticProperties> static_props = {
+      {chan, (int)group_id}};
+  // FilterTree below resolves every decision on these static properties.
+
+  // An empty channel carries no data, so there is nothing to decode.
+  if (channel.w == 0 || channel.h == 0) return true;
+
+  bool tree_has_wp_prop_or_pred = false;
+  bool is_wp_only = false;
+  bool is_gradient_only = false;
+  size_t num_props;
+  FlatTree tree =
+      FilterTree(global_tree, static_props, &num_props,
+                 &tree_has_wp_prop_or_pred, &is_wp_only, &is_gradient_only);
+
+  // From here on, tree lookup returns a *clustered* context ID.
+  // This avoids an extra memory lookup after tree traversal.
+  for (size_t i = 0; i < tree.size(); i++) {
+    if (tree[i].property0 == -1) {
+      tree[i].childID = context_map[tree[i].childID];
+    }
+  }
+
+  JXL_DEBUG_V(3, "Decoded MA tree with %" PRIuS " nodes", tree.size());
+
+  // MAANS decode: pixel = UnpackSigned(v) * multiplier + offset.
+  const auto make_pixel = [](uint64_t v, pixel_type multiplier,
+                             pixel_type_w offset) -> pixel_type {
+    JXL_DASSERT((v & 0xFFFFFFFF) == v);
+    pixel_type_w val = UnpackSigned(v);
+    // if it overflows, it overflows, and we have a problem anyway
+    return val * multiplier + offset;
+  };
+
+  if (tree.size() == 1) {
+    // special optimized case: no meta-adaptation, so no need
+    // to compute properties.
+    Predictor predictor = tree[0].predictor;
+    int64_t offset = tree[0].predictor_offset;
+    int32_t multiplier = tree[0].multiplier;
+    size_t ctx_id = tree[0].childID;
+    if (predictor == Predictor::Zero) {
+      uint32_t value;
+      if (reader->IsSingleValueAndAdvance(ctx_id, &value,
+                                          channel.w * channel.h)) {
+        // Special-case: histogram has a single symbol, with no extra bits, and
+        // we use ANS mode.
+        JXL_DEBUG_V(8, "Fastest track.");
+        pixel_type v = make_pixel(value, multiplier, offset);
+        for (size_t y = 0; y < channel.h; y++) {
+          pixel_type *JXL_RESTRICT r = channel.Row(y);
+          std::fill(r, r + channel.w, v);
+        }
+      } else {
+        JXL_DEBUG_V(8, "Fast track.");
+        if (multiplier == 1 && offset == 0) {
+          for (size_t y = 0; y < channel.h; y++) {
+            pixel_type *JXL_RESTRICT r = channel.Row(y);
+            for (size_t x = 0; x < channel.w; x++) {
+              uint32_t v = reader->ReadHybridUintClustered(ctx_id, br);
+              r[x] = UnpackSigned(v);
+            }
+          }
+        } else {
+          for (size_t y = 0; y < channel.h; y++) {
+            pixel_type *JXL_RESTRICT r = channel.Row(y);
+            for (size_t x = 0; x < channel.w; x++) {
+              uint32_t v = reader->ReadHybridUintClustered(ctx_id, br);
+              r[x] = make_pixel(v, multiplier, offset);
+            }
+          }
+        }
+      }
+    } else if (predictor == Predictor::Gradient && offset == 0 &&
+               multiplier == 1 && reader->HuffRleOnly()) {
+      JXL_DEBUG_V(8, "Gradient RLE (fjxl) very fast track.");
+      uint32_t run = 0;  // Remaining repeats of `sv`; carried across rows.
+      uint32_t v = 0;
+      pixel_type_w sv = 0;
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        const pixel_type *JXL_RESTRICT rtop = (y ? channel.Row(y - 1) : r - 1);
+        const pixel_type *JXL_RESTRICT rtopleft =
+            (y ? channel.Row(y - 1) - 1 : r - 1);
+        pixel_type_w guess = (y ? rtop[0] : 0);
+        if (run == 0) {
+          reader->ReadHybridUintClusteredHuffRleOnly(ctx_id, br, &v, &run);
+          sv = UnpackSigned(v);
+        } else {
+          run--;
+        }
+        r[0] = sv + guess;
+        for (size_t x = 1; x < channel.w; x++) {
+          pixel_type left = r[x - 1];
+          pixel_type top = rtop[x];
+          pixel_type topleft = rtopleft[x];
+          pixel_type_w guess = ClampedGradient(top, left, topleft);
+          if (!run) {
+            reader->ReadHybridUintClusteredHuffRleOnly(ctx_id, br, &v, &run);
+            sv = UnpackSigned(v);
+          } else {
+            run--;
+          }
+          r[x] = sv + guess;
+        }
+      }
+    } else if (predictor == Predictor::Gradient && offset == 0 &&
+               multiplier == 1) {
+      JXL_DEBUG_V(8, "Gradient very fast track.");
+      const intptr_t onerow = channel.plane.PixelsPerRow();
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        for (size_t x = 0; x < channel.w; x++) {
+          pixel_type left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+          pixel_type top = (y ? *(r + x - onerow) : left);
+          pixel_type topleft = (x && y ? *(r + x - 1 - onerow) : left);
+          pixel_type guess = ClampedGradient(top, left, topleft);
+          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          r[x] = make_pixel(v, 1, guess);
+        }
+      }
+    } else if (predictor != Predictor::Weighted) {
+      // special optimized case: no wp
+      JXL_DEBUG_V(8, "Quite fast track.");
+      const intptr_t onerow = channel.plane.PixelsPerRow();
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        for (size_t x = 0; x < channel.w; x++) {
+          PredictionResult pred =
+              PredictNoTreeNoWP(channel.w, r + x, onerow, x, y, predictor);
+          pixel_type_w g = pred.guess + offset;
+          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          // NOTE: pred.multiplier is unset.
+          r[x] = make_pixel(v, multiplier, g);
+        }
+      }
+    } else {
+      JXL_DEBUG_V(8, "Somewhat fast track.");
+      const intptr_t onerow = channel.plane.PixelsPerRow();
+      weighted::State wp_state(wp_header, channel.w, channel.h);
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        for (size_t x = 0; x < channel.w; x++) {
+          pixel_type_w g = PredictNoTreeWP(channel.w, r + x, onerow, x, y,
+                                           predictor, &wp_state)
+                               .guess +
+                           offset;
+          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          r[x] = make_pixel(v, multiplier, g);
+          wp_state.UpdateErrors(r[x], x, y, channel.w);
+        }
+      }
+    }
+    return true;
+  }
+
+  // Check if this tree is a WP-only tree with a small enough property value
+  // range.
+  // Initialized to avoid clang-tidy complaining.
+  uint8_t context_lookup[2 * kPropRangeFast] = {};
+  int8_t multipliers[2 * kPropRangeFast] = {};
+  int8_t offsets[2 * kPropRangeFast] = {};
+  if (is_wp_only) {
+    is_wp_only = TreeToLookupTable(tree, context_lookup, offsets, multipliers);
+  }
+  if (is_gradient_only) {
+    is_gradient_only =
+        TreeToLookupTable(tree, context_lookup, offsets, multipliers);
+  }
+
+  if (is_gradient_only) {
+    JXL_DEBUG_V(8, "Gradient fast track.");
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        int32_t guess = ClampedGradient(top, left, topleft);
+        uint32_t pos =
+            kPropRangeFast +
+            std::min<pixel_type_w>(
+                std::max<pixel_type_w>(-kPropRangeFast, top + left - topleft),
+                kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+        r[x] = make_pixel(v, multipliers[pos],
+                          static_cast<pixel_type_w>(offsets[pos]) + guess);
+      }
+    }
+  } else if (is_wp_only) {
+    JXL_DEBUG_V(8, "WP fast track.");
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    Properties properties(1);
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        size_t offset = 0;
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        pixel_type_w topright =
+            (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top);
+        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        int32_t guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, channel.w, top, left, topright, topleft, toptop, &properties,
+            offset);
+        uint32_t pos =
+            kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
+                                      kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+        r[x] = make_pixel(v, multipliers[pos],
+                          static_cast<pixel_type_w>(offsets[pos]) + guess);
+        wp_state.UpdateErrors(r[x], x, y, channel.w);
+      }
+    }
+  } else if (!tree_has_wp_prop_or_pred) {
+    // special optimized case: the weighted predictor and its properties are not
+    // used, so no need to compute weights and properties.
+    JXL_DEBUG_V(8, "Slow track.");
+    MATreeLookup tree_lookup(tree);
+    Properties properties = Properties(num_props);
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT p = channel.Row(y);
+      PrecomputeReferences(channel, y, *image, chan, &references);
+      InitPropsRow(&properties, static_props, y);
+      if (y > 1 && channel.w > 8 && references.w == 0) {  // NEC for [2, w - 2)
+        for (size_t x = 0; x < 2; x++) {
+          PredictionResult res =
+              PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                              tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+        for (size_t x = 2; x < channel.w - 2; x++) {
+          PredictionResult res =
+              PredictTreeNoWPNEC(&properties, channel.w, p + x, onerow, x, y,
+                                 tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+        for (size_t x = channel.w - 2; x < channel.w; x++) {
+          PredictionResult res =
+              PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                              tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+      } else {
+        for (size_t x = 0; x < channel.w; x++) {
+          PredictionResult res =
+              PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                              tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+      }
+    }
+  } else {
+    JXL_DEBUG_V(8, "Slowest track.");
+    MATreeLookup tree_lookup(tree);
+    Properties properties = Properties(num_props);
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT p = channel.Row(y);
+      InitPropsRow(&properties, static_props, y);
+      PrecomputeReferences(channel, y, *image, chan, &references);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult res =
+            PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                          tree_lookup, references, &wp_state);
+        uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+        p[x] = make_pixel(v, res.multiplier, res.guess);
+        wp_state.UpdateErrors(p[x], x, y, channel.w);
+      }
+    }
+  }
+  return true;
+}
+
+GroupHeader::GroupHeader() { Bundle::Init(this); }  // NOTE(review): presumably applies the VisitFields defaults - confirm.
+
+Status ValidateChannelDimensions(const Image &image,
+                                 const ModularOptions &options) {
+  // Only channels from the first one exceeding group_dim onwards are checked.
+  const size_t total = image.channel.size();
+  size_t first = image.nb_meta_channels;
+  while (first < total && image.channel[first].w <= options.group_dim &&
+         image.channel[first].h <= options.group_dim) {
+    first++;
+  }
+  // DC channels (both shifts >= 3) get a group size scaled by kBlockDim.
+  for (const bool want_dc : {true, false}) {
+    const size_t dim = options.group_dim * (want_dc ? kBlockDim : 1);
+    for (size_t idx = first; idx < total; idx++) {
+      const Channel &chan = image.channel[idx];
+      const bool is_empty = chan.w == 0 || chan.h == 0;
+      const bool is_dc = std::min(chan.hshift, chan.vshift) >= 3;
+      if (is_empty || is_dc != want_dc) continue;
+      const size_t tile = dim >> std::max(chan.hshift, chan.vshift);
+      if (tile == 0) return JXL_FAILURE("Inconsistent transforms");
+    }
+  }
+  return true;
+}
+
+Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
+                     size_t group_id, ModularOptions *options,
+                     const Tree *global_tree, const ANSCode *global_code,
+                     const std::vector<uint8_t> *global_ctx_map,
+                     bool allow_truncated_group) {
+  if (image.channel.empty()) return true;
+
+  // Read the group header, which carries the list of transforms.
+  Status status = Bundle::Read(br, &header);
+  if (!allow_truncated_group) JXL_RETURN_IF_ERROR(status);
+  if (status.IsFatalError()) return status;
+  if (!br->AllReadsWithinBounds()) {
+    // Don't do/undo transforms if header is incomplete.
+    header.transforms.clear();
+    image.transform = header.transforms;
+    for (size_t c = 0; c < image.channel.size(); c++) {
+      ZeroFillImage(&image.channel[c].plane);
+    }
+    return Status(StatusCode::kNotEnoughBytes);
+  }
+
+  JXL_DEBUG_V(3, "Image data underwent %" PRIuS " transformations: ",
+              header.transforms.size());
+  image.transform = header.transforms;
+  for (Transform &transform : image.transform) {
+    JXL_RETURN_IF_ERROR(transform.MetaApply(image));
+  }
+  if (image.error) {
+    return JXL_FAILURE("Corrupt file. Aborting.");
+  }
+  JXL_RETURN_IF_ERROR(ValidateChannelDimensions(image, *options));
+
+  size_t nb_channels = image.channel.size();
+
+  size_t num_chans = 0;            // Non-empty channels that will be decoded.
+  size_t distance_multiplier = 0;  // Largest width among those channels.
+  for (size_t i = 0; i < nb_channels; i++) {
+    Channel &channel = image.channel[i];
+    if (!channel.w || !channel.h) {
+      continue;  // skip empty channels
+    }
+    if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size ||
+                                        channel.h > options->max_chan_size)) {
+      break;
+    }
+    if (channel.w > distance_multiplier) {
+      distance_multiplier = channel.w;
+    }
+    num_chans++;
+  }
+  if (num_chans == 0) return true;
+
+  size_t next_channel = 0;  // Channels from here on are zeroed by the guard.
+  auto scope_guard = MakeScopeGuard([&]() {
+    // Do not do anything if truncated groups are not allowed.
+    if (!allow_truncated_group) return;
+    for (size_t c = next_channel; c < nb_channels; c++) {
+      ZeroFillImage(&image.channel[c].plane);
+    }
+  });
+
+  // Read a local tree, or use the global one if the header requests it.
+  Tree tree_storage;
+  std::vector<uint8_t> context_map_storage;
+  ANSCode code_storage;
+  const Tree *tree = &tree_storage;
+  const ANSCode *code = &code_storage;
+  const std::vector<uint8_t> *context_map = &context_map_storage;
+  if (!header.use_global_tree) {
+    size_t max_tree_size = 1024;  // Plus the pixel count of decoded channels.
+    for (size_t i = 0; i < nb_channels; i++) {
+      Channel &channel = image.channel[i];
+      if (!channel.w || !channel.h) {
+        continue;  // skip empty channels
+      }
+      if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size ||
+                                          channel.h > options->max_chan_size)) {
+        break;
+      }
+      size_t pixels = channel.w * channel.h;
+      if (pixels / channel.w != channel.h) {
+        return JXL_FAILURE("Tree size overflow");
+      }
+      max_tree_size += pixels;
+      if (max_tree_size < pixels) return JXL_FAILURE("Tree size overflow");
+    }
+    max_tree_size = std::min(static_cast<size_t>(1 << 20), max_tree_size);
+    JXL_RETURN_IF_ERROR(DecodeTree(br, &tree_storage, max_tree_size));
+    JXL_RETURN_IF_ERROR(DecodeHistograms(br, (tree_storage.size() + 1) / 2,
+                                         &code_storage, &context_map_storage));
+  } else {
+    if (!global_tree || !global_code || !global_ctx_map ||
+        global_tree->empty()) {
+      return JXL_FAILURE("No global tree available but one was requested");
+    }
+    tree = global_tree;
+    code = global_code;
+    context_map = global_ctx_map;
+  }
+
+  // Read channels
+  ANSSymbolReader reader(code, br, distance_multiplier);
+  for (; next_channel < nb_channels; next_channel++) {
+    Channel &channel = image.channel[next_channel];
+    if (!channel.w || !channel.h) {
+      continue;  // skip empty channels
+    }
+    if (next_channel >= image.nb_meta_channels &&
+        (channel.w > options->max_chan_size ||
+         channel.h > options->max_chan_size)) {
+      break;
+    }
+    JXL_RETURN_IF_ERROR(DecodeModularChannelMAANS(
+        br, &reader, *context_map, *tree, header.wp_header, next_channel,
+        group_id, &image));
+    // Truncated group.
+    if (!br->AllReadsWithinBounds()) {
+      if (!allow_truncated_group) return JXL_FAILURE("Truncated input");
+      return Status(StatusCode::kNotEnoughBytes);
+    }
+  }
+
+  // Make sure no zero-filling happens even if next_channel < nb_channels.
+  scope_guard.Disarm();
+
+  if (!reader.CheckANSFinalState()) {
+    return JXL_FAILURE("ANS decode final state failed");
+  }
+  return true;
+}
+
+Status ModularGenericDecompress(BitReader *br, Image &image,
+                                GroupHeader *header, size_t group_id,
+                                ModularOptions *options, bool undo_transforms,
+                                const Tree *tree, const ANSCode *code,
+                                const std::vector<uint8_t> *ctx_map,
+                                bool allow_truncated_group) {
+#ifdef JXL_ENABLE_ASSERT
+  std::vector<std::pair<uint32_t, uint32_t>> req_sizes(image.channel.size());
+  for (size_t c = 0; c < req_sizes.size(); c++) {
+    req_sizes[c] = {image.channel[c].w, image.channel[c].h};
+  }
+#endif
+  GroupHeader local_header;
+  if (header == nullptr) header = &local_header;  // Header is optional.
+  size_t bit_pos = br->TotalBitsConsumed();
+  auto dec_status = ModularDecode(br, image, *header, group_id, options, tree,
+                                  code, ctx_map, allow_truncated_group);
+  if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status);
+  if (dec_status.IsFatalError()) return dec_status;
+  if (undo_transforms) image.undo_transforms(header->wp_header);
+  if (image.error) return JXL_FAILURE("Corrupt file. Aborting.");
+  JXL_DEBUG_V(4,
+              "Modular-decoded a %" PRIuS "x%" PRIuS " nbchans=%" PRIuS
+              " image from %" PRIuS " bytes",
+              image.w, image.h, image.channel.size(),
+              (br->TotalBitsConsumed() - bit_pos) / 8);
+  JXL_DEBUG_V(5, "Modular image: %s", image.DebugString().c_str());
+  (void)bit_pos;  // Otherwise only referenced by the debug macro above.
+#ifdef JXL_ENABLE_ASSERT
+  // Check that after applying all transforms we are back to the requested image
+  // sizes, otherwise there's a programming error with the transformations.
+  if (undo_transforms) {
+    JXL_ASSERT(image.channel.size() == req_sizes.size());
+    for (size_t c = 0; c < req_sizes.size(); c++) {
+      JXL_ASSERT(req_sizes[c].first == image.channel[c].w);
+      JXL_ASSERT(req_sizes[c].second == image.channel[c].h);
+    }
+  }
+#endif
+  return dec_status;  // Non-fatal statuses from ModularDecode pass through.
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h
new file mode 100644
index 0000000000..89697bce87
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h
@@ -0,0 +1,135 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENCODING_H_
+#define LIB_JXL_MODULAR_ENCODING_ENCODING_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+// Valid range of properties for using lookup tables instead of trees.
+constexpr int32_t kPropRangeFast = 512;
+
+struct GroupHeader : public Fields {
+  GroupHeader();
+
+  JXL_FIELDS_NAME(GroupHeader)
+
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &use_global_tree));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&wp_header));
+    uint32_t num_transforms = static_cast<uint32_t>(transforms.size());
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2),
+                                           BitsOffset(8, 18), 0,
+                                           &num_transforms));
+    if (visitor->IsReading()) transforms.resize(num_transforms);  // Make room.
+    for (size_t i = 0; i < num_transforms; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&transforms[i]));
+    }
+    return true;
+  }
+
+  bool use_global_tree;        // If false, ModularDecode reads a local tree.
+  weighted::Header wp_header;  // Handed to weighted::State by the decoder.
+
+  std::vector<Transform> transforms;  // Visited in order, after the count.
+};
+
+FlatTree FilterTree(const Tree &global_tree,
+                    std::array<pixel_type, kNumStaticProperties> &static_props,
+                    size_t *num_props, bool *use_wp, bool *wp_only,
+                    bool *gradient_only);
+
+template <typename T>  // Fills per-value lookup tables from `tree`; false if impossible.
+bool TreeToLookupTable(const FlatTree &tree,
+                       T context_lookup[2 * kPropRangeFast],
+                       int8_t offsets[2 * kPropRangeFast],
+                       int8_t multipliers[2 * kPropRangeFast] = nullptr) {
+  struct TreeRange {
+    // Begin *excluded*, end *included*. This works best with > vs <= decision
+    // nodes.
+    int begin, end;
+    size_t pos;  // Index of the node in `tree` that handles this range.
+  };
+  std::vector<TreeRange> ranges;
+  ranges.push_back(TreeRange{-kPropRangeFast - 1, kPropRangeFast - 1, 0});
+  while (!ranges.empty()) {
+    TreeRange cur = ranges.back();
+    ranges.pop_back();
+    if (cur.begin < -kPropRangeFast - 1 || cur.begin >= kPropRangeFast - 1 ||
+        cur.end > kPropRangeFast - 1) {
+      // Tree is outside the allowed range, exit.
+      return false;
+    }
+    auto &node = tree[cur.pos];
+    // Leaf: check that it fits in int8_t, then fill every value in the range.
+    if (node.property0 == -1) {
+      if (node.predictor_offset < std::numeric_limits<int8_t>::min() ||
+          node.predictor_offset > std::numeric_limits<int8_t>::max()) {
+        return false;
+      }
+      if (node.multiplier < std::numeric_limits<int8_t>::min() ||
+          node.multiplier > std::numeric_limits<int8_t>::max()) {
+        return false;
+      }
+      if (multipliers == nullptr && node.multiplier != 1) {
+        return false;
+      }
+      for (int i = cur.begin + 1; i < cur.end + 1; i++) {
+        context_lookup[i + kPropRangeFast] = node.childID;
+        if (multipliers) multipliers[i + kPropRangeFast] = node.multiplier;
+        offsets[i + kPropRangeFast] = node.predictor_offset;
+      }
+      continue;
+    }
+    // > side of top node; a static properties[0] is FilterTree's dummy split.
+    if (node.properties[0] >= kNumStaticProperties) {
+      ranges.push_back(TreeRange({node.splitvals[0], cur.end, node.childID}));
+      ranges.push_back(
+          TreeRange({node.splitval0, node.splitvals[0], node.childID + 1}));
+    } else {
+      ranges.push_back(TreeRange({node.splitval0, cur.end, node.childID}));
+    }
+    // <= side
+    if (node.properties[1] >= kNumStaticProperties) {
+      ranges.push_back(
+          TreeRange({node.splitvals[1], node.splitval0, node.childID + 2}));
+      ranges.push_back(
+          TreeRange({cur.begin, node.splitvals[1], node.childID + 3}));
+    } else {
+      ranges.push_back(
+          TreeRange({cur.begin, node.splitval0, node.childID + 2}));
+    }
+  }
+  return true;
+}
+// TODO(veluca): make cleaner interfaces.
+
+Status ValidateChannelDimensions(const Image &image,
+                                 const ModularOptions &options);
+
+Status ModularGenericDecompress(BitReader *br, Image &image,
+                                GroupHeader *header, size_t group_id,
+                                ModularOptions *options,
+                                bool undo_transforms = true,
+                                const Tree *tree = nullptr,
+                                const ANSCode *code = nullptr,
+                                const std::vector<uint8_t> *ctx_map = nullptr,
+                                bool allow_truncated_group = false);
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENCODING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h
new file mode 100644
index 0000000000..71b7847321
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_
+#define LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_
+
+#include <stddef.h>
+
+namespace jxl {
+
+enum MATreeContext : size_t {  // NOTE(review): presumably entropy-coding contexts for MA tree fields - confirm.
+  kSplitValContext = 0,
+  kPropertyContext = 1,
+  kPredictorContext = 2,
+  kOffsetContext = 3,
+  kMultiplierLogContext = 4,
+  kMultiplierBitsContext = 5,
+
+  kNumTreeContexts = 6,  // Count of the contexts above; keep in sync.
+};
+
+static constexpr size_t kMaxTreeSize = 1 << 22;  // 2^22; NOTE(review): presumably a node-count cap - confirm at use sites.
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc b/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc
new file mode 100644
index 0000000000..785d0c5443
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc
@@ -0,0 +1,77 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/modular_image.h"
+
+#include <sstream>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+void Image::undo_transforms(const weighted::Header &wp_header,
+                            jxl::ThreadPool *pool) {
+  // Invert the pending transforms from last to first, popping each on success.
+  for (; !transform.empty(); transform.pop_back()) {
+    Transform last = transform.back();
+    JXL_DEBUG_V(4, "Undoing transform");
+    Status inverted = last.Inverse(*this, wp_header, pool);
+    if (!inverted) {
+      JXL_NOTIFY_ERROR("Error while undoing transform.");
+      error = true;
+      return;
+    }
+    JXL_DEBUG_V(8, "Undoing transform: done");
+  }
+}
+
+Image::Image(size_t iw, size_t ih, int bitdepth, int nb_chans)
+    : w(iw), h(ih), bitdepth(bitdepth), nb_meta_channels(0), error(false) {
+  for (int c = nb_chans; c > 0; --c) channel.emplace_back(iw, ih);
+}
+
+Image::Image() : w(0), h(0), bitdepth(8), nb_meta_channels(0), error(true) {}
+
+Image &Image::operator=(Image &&other) noexcept {
+  channel = std::move(other.channel);
+  transform = std::move(other.transform);
+  nb_meta_channels = other.nb_meta_channels;
+  bitdepth = other.bitdepth;
+  error = other.error;
+  h = other.h;
+  w = other.w;
+  return *this;  // vectors are moved; the scalar fields are plain copies
+}
+
+Image Image::clone() {
+  Image copy(w, h, bitdepth, 0);
+  copy.transform = transform;
+  copy.error = error;
+  copy.nb_meta_channels = nb_meta_channels;
+  // Deep-copy every channel: same size and shifts, pixels copied over.
+  for (Channel &src : channel) {
+    copy.channel.emplace_back(src.w, src.h, src.hshift, src.vshift);
+    CopyImageTo(src.plane, &copy.channel.back().plane);
+  }
+  return copy;
+}
+
+std::string Image::DebugString() const {
+  std::ostringstream out;
+  out << w << "x" << h << ", depth: " << bitdepth;
+  if (channel.empty()) return out.str();
+  out << ", channels:";
+  for (size_t idx = 0; idx < channel.size(); ++idx) {
+    const Channel &ch = channel[idx];
+    out << " " << ch.w << "x" << ch.h << "(shift: " << ch.hshift << ","
+        << ch.vshift << ")";
+    if (idx < nb_meta_channels) out << "*";  // marks meta channels
+  }
+  return out.str();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/modular_image.h b/third_party/jpeg-xl/lib/jxl/modular/modular_image.h
new file mode 100644
index 0000000000..3e9b5a8a08
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/modular_image.h
@@ -0,0 +1,118 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_MODULAR_IMAGE_H_
+#define LIB_JXL_MODULAR_MODULAR_IMAGE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+typedef int32_t pixel_type;  // can use int16_t if it's only for 8-bit images.
+                             // Need some wiggle room for YCoCg / Squeeze etc
+
+typedef int64_t pixel_type_w;
+
+namespace weighted {
+struct Header;
+}
+
+class Channel {
+ public:
+  jxl::Plane<pixel_type> plane;
+  size_t w, h;
+  int hshift, vshift;  // w ~= image.w >> hshift;  h ~= image.h >> vshift
+  Channel(size_t iw, size_t ih, int hsh = 0, int vsh = 0)
+      : plane(iw, ih), w(iw), h(ih), hshift(hsh), vshift(vsh) {}
+
+  // Not copyable.
+  Channel(const Channel& other) = delete;
+  Channel& operator=(const Channel& other) = delete;
+
+  // Movable: assignment takes over the other plane and copies its geometry.
+  Channel& operator=(Channel&& other) noexcept {
+    plane = std::move(other.plane);
+    vshift = other.vshift;
+    hshift = other.hshift;
+    h = other.h;
+    w = other.w;
+    return *this;
+  }
+  Channel(Channel&& other) noexcept = default;
+
+  // Replaces the plane with a fresh w x h one unless it already has that size.
+  void shrink() {
+    const bool size_matches = plane.xsize() == w && plane.ysize() == h;
+    if (!size_matches) plane = jxl::Plane<pixel_type>(w, h);
+  }
+  // Sets the channel size to nw x nh, then reallocates the plane if needed.
+  void shrink(int nw, int nh) {
+    w = nw;
+    h = nh;
+    shrink();
+  }
+
+  JXL_INLINE pixel_type* Row(const size_t y) { return plane.Row(y); }
+  JXL_INLINE const pixel_type* Row(const size_t y) const {
+    return plane.Row(y);
+  }
+};
+
+class Transform;
+
+class Image {
+ public:
+  // Pixel data. Applying a transform may change how many channels there are
+  // and what each of them means.
+  std::vector<Channel> channel;
+  // Transforms applied so far; they still have to be inverted.
+  std::vector<Transform> transform;
+
+  // Nominal image size (individual channels may differ because of transforms).
+  size_t w, h;
+  int bitdepth;
+  size_t nb_meta_channels;  // first few channels might contain palette(s)
+  bool error;               // true if a fatal error occurred, false otherwise
+
+  Image(size_t iw, size_t ih, int bitdepth, int nb_chans);
+  Image();
+
+  Image(const Image& other) = delete;
+  Image& operator=(const Image& other) = delete;
+
+  Image& operator=(Image&& other) noexcept;
+  Image(Image&& other) noexcept = default;
+
+  bool empty() const {  // true when no channel has both w and h non-zero
+    for (const Channel& c : channel) {
+      if (c.w != 0 && c.h != 0) return false;
+    }
+    return true;
+  }
+
+  Image clone();
+
+  void undo_transforms(const weighted::Header& wp_header,
+                       jxl::ThreadPool* pool = nullptr);
+
+  std::string DebugString() const;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_MODULAR_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/options.h b/third_party/jpeg-xl/lib/jxl/modular/options.h
new file mode 100644
index 0000000000..ce6596b912
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/options.h
@@ -0,0 +1,117 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_OPTIONS_H_
+#define LIB_JXL_MODULAR_OPTIONS_H_
+
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+namespace jxl {
+
+using PropertyVal = int32_t;
+using Properties = std::vector<PropertyVal>;
+
+enum class Predictor : uint32_t {
+  Zero = 0,
+  Left = 1,
+  Top = 2,
+  Average0 = 3,
+  Select = 4,
+  Gradient = 5,
+  Weighted = 6,
+  TopRight = 7,
+  TopLeft = 8,
+  LeftLeft = 9,
+  Average1 = 10,
+  Average2 = 11,
+  Average3 = 12,
+  Average4 = 13,  // last value counted by kNumModularPredictors
+  // The following predictors are encoder-only.
+  Best = 14,  // Best of Gradient and Weighted
+  Variable =
+      15,  // Find the best decision tree for predictors/predictor per row
+};
+
+constexpr size_t kNumModularPredictors =
+    static_cast<size_t>(Predictor::Average4) + 1;  // == 14 (Zero..Average4)
+constexpr size_t kNumModularEncoderPredictors =
+    static_cast<size_t>(Predictor::Variable) + 1;  // == 16 (+ Best, Variable)
+
+static constexpr ssize_t kNumStaticProperties = 2;  // channel, group_id.
+
+using StaticPropRange =
+    std::array<std::array<uint32_t, 2>, kNumStaticProperties>;
+
+struct ModularMultiplierInfo {
+  StaticPropRange range;  // a uint32_t pair for each of: channel, group_id
+  uint32_t multiplier;
+};
+
+struct ModularOptions {
+  /// Used in both encode and decode:
+
+  // Stop encoding/decoding when reaching a (non-meta) channel that has a
+  // dimension bigger than max_chan_size.
+  size_t max_chan_size = 0xFFFFFF;
+
+  // Used during decoding for validation of transforms (squeezing) scheme.
+  size_t group_dim = 0x1FFFFFFF;
+
+  /// Encode options:
+  // Fraction of pixels to look at to learn a MA tree (NOTE(review): also
+  // described as "number of iterations to do to learn a MA tree" - confirm).
+  // If zero there is no MA context model.
+  float nb_repeats = .5f;
+
+  // Maximum number of (previous channel) properties to use in the MA trees
+  int max_properties = 0;  // no previous channels
+
+  // Alternative heuristic tweaks.
+  // Properties default to channel, group, weighted, gradient residual, W-NW,
+  // NW-N, N-NE, N-NN
+  std::vector<uint32_t> splitting_heuristics_properties = {0,  1,  15, 9,
+                                                           10, 11, 12, 13};
+  float splitting_heuristics_node_threshold = 96;
+  size_t max_property_values = 32;
+
+  // Predictor to use for each channel. Defaults to -1, not a named Predictor.
+  Predictor predictor = static_cast<Predictor>(-1);
+
+  int wp_mode = 0;
+
+  float fast_decode_multiplier = 1.01f;
+
+  // Forces the encoder to produce a tree that is compatible with the WP-only
+  // decode path (or with the no-wp path, or the gradient-only path).
+  enum class TreeMode { kGradientOnly, kWPOnly, kNoWP, kDefault };
+  TreeMode wp_tree_mode = TreeMode::kDefault;
+
+  // Skip fast paths in the encoder.
+  bool skip_encoder_fast_path = false;
+
+  // Kind of tree to use.
+  // TODO(veluca): add tree kinds for JPEG recompression with CfL enabled,
+  // general AC metadata, different DC qualities, and others.
+  enum class TreeKind {
+    kTrivialTreeNoPredictor,
+    kLearn,
+    kJpegTranscodeACMeta,
+    kFalconACMeta,
+    kACMeta,
+    kWPFixedDC,
+    kGradientFixedDC,
+  };
+  TreeKind tree_kind = TreeKind::kLearn;
+
+  // Ignore the image and just pretend all tokens are zeroes
+  bool zero_tokens = false;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_OPTIONS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc
new file mode 100644
index 0000000000..bc31445bc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc
@@ -0,0 +1,606 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_palette.h"
+
+#include <array>
+#include <map>
+#include <set>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/enc_transform.h"
+#include "lib/jxl/modular/transform/palette.h"
+
+namespace jxl {
+
+namespace palette_internal {
+
+static constexpr bool kEncodeToHighQualityImplicitPalette = true;
+
+// Inclusive lower bound of the index range searched in lossy mode (== -143).
+static constexpr int kMinImplicitPaletteIndex = -(2 * 72 - 1);
+
+// Weighted squared distance between color `a` (float, possibly carrying
+// diffused error) and integer color `b`. Both must have the same number of
+// channels. The result is 4x the weighted per-channel term plus a sum term.
+float ColorDistance(const std::vector<float> &JXL_RESTRICT a,
+                    const std::vector<pixel_type> &JXL_RESTRICT b) {
+  JXL_ASSERT(a.size() == b.size());
+  const size_t num_channels = a.size();
+  // Brightness reference built from the first three channels (0 if fewer).
+  const float ave3 =
+      num_channels >= 3
+          ? (a[0] + b[0] + a[1] + b[1] + a[2] + b[2]) * (1.21f / 3.0f)
+          : 0;
+  float weighted_sq = 0, total_a = 0, total_b = 0;
+  for (size_t c = 0; c < num_channels; ++c) {
+    // Per-channel weight: 3, 5, then 2 for every later channel.
+    float w = c == 0 ? 3 : c == 1 ? 5 : 2;
+    if (c < 3 && (a[c] + b[c] >= ave3)) {
+      // Channels at or above the reference get a boost.
+      const float boost[3] = {1.15, 1.15, 1.12};
+      w += boost[c];
+      if (c == 2 && ((a[2] + b[2]) < 1.22 * ave3)) {
+        w -= 0.5;
+      }
+    }
+    const float diff = static_cast<float>(a[c]) - static_cast<float>(b[c]);
+    weighted_sq += diff * diff * w * w;
+    // The weighted channel sums use 3, 5, then 1 for later channels.
+    const int sum_w = c == 0 ? 3 : c == 1 ? 5 : 1;
+    total_a += a[c] * sum_w;
+    total_b += b[c] * sum_w;
+  }
+  const float sum_diff = total_a - total_b;
+  return weighted_sq * 4 + sum_diff * sum_diff;
+}
+
+static int QuantizeColorToImplicitPaletteIndex(
+    const std::vector<pixel_type> &color, const int palette_size,
+    const int bit_depth, bool high_quality) {
+  // Maps `color` to a cell of one of the two implicit color cubes that are
+  // addressed past the explicit palette: the kLargeCube-sided cube when
+  // high_quality is set, otherwise the kSmallCube-sided one. Each channel
+  // contributes one base-`cube_side` digit, lowest channel first.
+  const int cube_side = high_quality ? kLargeCube : kSmallCube;
+  // Offset of the chosen cube relative to the end of the explicit palette.
+  const int cube_base = high_quality ? kLargeCubeOffset : 0;
+  int cell = 0;
+  int stride = 1;
+  for (size_t c = 0; c < color.size(); c++) {
+    int sample = color[c];
+    if (!high_quality) {
+      // Subtract 2^max(0, bit_depth - 3) first; negative results clamp to 0.
+      sample = std::max(0, sample - (1 << (std::max(0, bit_depth - 3))));
+    }
+    // Rounded rescale from [0, 2^bit_depth - 1] to [0, kLargeCube - 1].
+    int digit = ((kLargeCube - 1) * sample + (1 << (bit_depth - 1))) /
+                ((1 << bit_depth) - 1);
+    JXL_ASSERT((digit % kLargeCube) == digit);
+    // The small cube has fewer cells per axis, so cap the digit.
+    if (!high_quality && digit > kSmallCube - 1) {
+      digit = kSmallCube - 1;
+    }
+    cell += digit * stride;
+    stride *= cube_side;
+  }
+  // Implicit indices start right after the palette_size explicit entries.
+  return cell + palette_size + cube_base;
+}
+
+}  // namespace palette_internal
+
+int RoundInt(int value, int div) {  // symmetric rounding around 0
+  const int rounded = ((value < 0 ? -value : value) + div / 2) / div;
+  return value < 0 ? -rounded : rounded;
+}
+
+struct PaletteIterationData {
+  static constexpr int kMaxDeltas = 128;
+  // Set by the caller before the pass whose result is kept.
+  bool final_run = false;
+  // Per-pixel residuals (first three channels) recorded on non-final passes,
+  // and the best distance found for that pixel.
+  std::vector<pixel_type> deltas[3];
+  std::vector<double> delta_distances;
+  // Output of FindFrequentColorDeltas().
+  std::vector<pixel_type> frequent_deltas[3];
+
+  // Clusters the recorded `deltas`, scores each cluster, and appends the best
+  // scoring ones (at most kMaxDeltas in total) to `frequent_deltas`.
+  void FindFrequentColorDeltas(int num_pixels, int bitdepth) {
+    using Delta3 = std::array<pixel_type, 3>;
+    using ScoredDelta = std::pair<Delta3, double>;
+    const pixel_type bucket = 3 << std::max(0, bitdepth - 8);
+    // Bucket every delta (a basic form of clustering) and accumulate, per
+    // bucket, the fourth root of the distance recorded with it.
+    std::map<Delta3, double> scores;
+    for (size_t i = 0; i < deltas[0].size(); ++i) {
+      const Delta3 key = {{RoundInt(deltas[0][i], bucket),
+                           RoundInt(deltas[1][i], bucket),
+                           RoundInt(deltas[2][i], bucket)}};
+      const bool all_zero = key[0] == 0 && key[1] == 0 && key[2] == 0;
+      if (!all_zero) scores[key] += sqrt(sqrt(delta_distances[i]));
+    }
+
+    // Weigh frequencies by magnitude and normalize.
+    const float per_pixel = 1.0f / num_pixels;
+    for (auto &entry : scores) {
+      std::vector<pixel_type> as_vector = {entry.first[0], entry.first[1],
+                                           entry.first[2]};
+      float magnitude =
+          sqrt(palette_internal::ColorDistance({0, 0, 0}, as_vector)) + 1;
+      entry.second *= magnitude * per_pixel;
+    }
+
+    // Order buckets by descending score.
+    std::vector<ScoredDelta> ranked(scores.begin(), scores.end());
+    std::sort(ranked.begin(), ranked.end(),
+              [](const ScoredDelta &a, const ScoredDelta &b) {
+                return a.second > b.second;
+              });
+
+    // Store the top deltas.
+    for (auto &candidate : ranked) {
+      if (frequent_deltas[0].size() >= kMaxDeltas) break;
+      // Number obtained by optimizing on jyrki31 corpus:
+      if (candidate.second < 17) break;
+      for (int c = 0; c < 3; ++c) {
+        frequent_deltas[c].push_back(candidate.first[c] * bucket);
+      }
+    }
+  }
+};
+
+// One palette-building pass over channels [begin_c, end_c] of `input`.
+// Returns false when the colors do not fit in `nb_colors` palette entries.
+// On success `nb_colors` is the number of palette colors (deltas excluded)
+// and `nb_deltas` the number of delta entries. Outside the single-channel
+// lossless case, `input` is only rewritten when final_run is set.
+Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
+                           uint32_t &nb_colors, uint32_t &nb_deltas,
+                           bool ordered, bool lossy, Predictor &predictor,
+                           const weighted::Header &wp_header,
+                           PaletteIterationData &palette_iteration_data) {
+  JXL_QUIET_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c));
+  JXL_ASSERT(begin_c >= input.nb_meta_channels);
+  uint32_t nb = end_c - begin_c + 1;
+
+  size_t w = input.channel[begin_c].w;
+  size_t h = input.channel[begin_c].h;
+
+  if (!lossy && nb == 1) {
+    // Channel palette special case
+    if (nb_colors == 0) return false;
+    std::vector<pixel_type> lookup;
+    pixel_type minval, maxval;
+    compute_minmax(input.channel[begin_c], &minval, &maxval);
+    size_t lookup_table_size =
+        static_cast<int64_t>(maxval) - static_cast<int64_t>(minval) + 1;
+    if (lookup_table_size > palette_internal::kMaxPaletteLookupTableSize) {
+      // a lookup table would use too much memory, instead use a slower approach
+      // with std::set
+      std::set<pixel_type> chpalette;
+      pixel_type idx = 0;
+      for (size_t y = 0; y < h; y++) {
+        const pixel_type *p = input.channel[begin_c].Row(y);
+        for (size_t x = 0; x < w; x++) {
+          const bool new_color = chpalette.insert(p[x]).second;
+          if (new_color) {
+            idx++;
+            if (idx > (int)nb_colors) return false;
+          }
+        }
+      }
+      JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx);
+      Channel pch(idx, 1, -1, -1);
+      nb_colors = idx;
+      idx = 0;
+      pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
+      for (pixel_type p : chpalette) {
+        p_palette[idx++] = p;
+      }
+      for (size_t y = 0; y < h; y++) {
+        pixel_type *p = input.channel[begin_c].Row(y);
+        for (size_t x = 0; x < w; x++) {
+          // Test the bound first so p_palette is never read past its end.
+          for (idx = 0; idx < (int)nb_colors && p[x] != p_palette[idx]; idx++) {
+          }
+          JXL_DASSERT(idx < (int)nb_colors);
+          p[x] = idx;
+        }
+      }
+      predictor = Predictor::Zero;
+      input.nb_meta_channels++;
+      input.channel.insert(input.channel.begin(), std::move(pch));
+
+      return true;
+    }
+    lookup.resize(lookup_table_size, 0);
+    pixel_type idx = 0;
+    for (size_t y = 0; y < h; y++) {
+      const pixel_type *p = input.channel[begin_c].Row(y);
+      for (size_t x = 0; x < w; x++) {
+        if (lookup[p[x] - minval] == 0) {
+          lookup[p[x] - minval] = 1;
+          idx++;
+          if (idx > (int)nb_colors) return false;
+        }
+      }
+    }
+    JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx);
+    Channel pch(idx, 1, -1, -1);
+    nb_colors = idx;
+    idx = 0;
+    pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
+    for (size_t i = 0; i < lookup_table_size; i++) {
+      if (lookup[i]) {
+        p_palette[idx] = i + minval;
+        lookup[i] = idx;
+        idx++;
+      }
+    }
+    for (size_t y = 0; y < h; y++) {
+      pixel_type *p = input.channel[begin_c].Row(y);
+      for (size_t x = 0; x < w; x++) p[x] = lookup[p[x] - minval];
+    }
+    predictor = Predictor::Zero;
+    input.nb_meta_channels++;
+    input.channel.insert(input.channel.begin(), std::move(pch));
+    return true;
+  }
+
+  Image quantized_input;
+  if (lossy) {
+    quantized_input = Image(w, h, input.bitdepth, nb);
+    for (size_t c = 0; c < nb; c++) {
+      CopyImageTo(input.channel[begin_c + c].plane,
+                  &quantized_input.channel[c].plane);
+    }
+  }
+
+  JXL_DEBUG_V(
+      7, "Trying to represent channels %i-%i using at most a %i-color palette.",
+      begin_c, end_c, nb_colors);
+  nb_deltas = 0;
+  bool delta_used = false;
+  std::set<std::vector<pixel_type>>
+      candidate_palette;  // ordered lexicographically
+  std::vector<std::vector<pixel_type>> candidate_palette_imageorder;
+  std::vector<pixel_type> color(nb);
+  std::vector<float> color_with_error(nb);
+  std::vector<const pixel_type *> p_in(nb);
+
+  if (lossy) {
+    palette_iteration_data.FindFrequentColorDeltas(w * h, input.bitdepth);
+    nb_deltas = palette_iteration_data.frequent_deltas[0].size();
+
+    // Count color frequency for colors that make a cross.
+    std::map<std::vector<pixel_type>, size_t> color_freq_map;
+    for (size_t y = 1; y + 1 < h; y++) {
+      for (uint32_t c = 0; c < nb; c++) {
+        p_in[c] = input.channel[begin_c + c].Row(y);
+      }
+      for (size_t x = 1; x + 1 < w; x++) {
+        for (uint32_t c = 0; c < nb; c++) {
+          color[c] = p_in[c][x];
+        }
+        int offsets[4][2] = {{1, 0}, {-1, 0}, {0, 1}, {0, -1}};
+        bool makes_cross = true;
+        for (int i = 0; i < 4 && makes_cross; ++i) {
+          int dx = offsets[i][0];
+          int dy = offsets[i][1];
+          for (uint32_t c = 0; c < nb && makes_cross; c++) {
+            if (input.channel[begin_c + c].Row(y + dy)[x + dx] != color[c]) {
+              makes_cross = false;
+            }
+          }
+        }
+        if (makes_cross) color_freq_map[color] += 1;
+      }
+    }
+    // Add colors satisfying frequency condition to the palette.
+    constexpr float kImageFraction = 0.01f;
+    size_t color_frequency_lower_bound = 5 + input.h * input.w * kImageFraction;
+    for (const auto &color_freq : color_freq_map) {
+      if (color_freq.second > color_frequency_lower_bound) {
+        candidate_palette.insert(color_freq.first);
+        candidate_palette_imageorder.push_back(color_freq.first);
+      }
+    }
+  }
+
+  for (size_t y = 0; y < h; y++) {
+    for (uint32_t c = 0; c < nb; c++) {
+      p_in[c] = input.channel[begin_c + c].Row(y);
+    }
+    for (size_t x = 0; x < w; x++) {
+      if (lossy && candidate_palette.size() >= nb_colors) break;
+      for (uint32_t c = 0; c < nb; c++) {
+        color[c] = p_in[c][x];
+      }
+      const bool new_color = candidate_palette.insert(color).second;
+      if (new_color) {
+        candidate_palette_imageorder.push_back(color);
+      }
+      if (candidate_palette.size() > nb_colors) {
+        return false;  // too many colors
+      }
+    }
+  }
+
+  nb_colors = nb_deltas + candidate_palette.size();
+  JXL_DEBUG_V(6, "Channels %i-%i can be represented using a %i-color palette.",
+              begin_c, end_c, nb_colors);
+
+  Channel pch(nb_colors, nb, -1, -1);
+  pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
+  intptr_t onerow = pch.plane.PixelsPerRow();
+  intptr_t onerow_image = input.channel[begin_c].plane.PixelsPerRow();
+  const int bit_depth = std::min(input.bitdepth, 24);
+
+  if (lossy) {
+    for (uint32_t i = 0; i < nb_deltas; i++) {
+      for (size_t c = 0; c < 3; c++) {
+        p_palette[c * onerow + i] =
+            palette_iteration_data.frequent_deltas[c][i];
+      }
+    }
+  }
+
+  int x = 0;
+  if (ordered) {
+    JXL_DEBUG_V(7, "Palette of %i colors, using lexicographic order",
+                nb_colors);
+    for (auto pcol : candidate_palette) {
+      JXL_DEBUG_V(9, "  Color %i :  ", x);
+      for (size_t i = 0; i < nb; i++) {
+        p_palette[nb_deltas + i * onerow + x] = pcol[i];
+      }
+      for (size_t i = 0; i < nb; i++) {
+        JXL_DEBUG_V(9, "%i ", pcol[i]);
+      }
+      x++;
+    }
+  } else {
+    JXL_DEBUG_V(7, "Palette of %i colors, using image order", nb_colors);
+    for (auto pcol : candidate_palette_imageorder) {
+      JXL_DEBUG_V(9, "  Color %i :  ", x);
+      for (size_t i = 0; i < nb; i++)
+        p_palette[nb_deltas + i * onerow + x] = pcol[i];
+      for (size_t i = 0; i < nb; i++) JXL_DEBUG_V(9, "%i ", pcol[i]);
+      x++;
+    }
+  }
+  std::vector<weighted::State> wp_states;
+  for (size_t c = 0; c < nb; c++) {
+    wp_states.emplace_back(wp_header, w, h);
+  }
+  std::vector<pixel_type *> p_quant(nb);
+  // Three rows of error for dithering: y to y + 2.
+  // Each row has two pixels of padding in the ends, which is
+  // beneficial for both precision and encoding speed.
+  std::vector<std::vector<float>> error_row[3];
+  if (lossy) {
+    for (int i = 0; i < 3; ++i) {
+      error_row[i].resize(nb);
+      for (size_t c = 0; c < nb; ++c) {
+        error_row[i][c].resize(w + 4);
+      }
+    }
+  }
+  for (size_t y = 0; y < h; y++) {
+    for (size_t c = 0; c < nb; c++) {
+      p_in[c] = input.channel[begin_c + c].Row(y);
+      if (lossy) p_quant[c] = quantized_input.channel[c].Row(y);
+    }
+    pixel_type *JXL_RESTRICT p = input.channel[begin_c].Row(y);
+    for (size_t x = 0; x < w; x++) {
+      int index;
+      if (!lossy) {
+        for (size_t c = 0; c < nb; c++) color[c] = p_in[c][x];
+        // Exact search.
+        for (index = 0; static_cast<uint32_t>(index) < nb_colors; index++) {
+          bool found = true;
+          for (size_t c = 0; c < nb; c++) {
+            if (color[c] != p_palette[c * onerow + index]) {
+              found = false;
+              break;
+            }
+          }
+          if (found) break;
+        }
+        if (index < static_cast<int>(nb_deltas)) {
+          delta_used = true;
+        }
+      } else {
+        int best_index = 0;
+        bool best_is_delta = false;
+        float best_distance = std::numeric_limits<float>::infinity();
+        std::vector<pixel_type> best_val(nb, 0);
+        std::vector<pixel_type> ideal_residual(nb, 0);
+        std::vector<pixel_type> quantized_val(nb);
+        std::vector<pixel_type> predictions(nb);
+        static const double kDiffusionMultiplier[] = {0.55, 0.75};
+        for (int diffusion_index = 0; diffusion_index < 2; ++diffusion_index) {
+          for (size_t c = 0; c < nb; c++) {
+            color_with_error[c] =
+                p_in[c][x] + palette_iteration_data.final_run *
+                                 kDiffusionMultiplier[diffusion_index] *
+                                 error_row[0][c][x + 2];
+            color[c] = Clamp1(lroundf(color_with_error[c]), 0l,
+                              (1l << input.bitdepth) - 1);
+          }
+
+          for (size_t c = 0; c < nb; ++c) {
+            predictions[c] = PredictNoTreeWP(w, p_quant[c] + x, onerow_image, x,
+                                             y, predictor, &wp_states[c])
+                                 .guess;
+          }
+          const auto TryIndex = [&](const int index) {
+            for (size_t c = 0; c < nb; c++) {
+              quantized_val[c] = palette_internal::GetPaletteValue(
+                  p_palette, index, /*c=*/c,
+                  /*palette_size=*/nb_colors,
+                  /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+              if (index < static_cast<int>(nb_deltas)) {
+                quantized_val[c] += predictions[c];
+              }
+            }
+            const float color_distance =
+                32.0 / (1LL << std::max(0, 2 * (bit_depth - 8))) *
+                palette_internal::ColorDistance(color_with_error,
+                                                quantized_val);
+            float index_penalty = 0;
+            if (index == -1) {
+              index_penalty = -124;
+            } else if (index < 0) {
+              index_penalty = -2 * index;
+            } else if (index < static_cast<int>(nb_deltas)) {
+              index_penalty = 250;
+            } else if (index < static_cast<int>(nb_colors)) {
+              index_penalty = 150;
+            } else if (index < static_cast<int>(nb_colors) +
+                                   palette_internal::kLargeCubeOffset) {
+              index_penalty = 70;
+            } else {
+              index_penalty = 256;
+            }
+            const float distance = color_distance + index_penalty;
+            if (distance < best_distance) {
+              best_distance = distance;
+              best_index = index;
+              best_is_delta = index < static_cast<int>(nb_deltas);
+              best_val.swap(quantized_val);
+              for (size_t c = 0; c < nb; ++c) {
+                ideal_residual[c] = color_with_error[c] - predictions[c];
+              }
+            }
+          };
+          for (index = palette_internal::kMinImplicitPaletteIndex;
+               index < static_cast<int32_t>(nb_colors); index++) {
+            TryIndex(index);
+          }
+          TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex(
+              color, nb_colors, bit_depth,
+              /*high_quality=*/false));
+          if (palette_internal::kEncodeToHighQualityImplicitPalette) {
+            TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex(
+                color, nb_colors, bit_depth,
+                /*high_quality=*/true));
+          }
+        }
+        index = best_index;
+        delta_used |= best_is_delta;
+        if (!palette_iteration_data.final_run) {
+          for (size_t c = 0; c < 3; ++c) {
+            palette_iteration_data.deltas[c].push_back(ideal_residual[c]);
+          }
+          palette_iteration_data.delta_distances.push_back(best_distance);
+        }
+
+        for (size_t c = 0; c < nb; ++c) {
+          wp_states[c].UpdateErrors(best_val[c], x, y, w);
+          p_quant[c][x] = best_val[c];
+        }
+        float len_error = 0;
+        for (size_t c = 0; c < nb; ++c) {
+          float local_error = color_with_error[c] - best_val[c];
+          len_error += local_error * local_error;
+        }
+        len_error = sqrt(len_error);
+        float modulate = 1.0;
+        int len_limit = 38 << std::max(0, bit_depth - 8);
+        if (len_error > len_limit) {
+          modulate *= len_limit / len_error;
+        }
+        for (size_t c = 0; c < nb; ++c) {
+          float total_error = (color_with_error[c] - best_val[c]);
+
+          // If the neighboring pixels have some error in the opposite
+          // direction of total_error, cancel some or all of it out before
+          // spreading among them.
+          constexpr int offsets[12][2] = {{1, 2}, {0, 3}, {0, 4}, {1, 1},
+                                          {1, 3}, {2, 2}, {1, 0}, {1, 4},
+                                          {2, 1}, {2, 3}, {2, 0}, {2, 4}};
+          float total_available = 0;
+          for (int i = 0; i < 11; ++i) {
+            const int row = offsets[i][0];
+            const int col = offsets[i][1];
+            if (std::signbit(error_row[row][c][x + col]) !=
+                std::signbit(total_error)) {
+              total_available += error_row[row][c][x + col];
+            }
+          }
+          float weight =
+              std::abs(total_error) / (std::abs(total_available) + 1e-3);
+          weight = std::min(weight, 1.0f);
+          for (int i = 0; i < 11; ++i) {
+            const int row = offsets[i][0];
+            const int col = offsets[i][1];
+            if (std::signbit(error_row[row][c][x + col]) !=
+                std::signbit(total_error)) {
+              total_error += weight * error_row[row][c][x + col];
+              error_row[row][c][x + col] *= (1 - weight);
+            }
+          }
+          total_error *= modulate;
+          const float remaining_error = (1.0f / 14.) * total_error;
+          error_row[0][c][x + 3] += 2 * remaining_error;
+          error_row[0][c][x + 4] += remaining_error;
+          error_row[1][c][x + 0] += remaining_error;
+          for (int i = 0; i < 5; ++i) {
+            error_row[1][c][x + i] += remaining_error;
+            error_row[2][c][x + i] += remaining_error;
+          }
+        }
+      }
+      if (palette_iteration_data.final_run) p[x] = index;
+    }
+    if (lossy) {
+      for (size_t c = 0; c < nb; ++c) {
+        error_row[0][c].swap(error_row[1][c]);
+        error_row[1][c].swap(error_row[2][c]);
+        std::fill(error_row[2][c].begin(), error_row[2][c].end(), 0.f);
+      }
+    }
+  }
+  if (!delta_used) {
+    predictor = Predictor::Zero;
+  }
+  if (palette_iteration_data.final_run) {
+    input.nb_meta_channels++;
+    input.channel.erase(input.channel.begin() + begin_c + 1,
+                        input.channel.begin() + end_c + 1);
+    input.channel.insert(input.channel.begin(), std::move(pch));
+  }
+  nb_colors -= nb_deltas;
+  return true;
+}
+
+Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                  uint32_t &nb_colors, uint32_t &nb_deltas, bool ordered,
+                  bool lossy, Predictor &predictor,
+                  const weighted::Header &wp_header) {
+  PaletteIterationData state;
+  // Lossy palettes at 8+ bits first run a statistics-gathering pass on
+  // scratch copies of the counts, so the caller's values stay untouched.
+  if (lossy && input.bitdepth >= 8) {
+    uint32_t scratch_colors = nb_colors;
+    uint32_t scratch_deltas = nb_deltas;
+    JXL_RETURN_IF_ERROR(FwdPaletteIteration(
+        input, begin_c, end_c, scratch_colors, scratch_deltas, ordered, lossy,
+        predictor, wp_header, state));
+  }
+  state.final_run = true;
+  return FwdPaletteIteration(input, begin_c, end_c, nb_colors, nb_deltas,
+                             ordered, lossy, predictor, wp_header, state);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h
new file mode 100644
index 0000000000..0f3d66825b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                  uint32_t &nb_colors, uint32_t &nb_deltas, bool ordered,
+                  bool lossy, Predictor &predictor,
+                  const weighted::Header &wp_header);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc
new file mode 100644
index 0000000000..050563a3c2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc
@@ -0,0 +1,73 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_rct.h"
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
+
+namespace jxl {
+
+Status FwdRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
+  if (rct_type == 0) return false;  // noop
+  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
+  const int permutation = rct_type / 7;
+  // custom 0-5: low bit = Third, high bits = Second; custom 6 = YCoCg.
+  //   Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
+  //   Third: 0=nop, 1=SubtractFirst
+  const int custom = rct_type % 7;
+  const int second = custom >> 1;
+  const int third = custom & 1;
+  const size_t m = begin_c;
+  const size_t w = input.channel[m].w;
+  const size_t h = input.channel[m].h;
+  // Offsets (relative to m) of the permuted source channels.
+  const int src0 = permutation % 3;
+  const int src1 = (permutation + 1 + permutation / 3) % 3;
+  const int src2 = (permutation + 2 - permutation / 3) % 3;
+  const auto do_rct = [&](const int y, const int thread) {
+    const pixel_type* in0 = input.channel[m + src0].Row(y);
+    const pixel_type* in1 = input.channel[m + src1].Row(y);
+    const pixel_type* in2 = input.channel[m + src2].Row(y);
+    pixel_type* out0 = input.channel[m].Row(y);
+    pixel_type* out1 = input.channel[m + 1].Row(y);
+    pixel_type* out2 = input.channel[m + 2].Row(y);
+    // in* and out* index the same three channels: read a pixel, then write.
+    if (custom == 6) {
+      for (size_t x = 0; x < w; x++) {
+        const pixel_type R = in0[x];
+        const pixel_type G = in1[x];
+        const pixel_type B = in2[x];
+        const pixel_type co = R - B;
+        const pixel_type tmp = B + (co >> 1);
+        const pixel_type cg = G - tmp;
+        out0[x] = tmp + (cg >> 1);
+        out1[x] = co;
+        out2[x] = cg;
+      }
+    } else {
+      for (size_t x = 0; x < w; x++) {
+        const pixel_type First = in0[x];
+        pixel_type Second = in1[x];
+        pixel_type Third = in2[x];
+        if (second == 1) {
+          Second -= First;
+        } else if (second == 2) {
+          Second -= (First + Third) >> 1;
+        }
+        if (third) Third -= First;
+        out0[x] = First;
+        out1[x] = Second;
+        out2[x] = Third;
+      }
+    }
+  };
+  return RunOnPool(pool, 0, h, ThreadPool::NoInit, do_rct, "FwdRCT");
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h
new file mode 100644
index 0000000000..cb5a193c8d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_
+
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+Status FwdRCT(Image &input, size_t begin_c, size_t rct_type, ThreadPool *pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc
new file mode 100644
index 0000000000..dfd90cde68
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_squeeze.h"
+
+#include <stdlib.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/squeeze.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+void FwdHSqueeze(Image &input, int c, int rc) {
+  const Channel &chin = input.channel[c];
+  JXL_DEBUG_V(4, "Doing horizontal squeeze of channel %i to new channel %i", c,
+              rc);
+  // Averages take ceil(w / 2) columns, residuals the remaining floor(w / 2).
+  Channel chout((chin.w + 1) / 2, chin.h, chin.hshift + 1, chin.vshift);
+  Channel chout_residual(chin.w - chout.w, chout.h, chin.hshift + 1,
+                         chin.vshift);
+  const auto average = [](pixel_type a, pixel_type b) -> pixel_type {
+    return (a + b + (a > b)) >> 1;
+  };
+  const size_t num_pairs = chout_residual.w;
+  const bool odd_width = (chin.w & 1) != 0;
+  for (size_t y = 0; y < chout.h; y++) {
+    const pixel_type *JXL_RESTRICT p_in = chin.Row(y);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
+    pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y);
+    for (size_t x = 0; x < num_pairs; x++) {
+      const pixel_type first = p_in[x * 2];
+      const pixel_type second = p_in[x * 2 + 1];
+      const pixel_type avg = average(first, second);
+      p_out[x] = avg;
+      // Average of the next pair, the trailing unpaired sample, or avg itself.
+      pixel_type next_avg = avg;
+      if (x + 1 < num_pairs) {
+        next_avg = average(p_in[x * 2 + 2], p_in[x * 2 + 3]);
+      } else if (odd_width) {
+        next_avg = p_in[x * 2 + 2];
+      }
+      const pixel_type left = (x > 0) ? p_in[x * 2 - 1] : avg;
+      const pixel_type tendency = SmoothTendency(left, avg, next_avg);
+      p_res[x] = (first - second) - tendency;
+    }
+    // Odd width: the last input column is copied and gets no residual.
+    if (odd_width) {
+      const int last = chout.w - 1;
+      p_out[last] = p_in[last * 2];
+    }
+  }
+  input.channel[c] = std::move(chout);
+  input.channel.insert(input.channel.begin() + rc, std::move(chout_residual));
+}
+
+void FwdVSqueeze(Image &input, int c, int rc) {
+  const Channel &chin = input.channel[c];
+
+  JXL_DEBUG_V(4, "Doing vertical squeeze of channel %i to new channel %i", c,
+              rc);
+  // Averages take ceil(h / 2) rows, residuals the remaining floor(h / 2).
+  Channel chout(chin.w, (chin.h + 1) / 2, chin.hshift, chin.vshift + 1);
+  Channel chout_residual(chin.w, chin.h - chout.h, chin.hshift,
+                         chin.vshift + 1);
+  const auto average = [](pixel_type a, pixel_type b) -> pixel_type {
+    return (a + b + (a > b)) >> 1;
+  };
+  // Offset from a sample to the one below it (PixelsPerRow of the plane).
+  const intptr_t stride = chin.plane.PixelsPerRow();
+  const size_t num_pairs = chout_residual.h;
+  const bool odd_height = (chin.h & 1) != 0;
+  for (size_t y = 0; y < num_pairs; y++) {
+    const pixel_type *JXL_RESTRICT p_in = chin.Row(y * 2);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
+    pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y);
+    for (size_t x = 0; x < chout.w; x++) {
+      const pixel_type upper = p_in[x];
+      const pixel_type lower = p_in[x + stride];
+      const pixel_type avg = average(upper, lower);
+      p_out[x] = avg;
+      // Average of the next row pair, the unpaired last row, or avg itself.
+      pixel_type next_avg = avg;
+      if (y + 1 < num_pairs) {
+        next_avg = average(p_in[x + 2 * stride], p_in[x + 3 * stride]);
+      } else if (odd_height) {
+        next_avg = p_in[x + 2 * stride];
+      }
+      const pixel_type top =
+          (y > 0) ? p_in[static_cast<ssize_t>(x) - stride] : avg;
+      const pixel_type tendency = SmoothTendency(top, avg, next_avg);
+      p_res[x] = (upper - lower) - tendency;
+    }
+  }
+  // Odd height: the last input row is copied and gets no residual.
+  if (odd_height) {
+    const size_t y = chout.h - 1;
+    const pixel_type *p_in = chin.Row(y * 2);
+    pixel_type *p_out = chout.Row(y);
+    for (size_t x = 0; x < chout.w; x++) p_out[x] = p_in[x];
+  }
+  input.channel[c] = std::move(chout);
+  input.channel.insert(input.channel.begin() + rc, std::move(chout_residual));
+}
+
+Status FwdSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool) {
+  // No explicit steps given: fall back to the default parameters.
+  if (parameters.empty()) {
+    DefaultSqueezeParameters(&parameters, input);
+  }
+  // Still no steps: report that no squeeze was done.
+  if (parameters.empty()) return false;
+  // Steps are applied in order; each squeezed channel inserts a new channel.
+  for (auto &step : parameters) {
+    JXL_RETURN_IF_ERROR(
+        CheckMetaSqueezeParams(step, input.channel.size()));
+    const uint32_t first_c = step.begin_c;
+    const uint32_t last_c = step.begin_c + step.num_c - 1;
+    // Residuals go right after the squeezed range (in place) or at the end of
+    // the channel list; the base is fixed before any channel is inserted.
+    const uint32_t residual_base =
+        step.in_place ? last_c + 1
+                      : static_cast<uint32_t>(input.channel.size());
+    for (uint32_t c = first_c; c <= last_c; c++) {
+      const uint32_t rc = residual_base + c - first_c;
+      if (step.horizontal) {
+        FwdHSqueeze(input, c, rc);
+      } else {
+        FwdVSqueeze(input, c, rc);
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h
new file mode 100644
index 0000000000..39b001017b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+Status FwdSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc
new file mode 100644
index 0000000000..bdaaf9f87e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_transform.h"
+
+#include "lib/jxl/modular/transform/enc_palette.h"
+#include "lib/jxl/modular/transform/enc_rct.h"
+#include "lib/jxl/modular/transform/enc_squeeze.h"
+
+namespace jxl {
+
+Status TransformForward(Transform &t, Image &input,
+                        const weighted::Header &wp_header, ThreadPool *pool) {
+  if (t.id == TransformId::kRCT) {
+    return FwdRCT(input, t.begin_c, t.rct_type, pool);
+  }
+  if (t.id == TransformId::kSqueeze) {
+    return FwdSqueeze(input, t.squeezes, pool);
+  }
+  if (t.id == TransformId::kPalette) {  // last channel index is inclusive
+    return FwdPalette(input, t.begin_c, t.begin_c + t.num_c - 1, t.nb_colors,
+                      t.nb_deltas, t.ordered_palette, t.lossy_palette,
+                      t.predictor, wp_header);
+  }
+  return JXL_FAILURE("Unknown transformation (ID=%u)",
+                     static_cast<unsigned int>(t.id));
+}
+
+void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max) {
+  // An empty channel leaves lo at the type's max and hi at the type's min.
+  pixel_type lo = std::numeric_limits<pixel_type>::max();
+  pixel_type hi = std::numeric_limits<pixel_type>::min();
+  for (size_t y = 0; y < ch.h; y++) {
+    const pixel_type *JXL_RESTRICT row = ch.Row(y);
+    for (size_t x = 0; x < ch.w; x++) {
+      lo = (row[x] < lo) ? row[x] : lo;
+      hi = (row[x] > hi) ? row[x] : hi;
+    }
+  }
+  if (min) *min = lo;  // either output pointer may be null
+  if (max) *max = hi;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h
new file mode 100644
index 0000000000..07659e1b0a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+Status TransformForward(Transform &t, Image &input,
+                        const weighted::Header &wp_header, ThreadPool *pool);
+
+void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc
new file mode 100644
index 0000000000..46129f19f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc
@@ -0,0 +1,176 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/palette.h"
+
+namespace jxl {
+
+Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
+                  uint32_t nb_deltas, Predictor predictor,
+                  const weighted::Header &wp_header, ThreadPool *pool) {
+  if (input.nb_meta_channels < 1) {
+    return JXL_FAILURE("Error: Palette transform without palette.");
+  }
+  std::atomic<int> num_errors{0};  // never incremented in this function
+  int nb = input.channel[0].h;  // palette rows == number of output channels
+  uint32_t c0 = begin_c + 1;  // channel holding the indices on entry
+  if (c0 >= input.channel.size()) {
+    return JXL_FAILURE("Channel is out of range.");
+  }
+  size_t w = input.channel[c0].w;
+  size_t h = input.channel[c0].h;
+  if (nb < 1) return JXL_FAILURE("Corrupted transforms");
+  for (int i = 1; i < nb; i++) {  // add nb - 1 channels after channel c0
+    input.channel.insert(
+        input.channel.begin() + c0 + 1,
+        Channel(w, h, input.channel[c0].hshift, input.channel[c0].vshift));
+  }
+  const Channel &palette = input.channel[0];
+  const pixel_type *JXL_RESTRICT p_palette = input.channel[0].Row(0);
+  intptr_t onerow = input.channel[0].plane.PixelsPerRow();
+  intptr_t onerow_image = input.channel[c0].plane.PixelsPerRow();
+  const int bit_depth = std::min(input.bitdepth, 24);
+
+  if (w == 0) {
+    // Nothing to do.
+    // Avoid touching "empty" channels with non-zero height.
+  } else if (nb_deltas == 0 && predictor == Predictor::Zero) {
+    if (nb == 1) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, h, ThreadPool::NoInit,
+          [&](const uint32_t task, size_t /* thread */) {
+            const size_t y = task;
+            pixel_type *p = input.channel[c0].Row(y);
+            for (size_t x = 0; x < w; x++) {
+              const int index = Clamp1<int>(p[x], 0, (pixel_type)palette.w - 1);
+              p[x] = palette_internal::GetPaletteValue(
+                  p_palette, index, /*c=*/0,
+                  /*palette_size=*/palette.w,
+                  /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+            }
+          },
+          "UndoChannelPalette"));
+    } else {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, h, ThreadPool::NoInit,
+          [&](const uint32_t task, size_t /* thread */) {
+            const size_t y = task;
+            std::vector<pixel_type *> p_out(nb);
+            const pixel_type *p_index = input.channel[c0].Row(y);
+            for (int c = 0; c < nb; c++)
+              p_out[c] = input.channel[c0 + c].Row(y);
+            for (size_t x = 0; x < w; x++) {
+              const int index = p_index[x];  // read before p_out[0][x] is set
+              for (int c = 0; c < nb; c++) {
+                p_out[c][x] = palette_internal::GetPaletteValue(
+                    p_palette, index, /*c=*/c,
+                    /*palette_size=*/palette.w,
+                    /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+              }
+            }
+          },
+          "UndoPalette"));
+    }
+  } else {
+    // Parallelized per channel; c0 gets overwritten, so copy the indices.
+    ImageI indices = CopyImage(input.channel[c0].plane);
+    if (predictor == Predictor::Weighted) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, nb, ThreadPool::NoInit,
+          [&](const uint32_t c, size_t /* thread */) {
+            Channel &channel = input.channel[c0 + c];
+            weighted::State wp_state(wp_header, channel.w, channel.h);
+            for (size_t y = 0; y < channel.h; y++) {
+              pixel_type *JXL_RESTRICT p = channel.Row(y);
+              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
+              for (size_t x = 0; x < channel.w; x++) {
+                int index = idx[x];
+                pixel_type_w val = 0;
+                const pixel_type palette_entry =
+                    palette_internal::GetPaletteValue(
+                        p_palette, index, /*c=*/c,
+                        /*palette_size=*/palette.w, /*onerow=*/onerow,
+                        /*bit_depth=*/bit_depth);
+                if (index < static_cast<int32_t>(nb_deltas)) {
+                  PredictionResult pred =
+                      PredictNoTreeWP(channel.w, p + x, onerow_image, x, y,
+                                      predictor, &wp_state);
+                  val = pred.guess + palette_entry;
+                } else {
+                  val = palette_entry;
+                }
+                p[x] = val;
+                wp_state.UpdateErrors(p[x], x, y, channel.w);
+              }
+            }
+          },
+          "UndoDeltaPaletteWP"));
+    } else {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, nb, ThreadPool::NoInit,
+          [&](const uint32_t c, size_t /* thread */) {
+            Channel &channel = input.channel[c0 + c];
+            for (size_t y = 0; y < channel.h; y++) {
+              pixel_type *JXL_RESTRICT p = channel.Row(y);
+              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
+              for (size_t x = 0; x < channel.w; x++) {
+                int index = idx[x];
+                pixel_type_w val = 0;
+                const pixel_type palette_entry =
+                    palette_internal::GetPaletteValue(
+                        p_palette, index, /*c=*/c,
+                        /*palette_size=*/palette.w,
+                        /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+                if (index < static_cast<int32_t>(nb_deltas)) {
+                  PredictionResult pred = PredictNoTreeNoWP(
+                      channel.w, p + x, onerow_image, x, y, predictor);
+                  val = pred.guess + palette_entry;
+                } else {
+                  val = palette_entry;
+                }
+                p[x] = val;
+              }
+            }
+          },
+          "UndoDeltaPaletteNoWP"));
+    }
+  }
+  if (c0 >= input.nb_meta_channels) {
+    // Palette was done on normal channels: only the palette stops being meta.
+    input.nb_meta_channels--;
+  } else {
+    // Palette was done on metachannels: the palette goes, nb - 1 are added.
+    JXL_ASSERT(static_cast<int>(input.nb_meta_channels) >= 2 - nb);
+    input.nb_meta_channels -= 2 - nb;
+    JXL_ASSERT(begin_c + nb - 1 < input.nb_meta_channels);
+  }
+  input.channel.erase(input.channel.begin(), input.channel.begin() + 1);
+  return num_errors.load(std::memory_order_relaxed) == 0;  // always true here
+}
+
+Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                   uint32_t nb_colors, uint32_t nb_deltas, bool lossy) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c));
+  const size_t num_c = end_c - begin_c + 1;  // channels in [begin_c, end_c]
+  const bool on_meta_channels = begin_c < input.nb_meta_channels;
+  if (on_meta_channels) {
+    // The whole range must be meta channels: num_c - 1 of them are removed
+    // below and one (the palette) is added.
+    JXL_ASSERT(end_c < input.nb_meta_channels);
+    input.nb_meta_channels += 2 - num_c;
+  } else {
+    // Palette on normal channels: only the palette is a new meta channel.
+    input.nb_meta_channels++;
+  }
+  // Keep channel begin_c, drop channels begin_c + 1 .. end_c.
+  input.channel.erase(input.channel.begin() + begin_c + 1,
+                      input.channel.begin() + end_c + 1);
+  Channel pch(nb_colors + nb_deltas, num_c);
+  pch.hshift = pch.vshift = -1;
+  input.channel.insert(input.channel.begin(), std::move(pch));
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h
new file mode 100644
index 0000000000..cc0f67960b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_
+
+#include <atomic>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
+
+namespace jxl {
+
+namespace palette_internal {
+
+static constexpr int kMaxPaletteLookupTableSize = 1 << 16;
+
+static constexpr int kRgbChannels = 3;
+
+// 5x5x5 color cube for the larger cube.
+static constexpr int kLargeCube = 5;
+
+// Smaller interleaved color cube to fill the holes of the larger cube.
+static constexpr int kSmallCube = 4;
+static constexpr int kSmallCubeBits = 2;
+// kSmallCube ** 3
+static constexpr int kLargeCubeOffset = kSmallCube * kSmallCube * kSmallCube;
+
+static inline pixel_type Scale(uint64_t value, uint64_t bit_depth,
+                               uint64_t denom) {
+  // Equivalent to (value * ((1 << bit_depth) - 1)) / denom when denom == 4.
+  // The callers in this header pass kSmallCube or kLargeCube - 1 as denom
+  // (both are 4), so the division is replaced by a right shift by 2.
+  JXL_ASSERT(denom == 4);
+  return (value * ((static_cast<uint64_t>(1) << bit_depth) - 1)) >> 2;
+}
+
+// Returns component c of palette entry `index`. Indices outside the explicit
+// palette are implicit: index < 0 reads a fixed delta table, and
+// index >= palette_size reads a 4x4x4 and then a 5x5x5 color cube. Treating
+// index < nb_deltas as a delta entry is the responsibility of the caller.
+static JXL_MAYBE_UNUSED pixel_type
+GetPaletteValue(const pixel_type *const palette, int index, const size_t c,
+                const int palette_size, const int onerow, const int bit_depth) {
+  if (index < 0) {
+    static constexpr std::array<std::array<pixel_type, 3>, 72> kDeltaPalette = {
+        {
+            {{0, 0, 0}},       {{4, 4, 4}},       {{11, 0, 0}},
+            {{0, 0, -13}},     {{0, -12, 0}},     {{-10, -10, -10}},
+            {{-18, -18, -18}}, {{-27, -27, -27}}, {{-18, -18, 0}},
+            {{0, 0, -32}},     {{-32, 0, 0}},     {{-37, -37, -37}},
+            {{0, -32, -32}},   {{24, 24, 45}},    {{50, 50, 50}},
+            {{-45, -24, -24}}, {{-24, -45, -45}}, {{0, -24, -24}},
+            {{-34, -34, 0}},   {{-24, 0, -24}},   {{-45, -45, -24}},
+            {{64, 64, 64}},    {{-32, 0, -32}},   {{0, -32, 0}},
+            {{-32, 0, 32}},    {{-24, -45, -24}}, {{45, 24, 45}},
+            {{24, -24, -45}},  {{-45, -24, 24}},  {{80, 80, 80}},
+            {{64, 0, 0}},      {{0, 0, -64}},     {{0, -64, -64}},
+            {{-24, -24, 45}},  {{96, 96, 96}},    {{64, 64, 0}},
+            {{45, -24, -24}},  {{34, -34, 0}},    {{112, 112, 112}},
+            {{24, -45, -45}},  {{45, 45, -24}},   {{0, -32, 32}},
+            {{24, -24, 45}},   {{0, 96, 96}},     {{45, -24, 24}},
+            {{24, -45, -24}},  {{-24, -45, 24}},  {{0, -64, 0}},
+            {{96, 0, 0}},      {{128, 128, 128}}, {{64, 0, 64}},
+            {{144, 144, 144}}, {{96, 96, 0}},     {{-36, -36, 36}},
+            {{45, -24, -45}},  {{45, -45, -24}},  {{0, 0, -96}},
+            {{0, 128, 128}},   {{0, 96, 0}},      {{45, 24, -45}},
+            {{-128, 0, 0}},    {{24, -45, 24}},   {{-45, 24, -45}},
+            {{64, 0, -64}},    {{64, -64, -64}},  {{96, 0, 96}},
+            {{45, -45, 24}},   {{24, 45, -45}},   {{64, 64, -64}},
+            {{128, 128, 0}},   {{0, 0, -128}},    {{-24, 45, -45}},
+        }};
+    if (c >= kRgbChannels) {
+      return 0;
+    }
+    // Do not open the brackets, otherwise INT32_MIN negation could overflow.
+    index = -(index + 1);
+    index %= 1 + 2 * (kDeltaPalette.size() - 1);  // wraps modulo 143
+    static constexpr int kMultiplier[] = {-1, 1};  // even index: negated
+    pixel_type result =
+        kDeltaPalette[((index + 1) >> 1)][c] * kMultiplier[index & 1];
+    if (bit_depth > 8) {
+      result *= static_cast<pixel_type>(1) << (bit_depth - 8);
+    }
+    return result;
+  } else if (palette_size <= index && index < palette_size + kLargeCubeOffset) {
+    if (c >= kRgbChannels) return 0;
+    index -= palette_size;
+    index >>= c * kSmallCubeBits;  // kSmallCubeBits bits per component
+    return Scale(index % kSmallCube, bit_depth, kSmallCube) +
+           (1 << (std::max(0, bit_depth - 3)));
+  } else if (palette_size + kLargeCubeOffset <= index) {
+    if (c >= kRgbChannels) return 0;
+    index -= palette_size + kLargeCubeOffset;
+    // TODO(eustas): should we take care of ambiguity created by
+    //               index >= kLargeCube ** 3 ?
+    switch (c) {
+      case 0:
+        break;
+      case 1:
+        index /= kLargeCube;
+        break;
+      case 2:
+        index /= kLargeCube * kLargeCube;
+        break;
+    }
+    return Scale(index % kLargeCube, bit_depth, kLargeCube - 1);
+  }
+  return palette[c * onerow + static_cast<size_t>(index)];  // explicit entry
+}
+
+}  // namespace palette_internal
+
+Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
+                  uint32_t nb_deltas, Predictor predictor,
+                  const weighted::Header &wp_header, ThreadPool *pool);
+
+Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                   uint32_t nb_colors, uint32_t nb_deltas, bool lossy);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc
new file mode 100644
index 0000000000..f3002a5ac3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/rct.h"
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+
+template <int transform_type>
+void InvRCTRow(const pixel_type* in0, const pixel_type* in1,
+               const pixel_type* in2, pixel_type* out0, pixel_type* out1,
+               pixel_type* out2, size_t w) {
+  static_assert(transform_type >= 0 && transform_type < 7,
+                "Invalid transform type");
+  int second = transform_type >> 1;  // operation applied to Second
+  int third = transform_type & 1;  // non-zero: First is added to Third
+
+  size_t x = 0;
+  const HWY_FULL(pixel_type) d;
+  const size_t N = Lanes(d);
+  for (; x + N - 1 < w; x += N) {  // whole vectors of N pixels
+    if (transform_type == 6) {
+      auto Y = Load(d, in0 + x);
+      auto Co = Load(d, in1 + x);
+      auto Cg = Load(d, in2 + x);
+      Y = Sub(Y, ShiftRight<1>(Cg));
+      auto G = Add(Cg, Y);
+      Y = Sub(Y, ShiftRight<1>(Co));  // Y now holds B
+      auto R = Add(Y, Co);
+      Store(R, d, out0 + x);
+      Store(G, d, out1 + x);
+      Store(Y, d, out2 + x);
+    } else {
+      auto First = Load(d, in0 + x);
+      auto Second = Load(d, in1 + x);
+      auto Third = Load(d, in2 + x);
+      if (third) Third = Add(Third, First);
+      if (second == 1) {
+        Second = Add(Second, First);
+      } else if (second == 2) {
+        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
+      }
+      Store(First, d, out0 + x);
+      Store(Second, d, out1 + x);
+      Store(Third, d, out2 + x);
+    }
+  }
+  for (; x < w; x++) {  // remaining pixels, one at a time
+    if (transform_type == 6) {
+      pixel_type Y = in0[x];
+      pixel_type Co = in1[x];
+      pixel_type Cg = in2[x];
+      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
+      pixel_type G = PixelAdd(Cg, tmp);
+      pixel_type B = PixelAdd(tmp, -(Co >> 1));
+      pixel_type R = PixelAdd(B, Co);
+      out0[x] = R;
+      out1[x] = G;
+      out2[x] = B;
+    } else {
+      pixel_type First = in0[x];
+      pixel_type Second = in1[x];
+      pixel_type Third = in2[x];
+      if (third) Third = PixelAdd(Third, First);
+      if (second == 1) {
+        Second = PixelAdd(Second, First);
+      } else if (second == 2) {
+        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
+      }
+      out0[x] = First;
+      out1[x] = Second;
+      out2[x] = Third;
+    }
+  }
+}
+
+Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
+  if (rct_type == 0) {  // noop
+    return true;
+  }
+  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
+  const int permutation = rct_type / 7;
+  JXL_CHECK(permutation < 6);
+  // custom values 0-5 have the low bit corresponding to Third and the high
+  // bits corresponding to Second. 6 corresponds to YCoCg.
+  //
+  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
+  //
+  // Third: 0=nop, 1=SubtractFirst
+  const int custom = rct_type % 7;
+  const size_t m = begin_c;
+  // Channel indices that receive outputs 0, 1 and 2 for this permutation.
+  const size_t dst0 = m + (permutation % 3);
+  const size_t dst1 = m + ((permutation + 1 + permutation / 3) % 3);
+  const size_t dst2 = m + ((permutation + 2 - permutation / 3) % 3);
+  // Special case: permute-only. Move the channels to their destinations.
+  if (custom == 0) {
+    Channel ch0 = std::move(input.channel[m]);
+    Channel ch1 = std::move(input.channel[m + 1]);
+    Channel ch2 = std::move(input.channel[m + 2]);
+    input.channel[dst0] = std::move(ch0);
+    input.channel[dst1] = std::move(ch1);
+    input.channel[dst2] = std::move(ch2);
+    return true;
+  }
+  const size_t w = input.channel[m].w;
+  const size_t h = input.channel[m].h;
+  // One instantiation of the row kernel per value of custom.
+  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
+      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
+      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
+  const auto process_row = [&](const uint32_t task, size_t /* thread */) {
+    const size_t y = task;
+    // out* are the same three channels as in*, reordered by the permutation.
+    const pixel_type* in0 = input.channel[m].Row(y);
+    const pixel_type* in1 = input.channel[m + 1].Row(y);
+    const pixel_type* in2 = input.channel[m + 2].Row(y);
+    pixel_type* out0 = input.channel[dst0].Row(y);
+    pixel_type* out1 = input.channel[dst1].Row(y);
+    pixel_type* out2 = input.channel[dst2].Row(y);
+    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
+  };
+  JXL_RETURN_IF_ERROR(
+      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
+  return true;
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(InvRCT);
+Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
+  return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h
new file mode 100644
index 0000000000..aef65621d5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_RCT_H_
+#define LIB_JXL_MODULAR_TRANSFORM_RCT_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
+
+namespace jxl {
+
+Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_RCT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc
new file mode 100644
index 0000000000..8440d9e804
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc
@@ -0,0 +1,478 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/squeeze.h"
+
+#include <stdlib.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::MulEven;
+using hwy::HWY_NAMESPACE::Ne;
+using hwy::HWY_NAMESPACE::Neg;
+using hwy::HWY_NAMESPACE::OddEven;
+using hwy::HWY_NAMESPACE::RebindToUnsigned;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Xor;
+
+#if HWY_TARGET != HWY_SCALAR
+
+JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual,
+                              const pixel_type *JXL_RESTRICT p_avg,
+                              const pixel_type *JXL_RESTRICT p_navg,
+                              const pixel_type *p_pout,
+                              pixel_type *JXL_RESTRICT p_out,
+                              pixel_type *p_nout) {
+  const HWY_CAPPED(pixel_type, 8) d;
+  const RebindToUnsigned<decltype(d)> du;
+  const size_t N = Lanes(d);
+  auto onethird = Set(d, 0x55555556);  // ceil(2^32 / 3), used to divide by 3
+  for (size_t x = 0; x < 8; x += N) {  // each call handles exactly 8 samples
+    auto avg = Load(d, p_avg + x);
+    auto next_avg = Load(d, p_navg + x);
+    auto top = Load(d, p_pout + x);  // previously reconstructed neighbour
+    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
+    auto Ba = Sub(top, avg);
+    auto an = Sub(avg, next_avg);
+    auto nonmono = Xor(Ba, an);  // negative iff Ba and an differ in sign
+    auto absBa = Abs(Ba);
+    auto absan = Abs(an);
+    auto absBn = Abs(Sub(top, next_avg));
+    // Compute a3 = absBa / 3 (a3e from MulEven on absBa, a3o via Reverse)
+    auto a3e = BitCast(d, ShiftRight<32>(MulEven(absBa, onethird)));
+    auto a3oi = MulEven(Reverse(d, absBa), onethird);
+    auto a3o = BitCast(
+        d, Reverse(hwy::HWY_NAMESPACE::Repartition<pixel_type_w, decltype(d)>(),
+                   a3oi));
+    auto a3 = OddEven(a3o, a3e);
+    a3 = Add(a3, Add(absBn, Set(d, 2)));
+    auto absdiff = ShiftRight<2>(a3);  // (absBa / 3 + absBn + 2) / 4
+    auto skipdiff = Ne(Ba, Zero(d));
+    skipdiff = And(skipdiff, Ne(an, Zero(d)));
+    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));  // both nonzero, opposite signs
+    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
+    absdiff = IfThenElse(Gt(absdiff, absBa2),
+                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
+    auto absan2 = ShiftLeft<1>(absan);
+    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
+                         absan2, absdiff);
+    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
+    auto tendency = IfThenZeroElse(skipdiff, diff1);  // zero where skipdiff holds
+
+    auto diff_minus_tendency = Load(d, p_residual + x);
+    auto diff = Add(diff_minus_tendency, tendency);
+    auto out =  // avg + diff / 2, rounding toward zero like the scalar code
+        Add(avg, ShiftRight<1>(
+                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
+    Store(out, d, p_out + x);
+    Store(Sub(out, diff), d, p_nout + x);  // second sample of each pair
+  }
+}
+
+#endif
+
+Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
+  JXL_ASSERT(c < input.channel.size());
+  JXL_ASSERT(rc < input.channel.size());
+  Channel &chin = input.channel[c];  // averages; replaced by the output below
+  const Channel &chin_residual = input.channel[rc];  // matching residuals
+  // These must be valid since we ran MetaApply already.
+  JXL_ASSERT(chin.w == DivCeil(chin.w + chin_residual.w, 2));
+  JXL_ASSERT(chin.h == chin_residual.h);
+
+  if (chin_residual.w == 0) {
+    // Short-circuit: output channel has same dimensions as input.
+    input.channel[c].hshift--;
+    return true;
+  }
+
+  // Note: chin.w >= chin_residual.w and at most 1 different.
+  Channel chout(chin.w + chin_residual.w, chin.h, chin.hshift - 1, chin.vshift);
+  JXL_DEBUG_V(4,
+              "Undoing horizontal squeeze of channel %i using residuals in "
+              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
+              c, rc, chin.w, chout.w);
+
+  if (chin_residual.h == 0) {
+    // Short-circuit: channel with no pixels.
+    input.channel[c] = std::move(chout);
+    return true;
+  }
+  auto unsqueeze_row = [&](size_t y, size_t x0) {  // scalar path, columns >= x0
+    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
+    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
+    for (size_t x = x0; x < chin_residual.w; x++) {
+      pixel_type_w diff_minus_tendency = p_residual[x];
+      pixel_type_w avg = p_avg[x];
+      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
+      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
+      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
+      pixel_type_w diff = diff_minus_tendency + tendency;
+      pixel_type_w A = avg + (diff / 2);  // first sample of the pair
+      p_out[(x << 1)] = A;
+      pixel_type_w B = A - diff;  // second sample of the pair
+      p_out[(x << 1) + 1] = B;
+    }
+    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];  // unpaired last
+  };
+
+  // Somewhat complicated trickery just to be able to SIMD this.
+  // Horizontal unsqueeze has horizontal data dependencies, so we do
+  // 8 rows at a time and treat it as a vertical unsqueeze of a
+  // transposed 8x8 block (or 9x8 for one input).
+  static constexpr const size_t kRowsPerThread = 8;
+  const auto unsqueeze_span = [&](const uint32_t task, size_t /* thread */) {
+    const size_t y0 = task * kRowsPerThread;
+    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
+    size_t x = 0;  // first column not yet produced by the SIMD path
+
+#if HWY_TARGET != HWY_SCALAR
+    intptr_t onerow_in = chin.plane.PixelsPerRow();
+    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
+    intptr_t onerow_out = chout.plane.PixelsPerRow();
+    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
+    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
+    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
+    const HWY_CAPPED(pixel_type, 8) d;
+    const size_t N = Lanes(d);
+    if (chin_residual.w > 16 && rows == kRowsPerThread) {  // full 8-row band
+      for (; x < chin_residual.w - 9; x += 8) {
+        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
+        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
+        for (size_t y = 0; y < kRowsPerThread; y++) {
+          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];  // 9th column
+        }
+        for (size_t i = 0; i < 8; i++) {
+          FastUnsqueeze(
+              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
+              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
+              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
+        }
+
+        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
+        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
+        for (size_t y = 0; y < kRowsPerThread; y++) {
+          for (size_t i = 0; i < kRowsPerThread; i += N) {
+            auto even = Load(d, b_p_out_evenT + 8 * y + i);
+            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
+            StoreInterleaved(d, even, odd,
+                             p_out + ((x + i) << 1) + onerow_out * y);
+          }
+        }
+      }
+    }
+#endif
+    for (size_t y = 0; y < rows; y++) {  // scalar remainder, from column x on
+      unsqueeze_row(y0 + y, x);
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
+                                ThreadPool::NoInit, unsqueeze_span,
+                                "InvHorizontalSqueeze"));
+  input.channel[c] = std::move(chout);
+  return true;
+}
+
+Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
+  JXL_ASSERT(c < input.channel.size());
+  JXL_ASSERT(rc < input.channel.size());
+  const Channel &chin = input.channel[c];  // averages; replaced by the output
+  const Channel &chin_residual = input.channel[rc];  // matching residuals
+  // These must be valid since we ran MetaApply already.
+  JXL_ASSERT(chin.h == DivCeil(chin.h + chin_residual.h, 2));
+  JXL_ASSERT(chin.w == chin_residual.w);
+
+  if (chin_residual.h == 0) {
+    // Short-circuit: output channel has same dimensions as input.
+    input.channel[c].vshift--;
+    return true;
+  }
+
+  // Note: chin.h >= chin_residual.h and at most 1 different.
+  Channel chout(chin.w, chin.h + chin_residual.h, chin.hshift, chin.vshift - 1);
+  JXL_DEBUG_V(
+      4,
+      "Undoing vertical squeeze of channel %i using residuals in channel "
+      "%i (going from height %" PRIuS " to %" PRIuS ")",
+      c, rc, chin.h, chout.h);
+
+  if (chin_residual.w == 0) {
+    // Short-circuit: channel with no pixels.
+    input.channel[c] = std::move(chout);
+    return true;
+  }
+
+  static constexpr const int kColsPerThread = 64;  // columns per pool task
+  const auto unsqueeze_slice = [&](const uint32_t task, size_t /* thread */) {
+    const size_t x0 = task * kColsPerThread;
+    const size_t x1 = std::min((size_t)(task + 1) * kColsPerThread, chin.w);
+    const size_t w = x1 - x0;  // width of this task's column slice
+    // We only iterate up to std::min(chin_residual.h, chin.h) which is
+    // always chin_residual.h.
+    for (size_t y = 0; y < chin_residual.h; y++) {
+      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
+      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
+      const pixel_type *JXL_RESTRICT p_navg =
+          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
+      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
+      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
+      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
+      size_t x = 0;
+#if HWY_TARGET != HWY_SCALAR
+      for (; x + 7 < w; x += 8) {  // whole groups of 8 columns
+        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
+                      p_out + x, p_nout + x);
+      }
+#endif
+      for (; x < w; x++) {  // scalar remainder (everything on HWY_SCALAR)
+        pixel_type_w avg = p_avg[x];
+        pixel_type_w next_avg = p_navg[x];
+        pixel_type_w top = p_pout[x];
+        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
+        pixel_type_w diff_minus_tendency = p_residual[x];
+        pixel_type_w diff = diff_minus_tendency + tendency;
+        pixel_type_w out = avg + (diff / 2);
+        p_out[x] = out;
+        // If the chin_residual.h == chin.h, the output has an even number
+        // of rows so the next line is fine. Otherwise, this loop won't
+        // write to the last output row which is handled separately.
+        p_nout[x] = out - diff;
+      }
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
+                                ThreadPool::NoInit, unsqueeze_slice,
+                                "InvVertSqueeze"));
+
+  if (chout.h & 1) {  // odd height: last output row copies the last average row
+    size_t y = chin.h - 1;
+    const pixel_type *p_avg = chin.Row(y);
+    pixel_type *p_out = chout.Row(y << 1);
+    for (size_t x = 0; x < chin.w; x++) {
+      p_out[x] = p_avg[x];
+    }
+  }
+  input.channel[c] = std::move(chout);
+  return true;
+}
+
+Status InvSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool) {
+  // Undo the squeeze steps one by one, last step first. Each step merges the
+  // residual channels back into the channels they were split from.
+  for (auto it = parameters.rbegin(); it != parameters.rend(); ++it) {
+    const SqueezeParams &step = *it;
+    JXL_RETURN_IF_ERROR(CheckMetaSqueezeParams(step, input.channel.size()));
+    const uint32_t first = step.begin_c;
+    const uint32_t last = step.begin_c + step.num_c - 1;
+    // First residual channel: directly after the squeezed range when in
+    // place, otherwise the residuals are the last `num_c` channels.
+    const uint32_t residuals =
+        step.in_place ? last + 1 : input.channel.size() + first - last - 1;
+    if (first < input.nb_meta_channels) {
+      // This is checked in MetaSqueeze.
+      JXL_ASSERT(input.nb_meta_channels > step.num_c);
+      input.nb_meta_channels -= step.num_c;
+    }
+
+    for (uint32_t c = first; c <= last; c++) {
+      const uint32_t rc = residuals + (c - first);
+      // MetaApply should imply that `rc` is within range, otherwise there's a
+      // programming bug.
+      JXL_ASSERT(rc < input.channel.size());
+      const Channel &averages = input.channel[c];
+      const Channel &details = input.channel[rc];
+      if (averages.w < details.w || averages.h < details.h) {
+        return JXL_FAILURE("Corrupted squeeze transform");
+      }
+      if (step.horizontal) {
+        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
+      } else {
+        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
+      }
+    }
+    // The residual channels are no longer needed once merged.
+    const auto residuals_begin = input.channel.begin() + residuals;
+    input.channel.erase(residuals_begin, residuals_begin + step.num_c);
+  }
+  return true;
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+
+HWY_EXPORT(InvSqueeze);
+Status InvSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool) {
+  return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool);  // forwards to the HWY_EXPORT'ed InvSqueeze
+}
+
+void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters,
+                              const Image &image) {
+  int nb_channels = image.channel.size() - image.nb_meta_channels;  // non-meta
+
+  parameters->clear();  // any previous contents are discarded
+  size_t w = image.channel[image.nb_meta_channels].w;  // first non-meta channel
+  size_t h = image.channel[image.nb_meta_channels].h;
+  JXL_DEBUG_V(
+      7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h);
+
+  // do horizontal first on wide images; vertical first on tall images
+  bool wide = (w > h);
+
+  if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w &&
+      image.channel[image.nb_meta_channels + 1].h == h) {
+    // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0
+    // previews (only the size of channel 1 is compared against channel 0)
+    JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h);
+    SqueezeParams params;
+    // horizontal chroma squeeze
+    params.horizontal = true;
+    params.in_place = false;
+    params.begin_c = image.nb_meta_channels + 1;
+    params.num_c = 2;
+    parameters->push_back(params);
+    params.horizontal = false;
+    // vertical chroma squeeze
+    parameters->push_back(params);
+  }
+  SqueezeParams params;  // template for the steps covering all non-meta channels
+  params.begin_c = image.nb_meta_channels;
+  params.num_c = nb_channels;
+  params.in_place = true;
+
+  if (!wide) {  // not wider than tall: start with one vertical squeeze
+    if (h > JXL_MAX_FIRST_PREVIEW_SIZE) {
+      params.horizontal = false;
+      parameters->push_back(params);
+      h = (h + 1) / 2;
+      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
+    }
+  }
+  while (w > JXL_MAX_FIRST_PREVIEW_SIZE || h > JXL_MAX_FIRST_PREVIEW_SIZE) {
+    if (w > JXL_MAX_FIRST_PREVIEW_SIZE) {
+      params.horizontal = true;
+      parameters->push_back(params);
+      w = (w + 1) / 2;  // tracked size after this step, rounded up
+      JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h);
+    }
+    if (h > JXL_MAX_FIRST_PREVIEW_SIZE) {
+      params.horizontal = false;
+      parameters->push_back(params);
+      h = (h + 1) / 2;  // tracked size after this step, rounded up
+      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
+    }
+  }
+  JXL_DEBUG_V(7, "that's it");
+}
+
+Status CheckMetaSqueezeParams(const SqueezeParams &parameter,
+                              int num_channels) {
+  // Inclusive range [first, last] of channels touched by this squeeze step.
+  const int first = parameter.begin_c;
+  const int last = parameter.begin_c + parameter.num_c - 1;
+  const bool first_ok = first >= 0 && first < num_channels;
+  if (first_ok && last >= first && last < num_channels) return true;
+  return JXL_FAILURE("Invalid channel range");
+}
+
+Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) {
+  if (parameters->empty()) {  // no explicit steps: fill in the default sequence
+    DefaultSqueezeParameters(parameters, image);
+  }
+
+  for (size_t i = 0; i < parameters->size(); i++) {
+    JXL_RETURN_IF_ERROR(
+        CheckMetaSqueezeParams((*parameters)[i], image.channel.size()));
+    bool horizontal = (*parameters)[i].horizontal;
+    bool in_place = (*parameters)[i].in_place;
+    uint32_t beginc = (*parameters)[i].begin_c;
+    uint32_t endc = (*parameters)[i].begin_c + (*parameters)[i].num_c - 1;
+
+    uint32_t offset;  // index at which the first residual channel is inserted
+    if (beginc < image.nb_meta_channels) {
+      if (endc >= image.nb_meta_channels) {
+        return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels");
+      }
+      if (!in_place) {
+        return JXL_FAILURE(
+            "Invalid squeeze: meta channels require in-place residuals");
+      }
+      image.nb_meta_channels += (*parameters)[i].num_c;  // residuals are meta too
+    }
+    if (in_place) {
+      offset = endc + 1;  // right after the squeezed range
+    } else {
+      offset = image.channel.size();  // appended at the end of the image
+    }
+    for (uint32_t c = beginc; c <= endc; c++) {
+      if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) {
+        return JXL_FAILURE("Too many squeezes: shift > 30");
+      }
+      size_t w = image.channel[c].w;
+      size_t h = image.channel[c].h;
+      if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel");
+      if (horizontal) {
+        image.channel[c].w = (w + 1) / 2;  // averages keep ceil(w / 2) columns
+        if (image.channel[c].hshift >= 0) image.channel[c].hshift++;
+        w = w - (w + 1) / 2;  // residual gets the remaining floor(w / 2)
+      } else {
+        image.channel[c].h = (h + 1) / 2;  // averages keep ceil(h / 2) rows
+        if (image.channel[c].vshift >= 0) image.channel[c].vshift++;
+        h = h - (h + 1) / 2;  // residual gets the remaining floor(h / 2)
+      }
+      image.channel[c].shrink();
+      Channel dummy(w, h);  // new residual channel with the sizes computed above
+      dummy.hshift = image.channel[c].hshift;
+      dummy.vshift = image.channel[c].vshift;
+
+      image.channel.insert(image.channel.begin() + offset + (c - beginc),
+                           std::move(dummy));
+      JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s",
+                  image.DebugString().c_str());
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h
new file mode 100644
index 0000000000..fb18710a6f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_
+
+// Haar-like transform: halves the resolution in one direction.
+// Each pair of neighbouring samples A B is replaced by:
+//   (A+B)>>1         stored in the original channel (the average); it has the
+//                    same range as the original channel.
+//   A-B - tendency   stored in a new channel (the 'residual', needed to make
+//                    the transform reversible). In theory its range could be
+//                    2.5 times larger (2 times without the 'tendency'), but
+//                    there should be lots of zeroes.
+//
+// Repeated application (alternating horizontal and vertical squeezes) results
+// in downscaling.
+//
+// The default coefficient ordering is low-frequency to high-frequency, as in
+// M. Antonini, M. Barlaud, P. Mathieu and I. Daubechies, "Image coding using
+// wavelet transform", IEEE Transactions on Image Processing, vol. 1, no. 2, pp.
+// 205-220, April 1992, doi: 10.1109/83.136597.
+
+#include <stdlib.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+#define JXL_MAX_FIRST_PREVIEW_SIZE 8
+
+namespace jxl {
+
+/*
+        int avg=(A+B)>>1;
+        int diff=(A-B);
+        int rA=(diff+(avg<<1)+(diff&1))>>1;
+        int rB=rA-diff;
+
+*/
+//         |A B|C D|E F|
+//           p   a   n             p=avg(A,B), a=avg(C,D), n=avg(E,F)
+//
+// Goal: estimate C-D (avoiding ringing artifacts)
+// (ensuring that in smooth areas, a zero residual corresponds to a smooth
+// gradient)
+
+// best estimate for C: (B + 2*a)/3
+// best estimate for D: (n + 3*a)/4
+// best estimate for C-D:  (4*B - 3*n - a)/12
+
+// avoid ringing by 1) only doing this if B <= a <= n  or  B >= a >= n
+// (otherwise, this is not a smooth area and we cannot really estimate C-D)
+//                  2) making sure that B <= C <= D <= n  or B >= C >= D >= n
+
+inline pixel_type_w SmoothTendency(pixel_type_w B, pixel_type_w a,
+                                   pixel_type_w n) {
+  const bool falling = B >= a && a >= n;
+  if (!falling && !(B <= a && a <= n)) return 0;  // not monotonic: no estimate
+  const pixel_type_w estimate = 4 * B - 3 * n - a;  // 12 * estimated (C - D)
+  const pixel_type_w lim_prev = 2 * (B - a);
+  const pixel_type_w lim_next = 2 * (a - n);
+  if (falling) {
+    pixel_type_w diff = (estimate + 6) / 12;
+    if (diff - (diff & 1) > lim_prev) diff = lim_prev + 1;
+    if (diff + (diff & 1) > lim_next) diff = lim_next;
+    return diff;
+  }
+  pixel_type_w diff = (estimate - 6) / 12;
+  if (diff + (diff & 1) < lim_prev) diff = lim_prev - 1;
+  if (diff - (diff & 1) < lim_next) diff = lim_next;
+  return diff;
+}
+
+void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters,
+                              const Image &image);
+
+Status CheckMetaSqueezeParams(const SqueezeParams &parameter, int num_channels);
+
+Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters);
+
+Status InvSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
new file mode 100644
index 0000000000..d9f2b435bf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/transform.h"
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/palette.h"
+#include "lib/jxl/modular/transform/rct.h"
+#include "lib/jxl/modular/transform/squeeze.h"
+
+namespace jxl {
+
+SqueezeParams::SqueezeParams() { Bundle::Init(this); }  // presumably loads field defaults; confirm in fields.h
+Transform::Transform(TransformId id) {
+  Bundle::Init(this);  // runs first, so the assignment below has the last word
+  this->id = id;       // record the requested transform kind
+}
+
+Status Transform::Inverse(Image &input, const weighted::Header &wp_header,
+                          ThreadPool *pool) {
+  JXL_DEBUG_V(6, "Input channels (%" PRIuS ", %" PRIuS " meta): ",
+              input.channel.size(), input.nb_meta_channels);
+
+  // Undo this transform on `input`. Each kind has its own inverse routine,
+  // which receives only the fields that belong to that kind.
+  if (id == TransformId::kRCT) return InvRCT(input, begin_c, rct_type, pool);
+  if (id == TransformId::kSqueeze) return InvSqueeze(input, squeezes, pool);
+  if (id == TransformId::kPalette) {
+    return InvPalette(input, begin_c, nb_colors, nb_deltas, predictor,
+                      wp_header, pool);
+  }
+  // Anything else (e.g. kInvalid) has no inverse.
+  return JXL_FAILURE("Unknown transformation (ID=%u)",
+                     static_cast<unsigned int>(id));
+}
+
+Status Transform::MetaApply(Image &input) {
+  JXL_DEBUG_V(6, "MetaApply input: %s", input.DebugString().c_str());
+  switch (id) {
+    case TransformId::kRCT:  // needs begin_c..begin_c+2 with equal size/shift
+      JXL_DEBUG_V(2, "Transform: kRCT, rct_type=%" PRIu32, rct_type);
+      return CheckEqualChannels(input, begin_c, begin_c + 2);
+    case TransformId::kSqueeze:  // may fill in `squeezes` with the defaults
+      JXL_DEBUG_V(2, "Transform: kSqueeze:");
+#if JXL_DEBUG_V_LEVEL >= 2
+      {
+        auto squeezes_copy = squeezes;  // log on a copy; `squeezes` untouched
+        if (squeezes_copy.empty()) {
+          DefaultSqueezeParameters(&squeezes_copy, input);
+        }
+        for (const auto &params : squeezes_copy) {
+          JXL_DEBUG_V(
+              2,
+              "  squeeze params: horizontal=%d, in_place=%d, begin_c=%" PRIu32
+              ", num_c=%" PRIu32,
+              params.horizontal, params.in_place, params.begin_c, params.num_c);
+        }
+      }
+#endif
+      return MetaSqueeze(input, &squeezes);
+    case TransformId::kPalette:  // range passed as inclusive begin_c..end
+      JXL_DEBUG_V(2,
+                  "Transform: kPalette, begin_c=%" PRIu32 ", num_c=%" PRIu32
+                  ", nb_colors=%" PRIu32 ", nb_deltas=%" PRIu32,
+                  begin_c, num_c, nb_colors, nb_deltas);
+      return MetaPalette(input, begin_c, begin_c + num_c - 1, nb_colors,
+                         nb_deltas, lossy_palette);
+    default:  // e.g. kInvalid
+      return JXL_FAILURE("Unknown transformation (ID=%u)",
+                         static_cast<unsigned int>(id));
+  }
+}
+
+Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2) {
+  const size_t total = image.channel.size();
+  if (c2 < c1 || c1 > total || c2 >= total) {
+    return JXL_FAILURE("Invalid channel range: %u..%u (there are only %" PRIuS
+                       " channels)",
+                       c1, c2, total);
+  }
+  if (c1 < image.nb_meta_channels && c2 >= image.nb_meta_channels) {
+    return JXL_FAILURE("Invalid: transforming mix of meta and nonmeta");
+  }
+  // Every channel in (c1, c2] must match channel c1 in size and in shifts.
+  const auto &ref = image.channel[c1];
+  for (size_t idx = c2; idx > c1; idx--) {
+    const auto &cur = image.channel[idx];
+    if (ref.w != cur.w || ref.h != cur.h) return false;
+    if (ref.hshift != cur.hshift || ref.vshift != cur.vshift) return false;
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h
new file mode 100644
index 0000000000..d5d3259f7a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_
+#define LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+enum class TransformId : uint32_t {
+  // G, R-G, B-G and variants (including YCoCg).
+  kRCT = 0,
+
+  // Color palette. Fields: begin_c, num_c, nb_colors, nb_deltas, predictor.
+  kPalette = 1,
+
+  // Squeezing (Haar-style)
+  kSqueeze = 2,
+
+  // Invalid for now; Transform::VisitFields rejects this value.
+  kInvalid = 3,
+};
+
+struct SqueezeParams : public Fields {
+  JXL_FIELDS_NAME(SqueezeParams)
+  bool horizontal;   // true: halve the width; false: halve the height
+  bool in_place;     // true: residuals follow the range; false: go at the end
+  uint32_t begin_c;  // first channel of the squeezed range
+  uint32_t num_c;    // number of channels in the range
+  SqueezeParams();
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &horizontal));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &in_place));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(3), BitsOffset(6, 8),
+                                           BitsOffset(10, 72),
+                                           BitsOffset(13, 1096), 0, &begin_c));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), BitsOffset(4, 4), 2, &num_c));
+    return true;
+  }
+};
+
+class Transform : public Fields {
+ public:
+  TransformId id;  // transform kind; decides which fields below are visited
+  // for Palette and RCT.
+  uint32_t begin_c;
+  // for RCT. 42 possible values starting from 0.
+  uint32_t rct_type;
+  // Only for Palette and NearLossless.
+  uint32_t num_c;
+  // Only for Palette.
+  uint32_t nb_colors;
+  uint32_t nb_deltas;
+  // for Squeeze. Default squeeze if empty.
+  std::vector<SqueezeParams> squeezes;
+  // for NearLossless, not serialized.
+  int max_delta_error;
+  // Serialized for Palette.
+  Predictor predictor;
+  // for Palette, not serialized.
+  bool ordered_palette = true;
+  bool lossy_palette = false;
+
+  explicit Transform(TransformId id);
+  // default constructor for bundles.
+  Transform() : Transform(TransformId::kInvalid) {}
+
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val((uint32_t)TransformId::kRCT), Val((uint32_t)TransformId::kPalette),
+        Val((uint32_t)TransformId::kSqueeze),
+        Val((uint32_t)TransformId::kInvalid), (uint32_t)TransformId::kRCT,
+        reinterpret_cast<uint32_t *>(&id)));
+    if (id == TransformId::kInvalid) {
+      return JXL_FAILURE("Invalid transform ID");
+    }
+    if (visitor->Conditional(id == TransformId::kRCT ||
+                             id == TransformId::kPalette)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Bits(3), BitsOffset(6, 8), BitsOffset(10, 72),
+                       BitsOffset(13, 1096), 0, &begin_c));
+    }
+    if (visitor->Conditional(id == TransformId::kRCT)) {
+      // 0-41, default YCoCg.
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(6), Bits(2), BitsOffset(4, 2),
+                                             BitsOffset(6, 10), 6, &rct_type));
+      if (rct_type >= 42) {  // reject values outside the 0-41 range
+        return JXL_FAILURE("Invalid transform RCT type");
+      }
+    }
+    if (visitor->Conditional(id == TransformId::kPalette)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(1), Val(3), Val(4), BitsOffset(13, 1), 3, &num_c));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+          BitsOffset(8, 0), BitsOffset(10, 256), BitsOffset(12, 1280),
+          BitsOffset(16, 5376), 256, &nb_colors));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(0), BitsOffset(8, 1), BitsOffset(10, 257),
+                       BitsOffset(16, 1281), 0, &nb_deltas));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Bits(4, (uint32_t)Predictor::Zero,
+                        reinterpret_cast<uint32_t *>(&predictor)));
+      if (predictor >= Predictor::Best) {  // Best and above are rejected here
+        return JXL_FAILURE("Invalid predictor");
+      }
+    }
+
+    if (visitor->Conditional(id == TransformId::kSqueeze)) {
+      uint32_t num_squeezes = static_cast<uint32_t>(squeezes.size());
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(0), BitsOffset(4, 1), BitsOffset(6, 9),
+                       BitsOffset(8, 41), 0, &num_squeezes));
+      if (visitor->IsReading()) squeezes.resize(num_squeezes);  // size first
+      for (size_t i = 0; i < num_squeezes; i++) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&squeezes[i]));
+      }
+    }
+    return true;
+  }
+
+  JXL_FIELDS_NAME(Transform)
+
+  Status Inverse(Image &input, const weighted::Header &wp_header,
+                 ThreadPool *pool = nullptr);
+  Status MetaApply(Image &input);
+};
+
+Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2);
+
+static inline pixel_type PixelAdd(pixel_type a, pixel_type b) {
+  const uint32_t sum = static_cast<uint32_t>(a) + static_cast<uint32_t>(b);
+  return static_cast<pixel_type>(sum);  // unsigned add: wraps, never overflows
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular_test.cc b/third_party/jpeg-xl/lib/jxl/modular_test.cc
new file mode 100644
index 0000000000..293f59ff87
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular_test.cc
@@ -0,0 +1,541 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <array>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/jxl.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_toc.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/modular/encoding/enc_encoding.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+using test::Roundtrip;
+
+void TestLosslessGroups(size_t group_size_shift) {
+  CompressParams cparams;
+  cparams.SetLossless();
+  cparams.modular_group_size_shift = group_size_shift;
+  // Decode the reference PNG, then shrink it to a quarter of each dimension.
+  const PaddedBytes png = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut input;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(png), &input));
+  input.ShrinkTo(input.xsize() / 4, input.ysize() / 4);
+
+  // The roundtrip has to stay within the size budget and be pixel-exact.
+  CodecInOut decoded;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&input, cparams, {}, &decoded, _, &compressed_size));
+  EXPECT_LE(compressed_size, 280000u);
+  JXL_EXPECT_OK(SamePixels(*input.Main().color(), *decoded.Main().color(), _));
+}
+
+TEST(ModularTest, RoundtripLosslessGroups128) { TestLosslessGroups(0); }
+
+TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups512)) {
+  TestLosslessGroups(2);  // group_size_shift = 2
+}
+
+TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups1024)) {
+  TestLosslessGroups(3);  // group_size_shift = 3
+}
+
+TEST(ModularTest, RoundtripLosslessCustomWP_PermuteRCT) {
+  CompressParams cparams;
+  cparams.SetLossless();
+  // Colorspace 9 permutes to GBR, exercising the permutation-only case.
+  cparams.colorspace = 9;
+  // Tortoise is the slowest tier, so the different WP modes get tried.
+  cparams.speed_tier = SpeedTier::kTortoise;
+  cparams.options.predictor = {Predictor::Weighted};
+
+  // Load the source photo and shrink it to 100x100.
+  const PaddedBytes png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut input;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(png), &input));
+  input.ShrinkTo(100, 100);
+
+  CodecInOut decoded;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&input, cparams, {}, &decoded, _, &compressed_size));
+  EXPECT_LE(compressed_size, 10150u);
+  JXL_EXPECT_OK(SamePixels(*input.Main().color(), *decoded.Main().color(), _));
+}
+
+TEST(ModularTest, RoundtripLossyDeltaPalette) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.lossy_palette = true;
+  cparams.palette_colors = 0;
+
+  CodecInOut io_out;
+  // Source image: decoded from the PNG, then shrunk to 300x100.
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(300, 100);
+  // Roundtrip, then bound the compressed size and the Butteraugli distance.
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 6800u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(1.5));
+}
+TEST(ModularTest, RoundtripLossyDeltaPaletteWP) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.SetLossless();
+  cparams.lossy_palette = true;
+  cparams.palette_colors = 0;
+  cparams.options.predictor = jxl::Predictor::Weighted;
+
+  CodecInOut io_out;
+  // Source image: decoded from the PNG, then shrunk to 300x100.
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(300, 100);
+  // Roundtrip, then bound the compressed size and the Butteraugli distance.
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 7000u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(10.1));
+}
+
+TEST(ModularTest, RoundtripLossy) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.butteraugli_distance = 2.f;
+
+  CodecInOut io_out;
+  // Source image: the PNG is used at its decoded size (no ShrinkTo here).
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  // Roundtrip, then bound the compressed size and the Butteraugli distance.
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 30000u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(2.3));
+}
+
+TEST(ModularTest, RoundtripLossy16) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/raw.pixls/DJI-FC6310-16bit_709_v4_krita.png");
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.butteraugli_distance = 2.f;
+
+  CodecInOut io_out;
+  // Source image: one frame, no preview; converted to sRGB before encoding.
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  JXL_CHECK(!io.metadata.m.have_preview);
+  JXL_CHECK(io.frames.size() == 1);
+  JXL_CHECK(io.frames[0].TransformTo(ColorEncoding::SRGB(), GetJxlCms()));
+  io.metadata.m.color_encoding = ColorEncoding::SRGB();
+  // Roundtrip, then bound the compressed size and the Butteraugli distance.
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 300u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(1.6));
+}
+
+TEST(ModularTest, RoundtripExtraProperties) {
+  constexpr size_t kSize = 250;
+  Image image(kSize, kSize, /*bitdepth=*/8, 3);
+  ModularOptions options;
+  options.max_properties = 4;
+  options.predictor = Predictor::Zero;
+  Rng rng(0);  // fixed seed
+  for (size_t y = 0; y < kSize; y++) {
+    for (size_t x = 0; x < kSize; x++) {
+      image.channel[0].plane.Row(y)[x] = image.channel[2].plane.Row(y)[x] =
+          rng.UniformU(0, 9);  // same value goes into channels 0 and 2
+    }
+  }
+  ZeroFillImage(&image.channel[1].plane);  // channel 1 is all zero
+  BitWriter writer;
+  ASSERT_TRUE(ModularGenericCompress(image, options, &writer));
+  writer.ZeroPadToByte();
+  Image decoded(kSize, kSize, /*bitdepth=*/8, image.channel.size());
+  for (size_t i = 0; i < image.channel.size(); i++) {
+    const Channel& ch = image.channel[i];
+    decoded.channel[i] = Channel(ch.w, ch.h, ch.hshift, ch.vshift);
+  }
+  Status status = true;  // handed to the scoped closer below
+  {
+    BitReader reader(writer.GetSpan());
+    BitReaderScopedCloser closer(&reader, &status);
+    ASSERT_TRUE(ModularGenericDecompress(&reader, decoded, /*header=*/nullptr,
+                                         /*group_id=*/0, &options));
+  }
+  ASSERT_TRUE(status);  // checked only after the reader scope has ended
+  ASSERT_EQ(image.channel.size(), decoded.channel.size());
+  for (size_t c = 0; c < image.channel.size(); c++) {
+    for (size_t y = 0; y < image.channel[c].plane.ysize(); y++) {
+      for (size_t x = 0; x < image.channel[c].plane.xsize(); x++) {
+        EXPECT_EQ(image.channel[c].plane.Row(y)[x],
+                  decoded.channel[c].plane.Row(y)[x])
+            << "c = " << c << ", x = " << x << ",  y = " << y;
+      }
+    }
+  }
+}
+
+TEST(ModularTest, RoundtripLosslessCustomSqueeze) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+  cparams.options.predictor = {Predictor::Zero};
+  cparams.speed_tier = SpeedTier::kThunder;
+  cparams.responsive = 1;
+  // Custom squeeze params, at the moment just for testing.
+  SqueezeParams p;
+  p.horizontal = true;
+  p.in_place = false;
+  p.begin_c = 0;
+  p.num_c = 3;
+  cparams.squeezes.push_back(p);  // first step: the values set above
+  p.begin_c = 1;  // second step reuses p; num_c stays 3
+  p.in_place = true;
+  p.horizontal = false;
+  cparams.squeezes.push_back(p);
+
+  CodecInOut io2;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 265000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
+}
+
+struct RoundtripLosslessConfig {
+  int bitdepth;    // passed to SetUintSamples() by the parameterized test
+  int responsive;  // assigned to cparams.responsive (0 or 1)
+};
+class ModularTestParam
+    : public ::testing::TestWithParam<RoundtripLosslessConfig> {};
+
+std::vector<RoundtripLosslessConfig> GenerateLosslessTests() {
+  // Bitdepths 1..31 without squeeze first, then 1..30 with squeeze.
+  std::vector<RoundtripLosslessConfig> configs;
+  for (int squeeze : {0, 1}) {
+    for (int bits = 1; bits <= (squeeze ? 30 : 31); bits++) {
+      configs.push_back({bits, squeeze});
+    }
+  }
+  return configs;
+}
+std::string LosslessTestDescription(
+    const testing::TestParamInfo<ModularTestParam::ParamType>& info) {
+  // "<bitdepth>bit" plus "Squeeze" when responsive; needs only <string>.
+  std::string name = std::to_string(info.param.bitdepth) + "bit";
+  if (info.param.responsive) name += "Squeeze";
+  return name;
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RoundtripLossless, ModularTestParam,
+                                   testing::ValuesIn(GenerateLosslessTests()),
+                                   LosslessTestDescription);
+
+TEST_P(ModularTestParam, RoundtripLossless) {
+  RoundtripLosslessConfig config = GetParam();
+  int bitdepth = config.bitdepth;
+  int responsive = config.responsive;
+
+  ThreadPool* pool = nullptr;
+  Rng generator(123);  // fixed seed keeps the added noise reproducible
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io1;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io1, pool));
+
+  // vary the dimensions a bit, in case of bugs related to
+  // even vs odd width or height.
+  size_t xsize = 423 + bitdepth;
+  size_t ysize = 467 + bitdepth;
+
+  CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.SetUintSamples(bitdepth);
+
+  double factor = ((1lu << bitdepth) - 1lu);  // 2^bitdepth - 1
+  double ifactor = 1.0 / factor;
+  Image3F noise_added(xsize, ysize);
+
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize; y++) {
+      const float* in = io1.Main().color()->PlaneRow(c, y);
+      float* out = noise_added.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; x++) {
+        // make the least significant bits random
+        float f = in[x] + generator.UniformF(0.0f, 1.f / 255.f);
+        if (f > 1.f) f = 1.f;
+        // quantize to the bitdepth we're testing
+        unsigned int u = f * factor + 0.5;
+        out[x] = u * ifactor;
+      }
+    }
+  }
+  io.SetFromImage(std::move(noise_added), jxl::ColorEncoding::SRGB(false));
+
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+  cparams.options.predictor = {Predictor::Zero};
+  cparams.speed_tier = SpeedTier::kThunder;
+  cparams.responsive = responsive;
+  CodecInOut io2;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, bitdepth * xsize * ysize / 3);
+  EXPECT_LE(0, ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()));
+  size_t different = 0;  // samples whose requantized values disagree
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize; y++) {
+      const float* in = io.Main().color()->PlaneRow(c, y);
+      const float* out = io2.Main().color()->PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; x++) {
+        uint32_t uin = in[x] * factor + 0.5;
+        uint32_t uout = out[x] * factor + 0.5;
+        // check that the integer values are identical
+        if (uin != uout) different++;
+      }
+    }
+  }
+  EXPECT_EQ(different, 0);
+}
+
+TEST(ModularTest, RoundtripLosslessCustomFloat) {
+  CodecInOut io;
+  size_t xsize = 100, ysize = 300;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.bit_depth.bits_per_sample = 18;
+  io.metadata.m.bit_depth.exponent_bits_per_sample = 6;
+  io.metadata.m.bit_depth.floating_point_sample = true;  // 18-bit float
+  io.metadata.m.modular_16_bit_buffer_sufficient = false;
+  ColorEncoding color_encoding;
+  color_encoding.tf.SetTransferFunction(TransferFunction::kLinear);
+  color_encoding.SetColorSpace(ColorSpace::kRGB);
+  Image3F testimage(xsize, ysize);
+  float factor = 1.f / (1 << 14);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize; y++) {
+      float* const JXL_RESTRICT row = testimage.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; x++) {
+        row[x] = factor * (x ^ y);  // XOR pattern scaled by 2^-14
+      }
+    }
+  }
+  io.SetFromImage(std::move(testimage), color_encoding);
+  io.metadata.m.color_encoding = color_encoding;
+  io.metadata.m.SetIntensityTarget(255);
+
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+  cparams.options.predictor = {Predictor::Zero};
+  cparams.speed_tier = SpeedTier::kThunder;
+  cparams.decoding_speed_tier = 2;
+
+  CodecInOut io2;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 23000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
+}
+
+void WriteHeaders(BitWriter* writer, size_t xsize, size_t ysize) {
+  BitWriter::Allotment allotment(writer, 16);
+  writer->Write(8, 0xFF);
+  writer->Write(8, kCodestreamMarker);  // 0xFF + marker = the 16 allotted bits
+  allotment.ReclaimAndCharge(writer, 0, nullptr);
+  CodecMetadata metadata;
+  EXPECT_TRUE(metadata.size.Set(xsize, ysize));
+  EXPECT_TRUE(WriteSizeHeader(metadata.size, writer, 0, nullptr));
+  metadata.m.color_encoding = ColorEncoding::LinearSRGB(/*is_gray=*/true);
+  metadata.m.xyb_encoded = false;
+  metadata.m.SetUintSamples(31);  // 31-bit unsigned integer samples
+  EXPECT_TRUE(WriteImageMetadata(metadata.m, writer, 0, nullptr));
+  metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+  EXPECT_TRUE(Bundle::Write(metadata.transform_data, writer, 0, nullptr));
+  writer->ZeroPadToByte();
+  FrameHeader frame_header(&metadata);
+  frame_header.encoding = FrameEncoding::kModular;
+  frame_header.loop_filter.gab = false;
+  frame_header.loop_filter.epf_iters = 0;
+  EXPECT_TRUE(WriteFrameHeader(frame_header, writer, nullptr));
+}
+
+// Tree with single node, zero predictor, offset is 1 and multiplier is 1,
+// entropy code is prefix tree with alphabet size 256 and all bit lengths 8.
+void WriteHistograms(BitWriter* writer) {
+  writer->Write(1, 1);  // default DC quant
+  writer->Write(1, 1);  // has_tree
+  // tree histograms
+  writer->Write(1, 0);         // LZ77 disabled
+  writer->Write(3, 1);         // simple context map
+  writer->Write(1, 1);         // prefix code
+  writer->Write(7, 0x63);      // UintConfig(3, 2, 1)
+  writer->Write(12, 0xfef);    // alphabet_size = 256
+  writer->Write(32, 0x10003);  // all bit lengths 8
+  // tree tokens
+  writer->Write(8, 0);   // tree leaf
+  writer->Write(8, 0);   // zero predictor
+  writer->Write(8, 64);  // offset = UnpackSigned(ReverseBits(64)) = 1
+  writer->Write(16, 0);  // multiplier = 1
+  // histograms
+  writer->Write(1, 0);         // LZ77 disabled
+  writer->Write(1, 1);         // prefix code
+  writer->Write(7, 0x63);      // UintConfig(3, 2, 1)
+  writer->Write(12, 0xfef);    // alphabet_size = 256
+  writer->Write(32, 0x10003);  // all bit lengths 8
+}
+
+TEST(ModularTest, PredictorIntegerOverflow) {
+  const size_t xsize = 1;
+  const size_t ysize = 1;
+  BitWriter writer;
+  WriteHeaders(&writer, xsize, ysize);
+  std::vector<BitWriter> group_codes(1);
+  {
+    BitWriter* bw = &group_codes[0];
+    BitWriter::Allotment allotment(bw, 1 << 20);
+    WriteHistograms(bw);
+    GroupHeader header;
+    header.use_global_tree = true;
+    EXPECT_TRUE(Bundle::Write(header, bw, 0, nullptr));
+    // After UnpackSigned this becomes (1 << 31) - 1, the largest pixel_type,
+    // and after adding the offset we get -(1 << 31).
+    bw->Write(8, 119);
+    bw->Write(28, 0xfffffff);
+    bw->ZeroPadToByte();
+    allotment.ReclaimAndCharge(bw, 0, nullptr);
+  }
+  EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr));
+  writer.AppendByteAligned(group_codes);
+
+  PaddedBytes compressed = std::move(writer).TakeBytes();
+  extras::PackedPixelFile ppf;
+  extras::JXLDecompressParams params;
+  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0});
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params,
+                             nullptr, &ppf));
+  ASSERT_EQ(1, ppf.frames.size());
+  const auto& img = ppf.frames[0].color;
+  const auto pixels = reinterpret_cast<const float*>(img.pixels());
+  EXPECT_EQ(-1.0f, pixels[0]);  // the wrapped -(1 << 31), read back as float
+}
+
+TEST(ModularTest, UnsqueezeIntegerOverflow) {
+  // Image width is 9 so we can test both the SIMD and non-vector code paths.
+  const size_t xsize = 9;
+  const size_t ysize = 2;
+  BitWriter writer;
+  WriteHeaders(&writer, xsize, ysize);
+  std::vector<BitWriter> group_codes(1);
+  {
+    BitWriter* bw = &group_codes[0];
+    BitWriter::Allotment allotment(bw, 1 << 20);
+    WriteHistograms(bw);
+    GroupHeader header;
+    header.use_global_tree = true;
+    header.transforms.emplace_back();
+    header.transforms[0].id = TransformId::kSqueeze;
+    SqueezeParams params;
+    params.horizontal = false;
+    params.in_place = true;
+    params.begin_c = 0;
+    params.num_c = 1;
+    header.transforms[0].squeezes.emplace_back(params);
+    EXPECT_TRUE(Bundle::Write(header, bw, 0, nullptr));
+    for (size_t i = 0; i < xsize * ysize; ++i) {
+      // After UnpackSigned and adding offset, this becomes (1 << 31) - 1, both
+      // in the image and in the residual channels, and unsqueeze makes them
+      // ~(3 << 30) and (1 << 30) (in pixel_type_w) and the first wraps around
+      // to about -(1 << 30).
+      bw->Write(8, 119);
+      bw->Write(28, 0xffffffe);
+    }
+    bw->ZeroPadToByte();
+    allotment.ReclaimAndCharge(bw, 0, nullptr);
+  }
+  EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr));
+  writer.AppendByteAligned(group_codes);
+
+  PaddedBytes compressed = std::move(writer).TakeBytes();
+  extras::PackedPixelFile ppf;
+  extras::JXLDecompressParams params;
+  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0});
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params,
+                             nullptr, &ppf));
+  ASSERT_EQ(1, ppf.frames.size());
+  const auto& img = ppf.frames[0].color;
+  const auto pixels = reinterpret_cast<const float*>(img.pixels());
+  for (size_t x = 0; x < xsize; ++x) {
+    EXPECT_NEAR(-0.5f, pixels[x], 1e-10);  // row 0: wrapped, ~-(1 << 30)
+    EXPECT_NEAR(0.5f, pixels[xsize + x], 1e-10);  // row 1: (1 << 30)
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/noise.h b/third_party/jpeg-xl/lib/jxl/noise.h
new file mode 100644
index 0000000000..d897ea3abe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/noise.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_NOISE_H_
+#define LIB_JXL_NOISE_H_
+
+// Noise parameters shared by encoder/decoder.
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+const float kNoisePrecision = 1 << 10;
+
+struct NoiseParams {
+  // LUT index is an intensity of pixel / mean intensity of patch
+  static constexpr size_t kNumNoisePoints = 8;
+  float lut[kNumNoisePoints];
+
+  // Resets every LUT entry to zero.
+  void Clear() { std::fill(lut, lut + kNumNoisePoints, 0.f); }
+
+  // True if some LUT entry has a magnitude above 1e-3. NaN entries do not
+  // count, because the comparison is false for them.
+  bool HasAny() const {
+    const auto is_set = [](float v) { return std::abs(v) > 1e-3f; };
+    return std::any_of(lut, lut + kNumNoisePoints, is_set);
+  }
+};
+
+static inline std::pair<int, float> IndexAndFrac(float x) {
+  constexpr size_t kScaleNumerator = NoiseParams::kNumNoisePoints - 2;
+  // TODO: instead of 1, this should be a proper Y range.
+  constexpr float kScale = kScaleNumerator / 1;
+  const float scaled_x = std::max(0.f, x * kScale);
+  // Past the last interval: pin to index kScaleNumerator with fraction 1.
+  if (JXL_UNLIKELY(scaled_x >= kScaleNumerator + 1)) {
+    return {static_cast<int>(kScaleNumerator), 1.f};
+  }
+  float whole;
+  const float frac = std::modf(scaled_x, &whole);
+  return {static_cast<int>(whole), frac};
+}
+
+struct NoiseLevel {
+  float noise_level;
+  float intensity;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
new file mode 100644
index 0000000000..07fd824f14
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
@@ -0,0 +1,123 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/matrix_ops.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+namespace {
+
+// Converts one linear sRGB color to XYB through the same image-level
+// conversion that jpeg xl applies to whole images.
+void LinearSrgbToOpsin(float rgb_r, float rgb_g, float rgb_b,
+                       float* JXL_RESTRICT xyb_x, float* JXL_RESTRICT xyb_y,
+                       float* JXL_RESTRICT xyb_b) {
+  // A 1x1 float32 image, tagged as linear sRGB, carries the color.
+  const float rgb[3] = {rgb_r, rgb_g, rgb_b};
+  Image3F linear(1, 1);
+  for (size_t c = 0; c < 3; c++) linear.PlaneRow(c, 0)[0] = rgb[c];
+  ImageMetadata metadata;
+  metadata.SetFloat32Samples();
+  metadata.color_encoding = ColorEncoding::LinearSRGB();
+  ImageBundle ib(&metadata);
+  ib.SetFromImage(std::move(linear), metadata.color_encoding);
+
+  Image3F opsin(1, 1);
+  // The value returned by ToXYB is deliberately discarded.
+  (void)ToXYB(ib, /*pool=*/nullptr, &opsin, GetJxlCms());
+
+  float* const outputs[3] = {xyb_x, xyb_y, xyb_b};
+  for (size_t c = 0; c < 3; c++) *outputs[c] = opsin.PlaneRow(c, 0)[0];
+}
+
+// Converts one XYB color to linear sRGB through the same image-level
+// conversion that jpeg xl applies to whole images.
+void OpsinToLinearSrgb(float xyb_x, float xyb_y, float xyb_b,
+                       float* JXL_RESTRICT rgb_r, float* JXL_RESTRICT rgb_g,
+                       float* JXL_RESTRICT rgb_b) {
+  const float xyb[3] = {xyb_x, xyb_y, xyb_b};
+  Image3F opsin(1, 1);
+  for (size_t c = 0; c < 3; c++) opsin.PlaneRow(c, 0)[0] = xyb[c];
+
+  OpsinParams opsin_params;
+  opsin_params.Init(/*intensity_target=*/255.0f);
+  Image3F linear(1, 1);
+  OpsinToLinear(opsin, Rect(opsin), nullptr, &linear, opsin_params);
+
+  float* const outputs[3] = {rgb_r, rgb_g, rgb_b};
+  for (size_t c = 0; c < 3; c++) *outputs[c] = linear.PlaneRow(c, 0)[0];
+}
+
+void OpsinRoundtripTestRGB(float r, float g, float b) {
+  // RGB -> XYB -> RGB must reproduce every channel to within 1e-3.
+  float xyb[3], r2, g2, b2;
+  LinearSrgbToOpsin(r, g, b, &xyb[0], &xyb[1], &xyb[2]);
+  OpsinToLinearSrgb(xyb[0], xyb[1], xyb[2], &r2, &g2, &b2);
+  EXPECT_NEAR(r, r2, 1e-3);
+  EXPECT_NEAR(g, g2, 1e-3);
+  EXPECT_NEAR(b, b2, 1e-3);
+}
+
+TEST(OpsinImageTest, VerifyOpsinAbsorbanceInverseMatrix) {
+  // Inverting the inverse matrix in place must give back the forward matrix.
+  const float* const inverse = GetOpsinAbsorbanceInverseMatrix();
+  float matrix[9];
+  for (size_t i = 0; i < 9; ++i) matrix[i] = inverse[i];
+  EXPECT_TRUE(Inv3x3Matrix(matrix));
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_NEAR(matrix[i], kOpsinAbsorbanceMatrix[i], 1e-6);
+  }
+}
+
+TEST(OpsinImageTest, OpsinRoundtrip) {
+  OpsinRoundtripTestRGB(0, 0, 0);  // grays, from black up to white
+  OpsinRoundtripTestRGB(1. / 255, 1. / 255, 1. / 255);
+  OpsinRoundtripTestRGB(128. / 255, 128. / 255, 128. / 255);
+  OpsinRoundtripTestRGB(1, 1, 1);
+  // Blue channel only.
+  OpsinRoundtripTestRGB(0, 0, 1. / 255);
+  OpsinRoundtripTestRGB(0, 0, 128. / 255);
+  OpsinRoundtripTestRGB(0, 0, 1);
+  // Green channel only.
+  OpsinRoundtripTestRGB(0, 1. / 255, 0);
+  OpsinRoundtripTestRGB(0, 128. / 255, 0);
+  OpsinRoundtripTestRGB(0, 1, 0);
+  // Red channel only.
+  OpsinRoundtripTestRGB(1. / 255, 0, 0);
+  OpsinRoundtripTestRGB(128. / 255, 0, 0);
+  OpsinRoundtripTestRGB(1, 0, 0);
+}
+
+TEST(OpsinImageTest, VerifyZero) {
+  // Black (zero energy) must map to 0,0,0 in xyb; x has the tightest bound.
+  float x, y, b;
+  LinearSrgbToOpsin(0, 0, 0, &x, &y, &b);
+  EXPECT_NEAR(0, x, 1e-9);
+  EXPECT_NEAR(0, y, 1e-7);
+  EXPECT_NEAR(0, b, 1e-7);
+}
+
+TEST(OpsinImageTest, VerifyGray) {
+  // Grays 1/255..254/255 must have x == 0 and b / y equal to kYToBRatio.
+  for (size_t i = 1; i < 255; i++) {
+    float x, y, b;
+    LinearSrgbToOpsin(i / 255., i / 255., i / 255., &x, &y, &b);
+    EXPECT_NEAR(0, x, 1e-6);
+    EXPECT_NEAR(kYToBRatio, b / y, 3e-5);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
new file mode 100644
index 0000000000..088253c2ce
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
@@ -0,0 +1,57 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(OpsinInverseTest, LinearInverseInverts) {
+  Image3F linear(128, 128);
+  RandomFillImage(&linear, 0.0f, 1.0f);
+  // Forward: a copy of the random image is converted; `linear` is kept.
+  CodecInOut io;
+  io.metadata.m.SetFloat32Samples();
+  io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
+  io.SetFromImage(CopyImage(linear), io.metadata.m.color_encoding);
+  ThreadPool* null_pool = nullptr;
+  Image3F opsin(io.xsize(), io.ysize());
+  (void)ToXYB(io.Main(), null_pool, &opsin, GetJxlCms());
+  // Inverse: `opsin` is converted back to linear in place.
+  OpsinParams opsin_params;
+  opsin_params.Init(/*intensity_target=*/255.0f);
+  OpsinToLinearInplace(&opsin, /*pool=*/nullptr, opsin_params);
+
+  JXL_ASSERT_OK(VerifyRelativeError(linear, opsin, 3E-3, 2E-4, _));
+}
+
+TEST(OpsinInverseTest, YcbCrInverts) {
+  Image3F rgb(128, 128);
+  RandomFillImage(&rgb, 0.0f, 1.0f);
+  // Forward conversion; note the output planes are passed in order 1, 0, 2.
+  ThreadPool* null_pool = nullptr;
+  Image3F ycbcr(rgb.xsize(), rgb.ysize());
+  EXPECT_TRUE(RgbToYcbcr(rgb.Plane(0), rgb.Plane(1), rgb.Plane(2),
+                         &ycbcr.Plane(1), &ycbcr.Plane(0), &ycbcr.Plane(2),
+                         null_pool));
+  // Inverse conversion into a fresh image, compared against the input.
+  Image3F rgb2(rgb.xsize(), rgb.ysize());
+  YcbcrToRgb(ycbcr, &rgb2, Rect(rgb));
+
+  JXL_ASSERT_OK(VerifyRelativeError(rgb, rgb2, 4E-5, 4E-7, _));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_params.cc b/third_party/jpeg-xl/lib/jxl/opsin_params.cc
new file mode 100644
index 0000000000..ec3db4ee76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_params.cc
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/opsin_params.h"
+
+#include <stdlib.h>
+
+#include "lib/jxl/matrix_ops.h"
+
+namespace jxl {
+
+#define INVERSE_OPSIN_FROM_SPEC 1
+
+const float* GetOpsinAbsorbanceInverseMatrix() {
+#if INVERSE_OPSIN_FROM_SPEC
+  return DefaultInverseOpsinAbsorbanceMatrix();  // active: macro is 1 above
+#else   // INVERSE_OPSIN_FROM_SPEC
+  // Compute the inverse opsin matrix from the forward matrix. Less precise
+  // than taking the values from the specification, but must be used if the
+  // forward transform is changed and the spec will require updating.
+  static const float* const kInverse = [] {  // runs once, on first call
+    static float inverse[9];
+    for (int i = 0; i < 9; i++) {
+      inverse[i] = kOpsinAbsorbanceMatrix[i];
+    }
+    Inv3x3Matrix(inverse);  // NOTE(review): the result is not checked here
+    return inverse;
+  }();
+  return kInverse;
+#endif  // INVERSE_OPSIN_FROM_SPEC
+}
+
+void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse,
+                           float* JXL_RESTRICT simd_inverse,
+                           float intensity_target) {
+  // Output layout: 9 coefficients, each rescaled by 255 / intensity_target
+  // and stored four times in a row (36 floats in total).
+  const float scale = 255.0f / intensity_target;
+  for (size_t j = 0; j < 36; ++j) simd_inverse[j] = inverse[j / 4] * scale;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_params.h b/third_party/jpeg-xl/lib/jxl/opsin_params.h
new file mode 100644
index 0000000000..3a7da97d8a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_params.h
@@ -0,0 +1,86 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_OPSIN_PARAMS_H_
+#define LIB_JXL_OPSIN_PARAMS_H_
+
+// Constants that define the XYB color space.
+
+#include <stdlib.h>
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Parameters for opsin absorbance: kMrc is row r, column c; rows sum to 1.
+static const float kM02 = 0.078f;
+static const float kM00 = 0.30f;
+static const float kM01 = 1.0f - kM02 - kM00;
+
+static const float kM12 = 0.078f;
+static const float kM10 = 0.23f;
+static const float kM11 = 1.0f - kM12 - kM10;
+
+static const float kM20 = 0.24342268924547819f;
+static const float kM21 = 0.20476744424496821f;
+static const float kM22 = 1.0f - kM20 - kM21;
+
+static const float kBScale = 1.0f;
+static const float kYToBRatio = 1.0f;  // works better with 0.50017729543783418
+static const float kBToYRatio = 1.0f / kYToBRatio;
+
+static const float kB0 = 0.0037930732552754493f;
+static const float kB1 = kB0;  // all three biases share one value
+static const float kB2 = kB0;
+
+// Opsin absorbance matrix (3x3, row-major) is now frozen.
+static const float kOpsinAbsorbanceMatrix[9] = {
+    kM00, kM01, kM02, kM10, kM11, kM12, kM20, kM21, kM22,
+};
+
+// Read-only table: inverse of kOpsinAbsorbanceMatrix, must match the spec.
+static inline const float* DefaultInverseOpsinAbsorbanceMatrix() {
+  static const float kDefaultInverseOpsinAbsorbanceMatrix[9] = {
+      11.031566901960783f,  -9.866943921568629f, -0.16462299647058826f,
+      -3.254147380392157f,  4.418770392156863f,  -0.16462299647058826f,
+      -3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f};
+  return kDefaultInverseOpsinAbsorbanceMatrix;
+}
+
+// Returns 3x3 row-major matrix inverse of kOpsinAbsorbanceMatrix.
+// opsin_image_test verifies this is actually the inverse.
+const float* GetOpsinAbsorbanceInverseMatrix();
+
+void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse,
+                           float* JXL_RESTRICT simd_inverse,
+                           float intensity_target);
+
+static const float kOpsinAbsorbanceBias[3] = {
+    kB0,
+    kB1,
+    kB2,
+};
+// Negated biases, followed by a fourth entry fixed at 1.0f.
+static const float kNegOpsinAbsorbanceBiasRGB[4] = {
+    -kOpsinAbsorbanceBias[0], -kOpsinAbsorbanceBias[1],
+    -kOpsinAbsorbanceBias[2], 1.0f};
+
+static const float kScaledXYBOffset[3] = {
+    0.015386134f,
+    0.0f,
+    0.27770459f,
+};
+
+static const float kScaledXYBScale[3] = {
+    22.995788804f,
+    1.183000077f,
+    1.502141333f,
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_OPSIN_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc b/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc
new file mode 100644
index 0000000000..9ca7a22423
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc
@@ -0,0 +1,126 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/padded_bytes.h"
+
+#include <numeric>  // iota
+#include <vector>
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(PaddedBytesTest, TestNonEmptyFirstByteZero) {
+  PaddedBytes pb(1);  // Constructed with size 1.
+  EXPECT_EQ(0, pb[0]);
+  // The first byte must still read as zero after growing via resize()...
+  pb.resize(20);
+  EXPECT_EQ(0, pb[0]);
+  // ...and after a subsequent reserve(200).
+  pb.reserve(200);
+  EXPECT_EQ(0, pb[0]);
+}
+
+TEST(PaddedBytesTest, TestEmptyFirstByteZero) {
+  PaddedBytes pb(0);  // Constructed with size 0.
+  // Growing from empty: byte 0 must be zero even though nothing was copied.
+  pb.resize(20);
+  EXPECT_EQ(0, pb[0]);
+}
+
+TEST(PaddedBytesTest, TestFillWithoutReserve) {
+  PaddedBytes pb;  // No reserve() before the push_back() loop.
+  for (size_t i = 0; i < 170u; ++i) {
+    pb.push_back(i);
+  }
+  EXPECT_EQ(170u, pb.size());
+  EXPECT_GE(pb.capacity(), 170u);  // Only a lower bound is required here.
+}
+
+TEST(PaddedBytesTest, TestFillWithExactReserve) {
+  PaddedBytes pb;
+  pb.reserve(170);  // Exactly as many bytes as are pushed below.
+  for (size_t i = 0; i < 170u; ++i) {
+    pb.push_back(i);
+  }
+  EXPECT_EQ(170u, pb.size());
+  EXPECT_EQ(pb.capacity(), 170u);  // Capacity must equal the reservation.
+}
+
+TEST(PaddedBytesTest, TestFillWithMoreReserve) {
+  PaddedBytes pb;
+  pb.reserve(171);  // One byte more than is pushed below.
+  for (size_t i = 0; i < 170u; ++i) {
+    pb.push_back(i);
+  }
+  EXPECT_EQ(170u, pb.size());
+  EXPECT_GT(pb.capacity(), 170u);  // Strictly larger than the final size.
+}
+
+// assign() may be given a source range inside pb's own valid data.
+TEST(PaddedBytesTest, TestAssignFromWithin) {
+  PaddedBytes pb;
+  pb.reserve(256);
+  for (size_t i = 0; i < 256; ++i) {
+    pb.push_back(i);
+  }
+  pb.assign(pb.data() + 64, pb.data() + 192);  // Bytes [64, 192) of itself.
+  EXPECT_EQ(128u, pb.size());
+  for (size_t i = 0; i < 128; ++i) {
+    EXPECT_EQ(i + 64, pb[i]);
+  }
+}
+
+// assign() accepts a range spanning valid bytes and capacity beyond size().
+TEST(PaddedBytesTest, TestAssignReclaim) {
+  PaddedBytes pb;
+  pb.reserve(256);
+  for (size_t i = 0; i < 256; ++i) {
+    pb.push_back(i);
+  }
+
+  const uint8_t* mem = pb.data();
+  pb.resize(200);
+  // Shrinking must keep the same allocation and the same capacity.
+  EXPECT_EQ(mem, pb.data());
+  EXPECT_EQ(256u, pb.capacity());
+
+  // Source range [100, 240) extends 40 bytes past the current size of 200.
+  pb.assign(pb.data() + 100, pb.data() + 240);
+  EXPECT_EQ(140u, pb.size());
+
+  for (size_t i = 0; i < 140; ++i) {
+    EXPECT_EQ(i + 100, pb[i]);
+  }
+}
+
+// assign() from external buffers, one smaller and one larger than pb.
+TEST(PaddedBytesTest, TestAssignOutside) {
+  PaddedBytes pb;
+  pb.resize(400);
+  std::iota(pb.begin(), pb.end(), 1);
+
+  std::vector<uint8_t> small(64);
+  std::iota(small.begin(), small.end(), 500);  // Stored values wrap mod 256.
+
+  pb.assign(small.data(), small.data() + small.size());
+  EXPECT_EQ(64u, pb.size());
+  for (size_t i = 0; i < 64; ++i) {
+    EXPECT_EQ((i + 500) & 0xFF, pb[i]);
+  }
+
+  std::vector<uint8_t> large(1000);
+  std::iota(large.begin(), large.end(), 600);  // Stored values wrap mod 256.
+
+  pb.assign(large.data(), large.data() + large.size());
+  EXPECT_EQ(1000u, pb.size());
+  for (size_t i = 0; i < 1000; ++i) {
+    EXPECT_EQ((i + 600) & 0xFF, pb[i]);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/passes_state.cc b/third_party/jpeg-xl/lib/jxl/passes_state.cc
new file mode 100644
index 0000000000..2f287ec9b6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/passes_state.cc
@@ -0,0 +1,70 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/passes_state.h"
+
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+Status InitializePassesSharedState(const FrameHeader& frame_header,
+                                   PassesSharedState* JXL_RESTRICT shared,
+                                   bool encoder) {
+  JXL_ASSERT(frame_header.nonserialized_metadata != nullptr);
+  shared->frame_header = frame_header;
+  shared->metadata = frame_header.nonserialized_metadata;
+  shared->frame_dim = frame_header.ToFrameDimensions();
+  shared->image_features.patches.SetPassesSharedState(shared);
+
+  const FrameDimensions& frame_dim = shared->frame_dim;
+  // (Re)create the per-block images at the block resolution of this frame.
+  shared->ac_strategy =
+      AcStrategyImage(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  shared->raw_quant_field =
+      ImageI(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  shared->epf_sharpness =
+      ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  shared->cmap = ColorCorrelationMap(frame_dim.xsize, frame_dim.ysize);
+
+  // Only the encoder sizes coeff_orders here (VarDCT frames only); the decoder
+  // allocates them later, once it knows how many it will actually need.
+  shared->coeff_order_size = kCoeffOrderMaxSize;
+  if (encoder &&
+      shared->coeff_orders.size() <
+          frame_header.passes.num_passes * kCoeffOrderMaxSize &&
+      frame_header.encoding == FrameEncoding::kVarDCT) {
+    shared->coeff_orders.resize(frame_header.passes.num_passes *
+                                kCoeffOrderMaxSize);
+  }
+
+  shared->quant_dc = ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  // Decoder with kUseDcFrame: DC comes from a previously decoded DC frame.
+  bool use_dc_frame = !!(frame_header.flags & FrameHeader::kUseDcFrame);
+  if (!encoder && use_dc_frame) {
+    if (frame_header.dc_level >= 4) {  // dc_frames has only 4 slots (0..3).
+      return JXL_FAILURE("Invalid DC level for kUseDcFrame: %u",
+                         frame_header.dc_level);
+    }
+    shared->dc_storage = Image3F();  // Own DC storage is left empty.
+    shared->dc = &shared->dc_frames[frame_header.dc_level];
+    if (shared->dc->xsize() == 0) {  // Slot was never filled.
+      return JXL_FAILURE(
+          "kUseDcFrame specified for dc_level %u, but no frame was decoded "
+          "with level %u",
+          frame_header.dc_level, frame_header.dc_level + 1);
+    }
+    ZeroFillImage(&shared->quant_dc);
+  } else {
+    shared->dc_storage =
+        Image3F(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+    shared->dc = &shared->dc_storage;  // DC lives in this state's own storage.
+  }
+
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/passes_state.h b/third_party/jpeg-xl/lib/jxl/passes_state.h
new file mode 100644
index 0000000000..8d648a8feb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/passes_state.h
@@ -0,0 +1,133 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PASSES_STATE_H_
+#define LIB_JXL_PASSES_STATE_H_
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/noise.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/splines.h"
+
+// Structures that hold the (en/de)coder state for a JPEG XL kVarDCT
+// (en/de)coder.
+
+namespace jxl {
+
+struct ImageFeatures {  // Groups noise parameters, patches and splines.
+  NoiseParams noise_params;
+  PatchDictionary patches;
+  Splines splines;
+};
+
+// State common to both encoder and decoder.
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct PassesSharedState {
+  PassesSharedState() : frame_header(nullptr) {}
+
+  // Headers and metadata. `metadata` stays null until it is assigned.
+  const CodecMetadata* metadata = nullptr;
+  FrameHeader frame_header;
+
+  FrameDimensions frame_dim;
+
+  // Control fields and parameters.
+  AcStrategyImage ac_strategy;
+
+  // Dequant matrices + quantizer.
+  DequantMatrices matrices;
+  Quantizer quantizer{&matrices};
+  ImageI raw_quant_field;
+
+  // Per-block side information for EPF detail preservation.
+  ImageB epf_sharpness;
+
+  ColorCorrelationMap cmap;
+
+  ImageFeatures image_features;
+
+  // Memory area for storing coefficient orders.
+  // `coeff_order_size` is the size used by *one* set of coefficient orders (at
+  // most kCoeffOrderMaxSize). A set of coefficient orders is present for each
+  // pass.
+  size_t coeff_order_size = 0;
+  std::vector<coeff_order_t> coeff_orders;
+
+  // Decoder-side DC and quantized DC.
+  ImageB quant_dc;
+  Image3F dc_storage;
+  const Image3F* JXL_RESTRICT dc = &dc_storage;  // Defaults to own storage.
+
+  BlockCtxMap block_ctx_map;
+
+  Image3F dc_frames[4];  // Indexed by frame_header.dc_level (0..3).
+
+  struct {
+    ImageBundle frame;
+    // ImageBundle doesn't yet have a simple way to state it is in XYB.
+    bool ib_is_in_xyb = false;
+  } reference_frames[4] = {};
+
+  // Number of pre-clustered set of histograms (with the same ctx map), per
+  // pass. Encoded as num_histograms_ - 1.
+  size_t num_histograms = 0;
+  // Requires `metadata` to be non-null.
+  bool IsGrayscale() const { return metadata->m.color_encoding.IsGray(); }
+
+  Rect GroupRect(size_t group_index) const {  // Limits: xsize x ysize.
+    const size_t gx = group_index % frame_dim.xsize_groups;
+    const size_t gy = group_index / frame_dim.xsize_groups;
+    const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
+                    frame_dim.group_dim, frame_dim.group_dim, frame_dim.xsize,
+                    frame_dim.ysize);
+    return rect;
+  }
+
+  Rect PaddedGroupRect(size_t group_index) const {  // Limits: padded size.
+    const size_t gx = group_index % frame_dim.xsize_groups;
+    const size_t gy = group_index / frame_dim.xsize_groups;
+    const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
+                    frame_dim.group_dim, frame_dim.group_dim,
+                    frame_dim.xsize_padded, frame_dim.ysize_padded);
+    return rect;
+  }
+
+  Rect BlockGroupRect(size_t group_index) const {  // Units: group_dim >> 3.
+    const size_t gx = group_index % frame_dim.xsize_groups;
+    const size_t gy = group_index / frame_dim.xsize_groups;
+    const Rect rect(gx * (frame_dim.group_dim >> 3),
+                    gy * (frame_dim.group_dim >> 3), frame_dim.group_dim >> 3,
+                    frame_dim.group_dim >> 3, frame_dim.xsize_blocks,
+                    frame_dim.ysize_blocks);
+    return rect;
+  }
+
+  Rect DCGroupRect(size_t group_index) const {  // Uses the DC group grid.
+    const size_t gx = group_index % frame_dim.xsize_dc_groups;
+    const size_t gy = group_index / frame_dim.xsize_dc_groups;
+    const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
+                    frame_dim.group_dim, frame_dim.group_dim,
+                    frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+    return rect;
+  }
+};
+
+// Initializes the state information that is shared between encoder and decoder.
+Status InitializePassesSharedState(const FrameHeader& frame_header,
+                                   PassesSharedState* JXL_RESTRICT shared,
+                                   bool encoder = false);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PASSES_STATE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/passes_test.cc b/third_party/jpeg-xl/lib/jxl/passes_test.cc
new file mode 100644
index 0000000000..97d776f941
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/passes_test.cc
@@ -0,0 +1,402 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <future>
+#include <string>
+#include <utility>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+using test::Roundtrip;
+using test::ThreadPoolForTests;
+
+namespace {
+
+TEST(PassesTest, RoundtripSmallPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);  // 1/8 of each dimension.
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 1.0;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      IsSlightlyBelow(1.1));  // Target distance above is 1.0.
+}
+
+TEST(PassesTest, RoundtripUnalignedPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 12, io.ysize() / 7);  // Different divisor per axis.
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 2.0;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      IsSlightlyBelow(1.72));  // Target distance above is 2.0.
+}
+
+TEST(PassesTest, RoundtripMultiGroupPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  {  // The decoding pool is scoped to this block.
+    ThreadPoolForTests pool(4);
+    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  }
+  io.ShrinkTo(600, 1024);  // partial X, full Y group
+
+  auto test = [&](float target_distance, float threshold) {
+    ThreadPoolForTests pool(4);  // Each invocation creates its own pool.
+    CompressParams cparams;
+    cparams.butteraugli_distance = target_distance;
+    cparams.progressive_mode = true;
+    CodecInOut io2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _,
+                            /* compressed_size */ nullptr, &pool));
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr, &pool),
+                IsSlightlyBelow(target_distance + threshold));
+  };  // NOTE(review): `io` is captured by reference and shared by both tasks.
+
+  auto run1 = std::async(std::launch::async, test, 1.0f, 0.5f);
+  auto run2 = std::async(std::launch::async, test, 2.0f, 0.5f);
+}  // The std::async futures block in their destructors until the tasks end.
+
+TEST(PassesTest, RoundtripLargeFastPasses) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;  // Only the success of the roundtrip is checked.
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _,
+                          /* compressed_size */ nullptr, &pool));
+}
+
+// Checks for differing size/distance in two consecutive runs of distance 2,
+// which involves additional processing including adaptive reconstruction.
+// Failing this may be a sign of race conditions or invalid memory accesses.
+TEST(PassesTest, RoundtripProgressiveConsistent) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = true;
+  cparams.butteraugli_distance = 2.0;
+
+  // Try each xsize mod kBlockDim to verify right border handling (48 .. 41).
+  for (size_t xsize = 48; xsize > 40; --xsize) {
+    io.ShrinkTo(xsize, 15);
+
+    CodecInOut io2;
+    size_t size2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &size2, &pool));
+
+    CodecInOut io3;
+    size_t size3;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io3, _, &size3, &pool));
+
+    // Both runs must produce exactly the same compressed size.
+    EXPECT_EQ(size2, size3);
+
+    // Both runs must be at exactly the same distance from the input.
+    const float dist2 = ButteraugliDistance(io.frames, io2.frames,
+                                            cparams.ba_params, GetJxlCms(),
+                                            /*distmap=*/nullptr, &pool);
+    const float dist3 = ButteraugliDistance(io.frames, io3.frames,
+                                            cparams.ba_params, GetJxlCms(),
+                                            /*distmap=*/nullptr, &pool);
+    EXPECT_EQ(dist2, dist3);
+  }
+}
+
+TEST(PassesTest, AllDownsampleFeasible) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 240000u);  // Upper bound on the output size.
+  float target_butteraugli[9] = {};  // Indexed by downsampling factor.
+  target_butteraugli[1] = 2.5f;
+  target_butteraugli[2] = 16.0f;
+  target_butteraugli[4] = 20.0f;
+  target_butteraugli[8] = 80.0f;
+
+  // The default progressive encoding scheme should make all these downsampling
+  // factors achievable.
+  // TODO(veluca): re-enable downsampling 16.
+  std::vector<size_t> downsamplings = {1, 2, 4, 8};  //, 16};
+
+  auto check = [&](const uint32_t task, size_t /* thread */) -> void {
+    const size_t downsampling = downsamplings[task];
+    extras::JXLDecompressParams dparams;
+    dparams.max_downsampling = downsampling;
+    CodecInOut output;
+    ASSERT_TRUE(
+        test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+    EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling;
+    EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling;
+    EXPECT_LE(ButteraugliDistance(io.frames, output.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr, nullptr),
+              target_butteraugli[downsampling])
+        << "downsampling: " << downsampling;
+  };
+  EXPECT_TRUE(RunOnPool(&pool, 0, downsamplings.size(), ThreadPool::NoInit,
+                        check, "TestDownsampling"));  // One task per factor.
+}
+
+TEST(PassesTest, AllDownsampleFeasibleQProgressive) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.qprogressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 220000u);  // Upper bound on the output size.
+
+  float target_butteraugli[9] = {};  // Indexed by downsampling factor.
+  target_butteraugli[1] = 3.0f;
+  target_butteraugli[2] = 6.0f;
+  target_butteraugli[4] = 10.0f;
+  target_butteraugli[8] = 80.0f;
+
+  // The default progressive encoding scheme should make all these downsampling
+  // factors achievable.
+  std::vector<size_t> downsamplings = {1, 2, 4, 8};
+
+  auto check = [&](const uint32_t task, size_t /* thread */) -> void {
+    const size_t downsampling = downsamplings[task];
+    extras::JXLDecompressParams dparams;
+    dparams.max_downsampling = downsampling;
+    CodecInOut output;
+    ASSERT_TRUE(
+        test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+    EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling;
+    EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling;
+    EXPECT_LE(ButteraugliDistance(io.frames, output.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              target_butteraugli[downsampling])
+        << "downsampling: " << downsampling;
+  };
+  EXPECT_TRUE(RunOnPool(&pool, 0, downsamplings.size(), ThreadPool::NoInit,
+                        check, "TestQProgressive"));  // One task per factor.
+}
+
+TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
+  CodecInOut io_orig;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_orig, &pool));
+  Rect rect(0, 0, io_orig.xsize(), 128);  // Top 128 rows of the source.
+  // need 2 DC groups for the DC frame to actually be progressive.
+  Image3F large(4242, rect.ysize());
+  ZeroFillImage(&large);
+  CopyImageTo(rect, *io_orig.Main().color(), rect, &large);
+  CodecInOut io;
+  io.metadata = io_orig.metadata;
+  io.SetFromImage(std::move(large), io_orig.Main().c_current());
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_dc = 1;
+  cparams.responsive = true;
+  cparams.qprogressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 10000u);  // Upper bound on the output size.
+
+  extras::JXLDecompressParams dparams;
+  dparams.max_downsampling = 1;  // Reference decode.
+  CodecInOut output;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+
+  dparams.max_downsampling = 2;  // Decode under test.
+  CodecInOut output_d2;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output_d2));
+
+  // 0 if reading all the passes, ~15 if skipping the 8x pass.
+  float butteraugli_distance_down2_full = ButteraugliDistance(
+      output.frames, output_d2.frames, cparams.ba_params, GetJxlCms(),
+      /*distmap=*/nullptr);
+
+  EXPECT_LE(butteraugli_distance_down2_full, 3.2f);  // Degrades only mildly...
+  EXPECT_GE(butteraugli_distance_down2_full, 1.0f);  // ...but must differ.
+}
+
+TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io_orig;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_orig, &pool));
+  Rect rect(0, 0, io_orig.xsize(), 128);  // Top 128 rows of the source.
+  // need 2 DC groups for the DC frame to actually be progressive.
+  Image3F large(4242, rect.ysize());
+  ZeroFillImage(&large);
+  CopyImageTo(rect, *io_orig.Main().color(), rect, &large);
+  CodecInOut io;
+  io.SetFromImage(std::move(large), io_orig.Main().c_current());
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_dc = 1;
+  cparams.responsive = true;
+  cparams.qprogressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 220000u);  // Upper bound on the output size.
+
+  extras::JXLDecompressParams dparams;
+  dparams.max_downsampling = 1;  // Reference decode.
+  CodecInOut output;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+
+  dparams.max_downsampling = 2;  // Decode under test.
+  CodecInOut output_d2;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output_d2));
+
+  // 0 if reading all the passes, ~15 if skipping the 8x pass.
+  float butteraugli_distance_down2_full = ButteraugliDistance(
+      output.frames, output_d2.frames, cparams.ba_params, GetJxlCms(),
+      /*distmap=*/nullptr);
+
+  EXPECT_LE(butteraugli_distance_down2_full, 3.0f);  // Degrades only mildly...
+  EXPECT_GE(butteraugli_distance_down2_full, 1.0f);  // ...but must differ.
+}
+
+TEST(PassesTest, NonProgressiveDCImage) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = false;
+  cparams.butteraugli_distance = 2.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  // Even in non-progressive mode, it should be possible to return a DC-only
+  // image; a max_downsampling of 100 is used to request it.
+  extras::JXLDecompressParams dparams;
+  dparams.max_downsampling = 100;
+  CodecInOut output;
+  ASSERT_TRUE(test::DecodeFile(dparams, Span<const uint8_t>(compressed),
+                               &output, &pool));
+  EXPECT_EQ(output.xsize(), io.xsize());  // Output keeps the input dimensions.
+  EXPECT_EQ(output.ysize(), io.ysize());
+}
+
+TEST(PassesTest, RoundtripSmallNoGaborishPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);  // 1/8 of each dimension.
+
+  CompressParams cparams;
+  cparams.gaborish = Override::kOff;  // Gaborish explicitly disabled.
+  cparams.butteraugli_distance = 1.0;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      IsSlightlyBelow(1.2));  // Target distance above is 1.0.
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h b/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h
new file mode 100644
index 0000000000..e4172f6db6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_
+#define LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_
+
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/passes_state.h"  // for PassesSharedState
+
+namespace jxl {
+
+// Context numbers as specified in Section C.4.5, Listing C.2:
+enum Contexts {
+  kNumRefPatchContext = 0,
+  kReferenceFrameContext = 1,
+  kPatchSizeContext = 2,
+  kPatchReferencePositionContext = 3,
+  kPatchPositionContext = 4,
+  kPatchBlendModeContext = 5,
+  kPatchOffsetContext = 6,
+  kPatchCountContext = 7,
+  kPatchAlphaChannelContext = 8,
+  kPatchClampContext = 9,
+  kNumPatchDictionaryContexts  // Implicitly 10: the number of contexts above.
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc b/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc
new file mode 100644
index 0000000000..5cc6c13a9e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc
@@ -0,0 +1,58 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::jxl::test::Roundtrip;
+
+TEST(PatchDictionaryTest, GrayscaleModular) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/grayscale_patches.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  CompressParams cparams;
+  cparams.SetLossless();
+  cparams.patches = jxl::Override::kOn;  // Patches forced on.
+
+  CodecInOut io2;
+  // Size budget is checked below; without patches the output is ~25k.
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 8000u);
+  JXL_ASSERT_OK(VerifyRelativeError(*io.Main().color(), *io2.Main().color(),
+                                    1e-7f, 0, _));
+}
+
+TEST(PatchDictionaryTest, GrayscaleVarDCT) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/grayscale_patches.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  CompressParams cparams;
+  cparams.patches = jxl::Override::kOn;  // Patches forced on.
+
+  CodecInOut io2;
+  // Size budget is checked below; without patches the output is ~47k.
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 14000u);
+  // Distance budget is checked below; without patches the distance is ~1.2.
+  EXPECT_LE(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      1.1);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/preview_test.cc b/third_party/jpeg-xl/lib/jxl/preview_test.cc
new file mode 100644
index 0000000000..9d4603ca70
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/preview_test.cc
@@ -0,0 +1,68 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <string>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+using test::Roundtrip;
+
+TEST(PreviewTest, RoundtripGivenPreview) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
+  // The preview starts as a copy of the main frame and is then shrunk further.
+  io.preview_frame = io.Main().Copy();
+  const size_t preview_xsize = 15;
+  const size_t preview_ysize = 27;
+  io.preview_frame.ShrinkTo(preview_xsize, preview_ysize);
+  io.metadata.m.have_preview = true;
+  ASSERT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(),
+                                             io.preview_frame.ysize()));
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 2.0;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_EQ(preview_xsize, io2.metadata.m.preview_size.xsize());
+  EXPECT_EQ(preview_ysize, io2.metadata.m.preview_size.ysize());
+  EXPECT_EQ(preview_xsize, io2.preview_frame.xsize());
+  EXPECT_EQ(preview_ysize, io2.preview_frame.ysize());
+  // Preview and main frame are each compared against their own original.
+  EXPECT_LE(ButteraugliDistance(io.preview_frame, io2.preview_frame,
+                                cparams.ba_params, GetJxlCms(),
+                                /*distmap=*/nullptr),
+            2.5);
+  EXPECT_LE(
+      ButteraugliDistance(io.Main(), io2.Main(), cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      2.5);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights.cc b/third_party/jpeg-xl/lib/jxl/quant_weights.cc
new file mode 100644
index 0000000000..5e3f3424aa
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quant_weights.cc
@@ -0,0 +1,1239 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+#include "lib/jxl/quant_weights.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/quant_weights.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/fast_math-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+// kQuantWeights[N * N * c + N * y + x] is the relative weight of the (x, y)
+// coefficient in component c. Higher weights correspond to finer quantization
+// intervals and more bits spent in encoding.
+
+static constexpr const float kAlmostZero = 1e-8f;
+
+void GetQuantWeightsDCT2(const QuantEncoding::DCT2Weights& dct2weights,
+                         float* weights) {  // fills 3 channels x 64 weights
+  for (size_t c = 0; c < 3; c++) {
+    size_t start = c * 64;  // channel offset; entries are at start + y * 8 + x
+    weights[start] = 0xBAD;  // sentinel value for the (0, 0) entry
+    weights[start + 1] = weights[start + 8] = dct2weights[c][0];  // (1,0),(0,1)
+    weights[start + 9] = dct2weights[c][1];  // (1, 1)
+    for (size_t y = 0; y < 2; y++) {  // 2x2 regions right of / below the corner
+      for (size_t x = 0; x < 2; x++) {
+        weights[start + y * 8 + x + 2] = dct2weights[c][2];
+        weights[start + (y + 2) * 8 + x] = dct2weights[c][2];
+      }
+    }
+    for (size_t y = 0; y < 2; y++) {  // 2x2 region at rows 2..3, cols 2..3
+      for (size_t x = 0; x < 2; x++) {
+        weights[start + (y + 2) * 8 + x + 2] = dct2weights[c][3];
+      }
+    }
+    for (size_t y = 0; y < 4; y++) {  // 4x4 regions: top-right and bottom-left
+      for (size_t x = 0; x < 4; x++) {
+        weights[start + y * 8 + x + 4] = dct2weights[c][4];
+        weights[start + (y + 4) * 8 + x] = dct2weights[c][4];
+      }
+    }
+    for (size_t y = 0; y < 4; y++) {  // 4x4 region: bottom-right
+      for (size_t x = 0; x < 4; x++) {
+        weights[start + (y + 4) * 8 + x + 4] = dct2weights[c][5];
+      }
+    }
+  }
+}
+
+void GetQuantWeightsIdentity(const QuantEncoding::IdWeights& idweights,
+                             float* weights) {  // fills 3 channels x 64 weights
+  for (size_t c = 0; c < 3; c++) {
+    for (int i = 0; i < 64; i++) {
+      weights[64 * c + i] = idweights[c][0];  // default for every entry
+    }
+    weights[64 * c + 1] = idweights[c][1];  // override index 1
+    weights[64 * c + 8] = idweights[c][1];  // override index 8 (same value)
+    weights[64 * c + 9] = idweights[c][2];  // override index 9
+  }
+}
+
+float Interpolate(float pos, float max, const float* array, size_t len) {
+  float scaled_pos = pos * (len - 1) / max;  // pos == max maps to len - 1
+  size_t idx = scaled_pos;  // truncating conversion: index of the lower sample
+  JXL_DASSERT(idx + 1 < len);  // array[idx + 1] is read below
+  float a = array[idx];
+  float b = array[idx + 1];
+  return a * FastPowf(b / a, scaled_pos - idx);  // exponent is the fraction
+}
+
+float Mult(float v) {  // maps a signed parameter to a positive multiplier
+  if (v > 0.0f) return 1.0f + v;  // v > 0: result is above 1
+  return 1.0f / (1.0f - v);  // v <= 0: result is in (0, 1]
+}
+
+using DF4 = HWY_CAPPED(float, 4);
+
+hwy::HWY_NAMESPACE::Vec<DF4> InterpolateVec(
+    hwy::HWY_NAMESPACE::Vec<DF4> scaled_pos, const float* array) {
+  HWY_CAPPED(int32_t, 4) di;
+
+  auto idx = ConvertTo(di, scaled_pos);  // per-lane int32 index into array
+
+  auto frac = Sub(scaled_pos, ConvertTo(DF4(), idx));  // scaled_pos - idx
+
+  // TODO(veluca): in theory, this could be done with 8 TableLookupBytes, but
+  // it's probably slower.
+  auto a = GatherIndex(DF4(), array, idx);  // array[idx] per lane
+  auto b = GatherIndex(DF4(), array + 1, idx);  // array[idx + 1] per lane
+
+  return Mul(a, FastPowf(DF4(), Div(b, a), frac));  // vector form of Interpolate
+}
+
+// Computes quant weights for a ROWS x COLS transform by interpolating, per
+// channel, the num_bands values derived from distance_bands. Writes
+// 3 * ROWS * COLS floats to out, indexed as c * COLS * ROWS + y * COLS + x.
+// Fails if any derived band value is below kAlmostZero.
+Status GetQuantWeights(
+    size_t ROWS, size_t COLS,
+    const DctQuantWeightParams::DistanceBandsArray& distance_bands,
+    size_t num_bands, float* out) {
+  for (size_t c = 0; c < 3; c++) {
+    float bands[DctQuantWeightParams::kMaxDistanceBands] = {
+        distance_bands[c][0]};
+    if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid distance bands");
+    for (size_t i = 1; i < num_bands; i++) {
+      bands[i] = bands[i - 1] * Mult(distance_bands[c][i]);  // running product
+      if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid distance bands");
+    }
+    float scale = (num_bands - 1) / (kSqrt2 + 1e-6f);
+    float rcpcol = scale / (COLS - 1);  // per-column step of dx
+    float rcprow = scale / (ROWS - 1);  // per-row step of dy
+    JXL_ASSERT(COLS >= Lanes(DF4()));
+    HWY_ALIGN float l0123[4] = {0, 1, 2, 3};  // per-lane x offsets
+    for (uint32_t y = 0; y < ROWS; y++) {
+      float dy = y * rcprow;
+      float dy2 = dy * dy;
+      for (uint32_t x = 0; x < COLS; x += Lanes(DF4())) {
+        auto dx =
+            Mul(Add(Set(DF4(), x), Load(DF4(), l0123)), Set(DF4(), rcpcol));
+        auto scaled_distance = Sqrt(MulAdd(dx, dx, Set(DF4(), dy2)));
+        auto weight = num_bands == 1 ? Set(DF4(), bands[0])
+                                     : InterpolateVec(scaled_distance, bands);
+        StoreU(weight, DF4(), out + c * COLS * ROWS + y * COLS + x);
+      }
+    }
+  }
+  return true;
+}
+
+// TODO(veluca): SIMD-fy. With 256x256, this is actually slow.
+Status ComputeQuantTable(const QuantEncoding& encoding,
+                         float* JXL_RESTRICT table,
+                         float* JXL_RESTRICT inv_table, size_t table_num,
+                         DequantMatrices::QuantTable kind, size_t* pos) {
+  constexpr size_t N = kBlockDim;
+  size_t wrows = 8 * DequantMatrices::required_size_x[kind],
+         wcols = 8 * DequantMatrices::required_size_y[kind];
+  size_t num = wrows * wcols;  // number of weights per channel
+
+  std::vector<float> weights(3 * num);  // filled by the switch below
+
+  switch (encoding.mode) {
+    case QuantEncoding::kQuantModeLibrary: {
+      // Library and copy quant encoding should get replaced by the actual
+      // parameters by the caller.
+      JXL_ASSERT(false);
+      break;
+    }
+    case QuantEncoding::kQuantModeID: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      GetQuantWeightsIdentity(encoding.idweights, weights.data());
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT2: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      GetQuantWeightsDCT2(encoding.dct2weights, weights.data());
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      float weights4x4[3 * 4 * 4];
+      // Always use 4x4 GetQuantWeights for DCT4 quantization tables.
+      JXL_RETURN_IF_ERROR(
+          GetQuantWeights(4, 4, encoding.dct_params.distance_bands,
+                          encoding.dct_params.num_distance_bands, weights4x4));
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t y = 0; y < kBlockDim; y++) {
+          for (size_t x = 0; x < kBlockDim; x++) {
+            weights[c * num + y * kBlockDim + x] =
+                weights4x4[c * 16 + (y / 2) * 4 + (x / 2)];
+          }
+        }
+        weights[c * num + 1] /= encoding.dct4multipliers[c][0];
+        weights[c * num + N] /= encoding.dct4multipliers[c][0];
+        weights[c * num + N + 1] /= encoding.dct4multipliers[c][1];
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4X8: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      float weights4x8[3 * 4 * 8];
+      // Always use 4x8 GetQuantWeights for DCT4X8 quantization tables.
+      JXL_RETURN_IF_ERROR(
+          GetQuantWeights(4, 8, encoding.dct_params.distance_bands,
+                          encoding.dct_params.num_distance_bands, weights4x8));
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t y = 0; y < kBlockDim; y++) {
+          for (size_t x = 0; x < kBlockDim; x++) {
+            weights[c * num + y * kBlockDim + x] =
+                weights4x8[c * 32 + (y / 2) * 8 + x];
+          }
+        }
+        weights[c * num + N] /= encoding.dct4x8multipliers[c];
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT: {
+      JXL_RETURN_IF_ERROR(GetQuantWeights(
+          wrows, wcols, encoding.dct_params.distance_bands,
+          encoding.dct_params.num_distance_bands, weights.data()));
+      break;
+    }
+    case QuantEncoding::kQuantModeRAW: {
+      if (!encoding.qraw.qtable || encoding.qraw.qtable->size() != 3 * num) {
+        return JXL_FAILURE("Invalid table encoding");
+      }
+      for (size_t i = 0; i < 3 * num; i++) {
+        weights[i] =
+            1.f / (encoding.qraw.qtable_den * (*encoding.qraw.qtable)[i]);
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeAFV: {
+      constexpr float kFreqs[] = {
+          0xBAD,
+          0xBAD,
+          0.8517778890324296,
+          5.37778436506804,
+          0xBAD,
+          0xBAD,
+          4.734747904497923,
+          5.449245381693219,
+          1.6598270267479331,
+          4,
+          7.275749096817861,
+          10.423227632456525,
+          2.662932286148962,
+          7.630657783650829,
+          8.962388608184032,
+          12.97166202570235,
+      };
+
+      float weights4x8[3 * 4 * 8];
+      JXL_RETURN_IF_ERROR((
+          GetQuantWeights(4, 8, encoding.dct_params.distance_bands,
+                          encoding.dct_params.num_distance_bands, weights4x8)));
+      float weights4x4[3 * 4 * 4];
+      JXL_RETURN_IF_ERROR((GetQuantWeights(
+          4, 4, encoding.dct_params_afv_4x4.distance_bands,
+          encoding.dct_params_afv_4x4.num_distance_bands, weights4x4)));
+
+      constexpr float lo = 0.8517778890324296;  // smallest non-0xBAD kFreqs entry
+      constexpr float hi = 12.97166202570235f - lo + 1e-6f;
+      for (size_t c = 0; c < 3; c++) {
+        float bands[4];
+        bands[0] = encoding.afv_weights[c][5];
+        if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands");
+        for (size_t i = 1; i < 4; i++) {
+          bands[i] = bands[i - 1] * Mult(encoding.afv_weights[c][i + 5]);
+          if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands");
+        }
+        size_t start = c * 64;
+        auto set_weight = [&start, &weights](size_t x, size_t y, float val) {
+          weights[start + y * 8 + x] = val;
+        };
+        weights[start] = 1;  // Not used, but causes MSAN error otherwise.
+        // Weights for (0, 1) and (1, 0).
+        set_weight(0, 1, encoding.afv_weights[c][0]);
+        set_weight(1, 0, encoding.afv_weights[c][1]);
+        // AFV special weights for 3-pixel corner.
+        set_weight(0, 2, encoding.afv_weights[c][2]);
+        set_weight(2, 0, encoding.afv_weights[c][3]);
+        set_weight(2, 2, encoding.afv_weights[c][4]);
+
+        // All other AFV weights.
+        for (size_t y = 0; y < 4; y++) {
+          for (size_t x = 0; x < 4; x++) {
+            if (x < 2 && y < 2) continue;
+            float val = Interpolate(kFreqs[y * 4 + x] - lo, hi, bands, 4);
+            set_weight(2 * x, 2 * y, val);
+          }
+        }
+
+        // Put 4x8 weights in odd rows, except (1, 0).
+        for (size_t y = 0; y < kBlockDim / 2; y++) {
+          for (size_t x = 0; x < kBlockDim; x++) {
+            if (x == 0 && y == 0) continue;
+            weights[c * num + (2 * y + 1) * kBlockDim + x] =
+                weights4x8[c * 32 + y * 8 + x];
+          }
+        }
+        // Put 4x4 weights in even rows / odd columns, except (0, 1).
+        for (size_t y = 0; y < kBlockDim / 2; y++) {
+          for (size_t x = 0; x < kBlockDim / 2; x++) {
+            if (x == 0 && y == 0) continue;
+            weights[c * num + (2 * y) * kBlockDim + 2 * x + 1] =
+                weights4x4[c * 16 + y * 4 + x];
+          }
+        }
+      }
+      break;
+    }
+  }
+  size_t prev_pos = *pos;  // offset at which this table is written
+  HWY_CAPPED(float, 64) d;
+  for (size_t i = 0; i < num * 3; i += Lanes(d)) {
+    auto inv_val = LoadU(d, weights.data() + i);
+    if (JXL_UNLIKELY(!AllFalse(d, Ge(inv_val, Set(d, 1.0f / kAlmostZero))) ||
+                     !AllFalse(d, Lt(inv_val, Set(d, kAlmostZero))))) {
+      return JXL_FAILURE("Invalid quantization table");
+    }
+    auto val = Div(Set(d, 1.0f), inv_val);  // table stores 1 / weight
+    StoreU(val, d, table + *pos + i);
+    StoreU(inv_val, d, inv_table + *pos + i);  // inv_table stores the weight
+  }
+  (*pos) += 3 * num;  // advance past the 3 * num entries just written
+
+  // Ensure that the lowest frequencies have a 0 inverse table.
+  // This does not affect en/decoding, but allows AC strategy selection to be
+  // slightly simpler.
+  size_t xs = DequantMatrices::required_size_x[kind];
+  size_t ys = DequantMatrices::required_size_y[kind];
+  CoefficientLayout(&ys, &xs);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ys; y++) {
+      for (size_t x = 0; x < xs; x++) {
+        inv_table[prev_pos + c * ys * xs * kDCTBlockSize + y * kBlockDim * xs +
+                  x] = 0;
+      }
+    }
+  }
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+namespace {
+
+HWY_EXPORT(ComputeQuantTable);
+
+static constexpr const float kAlmostZero = 1e-8f;
+
+Status DecodeDctParams(BitReader* br, DctQuantWeightParams* params) {
+  params->num_distance_bands =  // the value read is the band count minus 1
+      br->ReadFixedBits<DctQuantWeightParams::kLog2MaxDistanceBands>() + 1;
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t i = 0; i < params->num_distance_bands; i++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Read(br, &params->distance_bands[c][i]));
+    }
+    if (params->distance_bands[c][0] < kAlmostZero) {  // also rejects negatives
+      return JXL_FAILURE("Distance band seed is too small");
+    }
+    params->distance_bands[c][0] *= 64.0f;  // only the first band is rescaled
+  }
+  return true;
+}
+
+Status Decode(BitReader* br, QuantEncoding* encoding, size_t required_size_x,
+              size_t required_size_y, size_t idx,
+              ModularFrameDecoder* modular_frame_decoder) {
+  size_t required_size = required_size_x * required_size_y;  // before scaling
+  required_size_x *= kBlockDim;
+  required_size_y *= kBlockDim;
+  int mode = br->ReadFixedBits<kLog2NumQuantModes>();
+  switch (mode) {
+    case QuantEncoding::kQuantModeLibrary: {
+      encoding->predefined = br->ReadFixedBits<kCeilLog2NumPredefinedTables>();
+      if (encoding->predefined >= kNumPredefinedTables) {
+        return JXL_FAILURE("Invalid predefined table");
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeID: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 3; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->idweights[c][i]));
+          if (std::abs(encoding->idweights[c][i]) < kAlmostZero) {
+            return JXL_FAILURE("ID Quantizer is too small");
+          }
+          encoding->idweights[c][i] *= 64;
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT2: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 6; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->dct2weights[c][i]));
+          if (std::abs(encoding->dct2weights[c][i]) < kAlmostZero) {
+            return JXL_FAILURE("Quantizer is too small");
+          }
+          encoding->dct2weights[c][i] *= 64;
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4X8: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        JXL_RETURN_IF_ERROR(
+            F16Coder::Read(br, &encoding->dct4x8multipliers[c]));
+        if (std::abs(encoding->dct4x8multipliers[c]) < kAlmostZero) {
+          return JXL_FAILURE("DCT4X8 multiplier is too small");
+        }
+      }
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 2; i++) {
+          JXL_RETURN_IF_ERROR(
+              F16Coder::Read(br, &encoding->dct4multipliers[c][i]));
+          if (std::abs(encoding->dct4multipliers[c][i]) < kAlmostZero) {
+            return JXL_FAILURE("DCT4 multiplier is too small");
+          }
+        }
+      }
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      break;
+    }
+    case QuantEncoding::kQuantModeAFV: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 9; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->afv_weights[c][i]));
+        }
+        for (size_t i = 0; i < 6; i++) {  // only the first 6 of 9 are rescaled
+          encoding->afv_weights[c][i] *= 64;
+        }
+      }
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params_afv_4x4));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT: {
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      break;
+    }
+    case QuantEncoding::kQuantModeRAW: {
+      // Set mode early, to avoid mem-leak.
+      encoding->mode = QuantEncoding::kQuantModeRAW;
+      JXL_RETURN_IF_ERROR(ModularFrameDecoder::DecodeQuantTable(
+          required_size_x, required_size_y, br, encoding, idx,
+          modular_frame_decoder));
+      break;
+    }
+    default:
+      return JXL_FAILURE("Invalid quantization table encoding");
+  }
+  encoding->mode = QuantEncoding::Mode(mode);  // set only on success
+  return true;
+}
+
+}  // namespace
+
+// These definitions are needed before C++17.
+constexpr size_t DequantMatrices::required_size_[];
+constexpr size_t DequantMatrices::required_size_x[];
+constexpr size_t DequantMatrices::required_size_y[];
+constexpr DequantMatrices::QuantTable DequantMatrices::kQuantTable[];
+
+Status DequantMatrices::Decode(BitReader* br,
+                               ModularFrameDecoder* modular_frame_decoder) {
+  size_t all_default = br->ReadBits(1);  // 1 bit: nonzero means read no tables
+  size_t num_tables = all_default ? 0 : static_cast<size_t>(kNum);
+  encodings_.clear();
+  encodings_.resize(kNum, QuantEncoding::Library(0));  // reset all kNum entries
+  for (size_t i = 0; i < num_tables; i++) {
+    JXL_RETURN_IF_ERROR(
+        jxl::Decode(br, &encodings_[i], required_size_x[i % kNum],
+                    required_size_y[i % kNum], i, modular_frame_decoder));
+  }
+  computed_mask_ = 0;  // NOTE(review): presumably marks tables as not computed
+  return true;
+}
+
+Status DequantMatrices::DecodeDC(BitReader* br) {
+  bool all_default = br->ReadBits(1);  // if set, dc_quant_ is left untouched
+  if (!br->AllReadsWithinBounds()) return JXL_FAILURE("EOS during DecodeDC");
+  if (!all_default) {
+    for (size_t c = 0; c < 3; c++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Read(br, &dc_quant_[c]));
+      dc_quant_[c] *= 1.0f / 128.0f;  // the value read is divided by 128
+      // Negative values and values that are nearly zero are invalid.
+      if (dc_quant_[c] < kAlmostZero) {
+        return JXL_FAILURE("Invalid dc_quant: coefficient is too small.");
+      }
+      inv_dc_quant_[c] = 1.0f / dc_quant_[c];  // keep the reciprocal in sync
+    }
+  }
+  return true;
+}
+
+constexpr float V(float v) { return static_cast<float>(v); }  // returns v as float
+
+namespace {
+struct DequantMatricesLibraryDef {
+  // DCT8
+  static constexpr QuantEncodingInternal DCT() {
+    return QuantEncodingInternal::DCT(DctQuantWeightParams({{{{
+                                                                 V(3150.0),
+                                                                 V(0.0),
+                                                                 V(-0.4),
+                                                                 V(-0.4),
+                                                                 V(-0.4),
+                                                                 V(-2.0),
+                                                             }},
+                                                             {{
+                                                                 V(560.0),
+                                                                 V(0.0),
+                                                                 V(-0.3),
+                                                                 V(-0.3),
+                                                                 V(-0.3),
+                                                                 V(-0.3),
+                                                             }},
+                                                             {{
+                                                                 V(512.0),
+                                                                 V(-2.0),
+                                                                 V(-1.0),
+                                                                 V(0.0),
+                                                                 V(-1.0),
+                                                                 V(-2.0),
+                                                             }}}},
+                                                           6));
+  }
+
+  // Identity
+  static constexpr QuantEncodingInternal IDENTITY() {
+    return QuantEncodingInternal::Identity({{{{
+                                                 V(280.0),
+                                                 V(3160.0),
+                                                 V(3160.0),
+                                             }},
+                                             {{
+                                                 V(60.0),
+                                                 V(864.0),
+                                                 V(864.0),
+                                             }},
+                                             {{
+                                                 V(18.0),
+                                                 V(200.0),
+                                                 V(200.0),
+                                             }}}});
+  }
+
+  // DCT2
+  static constexpr QuantEncodingInternal DCT2X2() {
+    return QuantEncodingInternal::DCT2({{{{
+                                             V(3840.0),
+                                             V(2560.0),
+                                             V(1280.0),
+                                             V(640.0),
+                                             V(480.0),
+                                             V(300.0),
+                                         }},
+                                         {{
+                                             V(960.0),
+                                             V(640.0),
+                                             V(320.0),
+                                             V(180.0),
+                                             V(140.0),
+                                             V(120.0),
+                                         }},
+                                         {{
+                                             V(640.0),
+                                             V(320.0),
+                                             V(128.0),
+                                             V(64.0),
+                                             V(32.0),
+                                             V(16.0),
+                                         }}}});
+  }
+
+  // DCT4 (quant_kind 3)
+  static constexpr QuantEncodingInternal DCT4X4() {
+    return QuantEncodingInternal::DCT4(DctQuantWeightParams({{{{
+                                                                  V(2200.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                              }},
+                                                              {{
+                                                                  V(392.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                              }},
+                                                              {{
+                                                                  V(112.0),
+                                                                  V(-0.25),
+                                                                  V(-0.25),
+                                                                  V(-0.5),
+                                                              }}}},
+                                                            4),
+                                       /* kMul */
+                                       {{{{
+                                             V(1.0),
+                                             V(1.0),
+                                         }},
+                                         {{
+                                             V(1.0),
+                                             V(1.0),
+                                         }},
+                                         {{
+                                             V(1.0),
+                                             V(1.0),
+                                         }}}});
+  }
+
+  // DCT16
+  static constexpr QuantEncodingInternal DCT16X16() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(8996.8725711814115328),
+                                   V(-1.3000777393353804),
+                                   V(-0.49424529824571225),
+                                   V(-0.439093774457103443),
+                                   V(-0.6350101832695744),
+                                   V(-0.90177264050827612),
+                                   V(-1.6162099239887414),
+                               }},
+                               {{
+                                   V(3191.48366296844234752),
+                                   V(-0.67424582104194355),
+                                   V(-0.80745813428471001),
+                                   V(-0.44925837484843441),
+                                   V(-0.35865440981033403),
+                                   V(-0.31322389111877305),
+                                   V(-0.37615025315725483),
+                               }},
+                               {{
+                                   V(1157.50408145487200256),
+                                   V(-2.0531423165804414),
+                                   V(-1.4),
+                                   V(-0.50687130033378396),
+                                   V(-0.42708730624733904),
+                                   V(-1.4856834539296244),
+                                   V(-4.9209142884401604),
+                               }}}},
+                             7));
+  }
+
+  // DCT32
+  static constexpr QuantEncodingInternal DCT32X32() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(15718.40830982518931456),
+                                   V(-1.025),
+                                   V(-0.98),
+                                   V(-0.9012),
+                                   V(-0.4),
+                                   V(-0.48819395464),
+                                   V(-0.421064),
+                                   V(-0.27),
+                               }},
+                               {{
+                                   V(7305.7636810695983104),
+                                   V(-0.8041958212306401),
+                                   V(-0.7633036457487539),
+                                   V(-0.55660379990111464),
+                                   V(-0.49785304658857626),
+                                   V(-0.43699592683512467),
+                                   V(-0.40180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(3803.53173721215041536),
+                                   V(-3.060733579805728),
+                                   V(-2.0413270132490346),
+                                   V(-2.0235650159727417),
+                                   V(-0.5495389509954993),
+                                   V(-0.4),
+                                   V(-0.4),
+                                   V(-0.3),
+                               }}}},
+                             8));
+  }
+
+  // DCT16X8
+  static constexpr QuantEncodingInternal DCT8X16() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(7240.7734393502),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.2),
+                                   V(-0.2),
+                                   V(-0.2),
+                                   V(-0.5),
+                               }},
+                               {{
+                                   V(1448.15468787004),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-0.2),
+                                   V(-0.2),
+                                   V(-0.2),
+                               }},
+                               {{
+                                   V(506.854140754517),
+                                   V(-1.4),
+                                   V(-0.2),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-1.5),
+                                   V(-3.6),
+                               }}}},
+                             7));
+  }
+
+  // DCT32X8
+  static constexpr QuantEncodingInternal DCT8X32() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(16283.2494710648897),
+                                   V(-1.7812845336559429),
+                                   V(-1.6309059012653515),
+                                   V(-1.0382179034313539),
+                                   V(-0.85),
+                                   V(-0.7),
+                                   V(-0.9),
+                                   V(-1.2360638576849587),
+                               }},
+                               {{
+                                   V(5089.15750884921511936),
+                                   V(-0.320049391452786891),
+                                   V(-0.35362849922161446),
+                                   V(-0.30340000000000003),
+                                   V(-0.61),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-0.6),
+                               }},
+                               {{
+                                   V(3397.77603275308720128),
+                                   V(-0.321327362693153371),
+                                   V(-0.34507619223117997),
+                                   V(-0.70340000000000003),
+                                   V(-0.9),
+                                   V(-1.0),
+                                   V(-1.0),
+                                   V(-1.1754605576265209),
+                               }}}},
+                             8));
+  }
+
+  // Default weights for DCT16X32 (and DCT32X16): 3 channels x 8 bands.
+  static constexpr QuantEncodingInternal DCT16X32() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(13844.97076442300573),
+                                   V(-0.97113799999999995),
+                                   V(-0.658),
+                                   V(-0.42026),
+                                   V(-0.22712),
+                                   V(-0.2206),
+                                   V(-0.226),
+                                   V(-0.6),
+                               }},
+                               {{
+                                   V(4798.964084220744293),
+                                   V(-0.61125308982767057),
+                                   V(-0.83770786552491361),
+                                   V(-0.79014862079498627),
+                                   V(-0.2692727459704829),
+                                   V(-0.38272769465388551),
+                                   V(-0.22924222653091453),
+                                   V(-0.20719098826199578),
+                               }},
+                               {{
+                                   V(1807.236946760964614),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+
+  // Default weights for DCT4X8 and DCT8X4: 3 channels x 4 bands, kMuls all 1.
+  static constexpr QuantEncodingInternal DCT4X8() {
+    return QuantEncodingInternal::DCT4X8(
+        DctQuantWeightParams({{
+                                 {{
+                                     V(2198.050556016380522),
+                                     V(-0.96269623020744692),
+                                     V(-0.76194253026666783),
+                                     V(-0.6551140670773547),
+                                 }},
+                                 {{
+                                     V(764.3655248643528689),
+                                     V(-0.92630200888366945),
+                                     V(-0.9675229603596517),
+                                     V(-0.27845290869168118),
+                                 }},
+                                 {{
+                                     V(527.107573587542228),
+                                     V(-1.4594385811273854),
+                                     V(-1.450082094097871593),
+                                     V(-1.5843722511996204),
+                                 }},
+                             }},
+                             4),  // num_distance_bands
+        /* kMuls */
+        {{
+            V(1.0),
+            V(1.0),
+            V(1.0),
+        }});
+  }
+  // Default AFV weights; reuses the DCT4X8 and DCT4X4 band parameters.
+  static QuantEncodingInternal AFV0() {
+    return QuantEncodingInternal::AFV(DCT4X8().dct_params, DCT4X4().dct_params,
+                                      {{{{
+                                            // 4x4/4x8 DC tendency.
+                                            V(3072.0),
+                                            V(3072.0),
+                                            // AFV corner.
+                                            V(256.0),
+                                            V(256.0),
+                                            V(256.0),
+                                            // AFV high freqs.
+                                            V(414.0),
+                                            V(0.0),
+                                            V(0.0),
+                                            V(0.0),
+                                        }},
+                                        {{
+                                            // 4x4/4x8 DC tendency.
+                                            V(1024.0),
+                                            V(1024.0),
+                                            // AFV corner.
+                                            V(50),
+                                            V(50),
+                                            V(50),
+                                            // AFV high freqs.
+                                            V(58.0),
+                                            V(0.0),
+                                            V(0.0),
+                                            V(0.0),
+                                        }},
+                                        {{
+                                            // 4x4/4x8 DC tendency.
+                                            V(384.0),
+                                            V(384.0),
+                                            // AFV corner.
+                                            V(12.0),
+                                            V(12.0),
+                                            V(12.0),
+                                            // AFV high freqs.
+                                            V(22.0),
+                                            V(-0.25),
+                                            V(-0.25),
+                                            V(-0.25),
+                                        }}}});
+  }
+
+  // Default weights for DCT64X64: 3 channels x 8 bands.
+  static QuantEncodingInternal DCT64X64() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(0.9 * 26629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(0.9 * 9311.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(0.9 * 4992.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+
+  // Default weights for DCT32X64 (and DCT64X32): 3 channels x 8 bands.
+  static QuantEncodingInternal DCT32X64() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(0.65 * 23629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(0.65 * 8611.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(0.65 * 4492.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+  // DCT128X128: as DCT64X64, but the first value is scaled by 1.8, not 0.9.
+  static QuantEncodingInternal DCT128X128() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(1.8 * 26629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(1.8 * 9311.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(1.8 * 4992.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+
+  // DCT64X128 (and DCT128X64): as DCT32X64, first value scaled by 1.3.
+  static QuantEncodingInternal DCT64X128() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(1.3 * 23629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(1.3 * 8611.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(1.3 * 4492.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+  // DCT256X256: as DCT64X64, but the first value is scaled by 3.6, not 0.9.
+  static QuantEncodingInternal DCT256X256() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(3.6 * 26629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(3.6 * 9311.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(3.6 * 4992.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+
+  // DCT128X256 (and DCT256X128): as DCT32X64, first value scaled by 2.6.
+  static QuantEncodingInternal DCT128X256() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(2.6 * 23629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(2.6 * 8611.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(2.6 * 4492.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));  // num_distance_bands
+  }
+};
+}  // namespace
+
+DequantMatrices::DequantLibraryInternal DequantMatrices::LibraryInit() {
+  static_assert(kNum == 17,
+                "Update this function when adding new quantization kinds.");
+  static_assert(kNumPredefinedTables == 1,
+                "Update this function when adding new quantization matrices to "
+                "the library.");
+  // The array returned below is positional: entry i must be the table for
+  // QuantTable value i. The two are kept in sync manually, hence the asserts.
+  static_assert(0 == DCT, "Update the DequantLibrary array below.");
+  static_assert(1 == IDENTITY, "Update the DequantLibrary array below.");
+  static_assert(2 == DCT2X2, "Update the DequantLibrary array below.");
+  static_assert(3 == DCT4X4, "Update the DequantLibrary array below.");
+  static_assert(4 == DCT16X16, "Update the DequantLibrary array below.");
+  static_assert(5 == DCT32X32, "Update the DequantLibrary array below.");
+  static_assert(6 == DCT8X16, "Update the DequantLibrary array below.");
+  static_assert(7 == DCT8X32, "Update the DequantLibrary array below.");
+  static_assert(8 == DCT16X32, "Update the DequantLibrary array below.");
+  static_assert(9 == DCT4X8, "Update the DequantLibrary array below.");
+  static_assert(10 == AFV0, "Update the DequantLibrary array below.");
+  static_assert(11 == DCT64X64, "Update the DequantLibrary array below.");
+  static_assert(12 == DCT32X64, "Update the DequantLibrary array below.");
+  static_assert(13 == DCT128X128, "Update the DequantLibrary array below.");
+  static_assert(14 == DCT64X128, "Update the DequantLibrary array below.");
+  static_assert(15 == DCT256X256, "Update the DequantLibrary array below.");
+  static_assert(16 == DCT128X256, "Update the DequantLibrary array below.");
+  return DequantMatrices::DequantLibraryInternal{{
+      DequantMatricesLibraryDef::DCT(),
+      DequantMatricesLibraryDef::IDENTITY(),
+      DequantMatricesLibraryDef::DCT2X2(),
+      DequantMatricesLibraryDef::DCT4X4(),
+      DequantMatricesLibraryDef::DCT16X16(),
+      DequantMatricesLibraryDef::DCT32X32(),
+      DequantMatricesLibraryDef::DCT8X16(),
+      DequantMatricesLibraryDef::DCT8X32(),
+      DequantMatricesLibraryDef::DCT16X32(),
+      DequantMatricesLibraryDef::DCT4X8(),
+      DequantMatricesLibraryDef::AFV0(),
+      DequantMatricesLibraryDef::DCT64X64(),
+      DequantMatricesLibraryDef::DCT32X64(),
+      // 128+ transforms: same bands as 64x*, first value scaled differently.
+      DequantMatricesLibraryDef::DCT128X128(),
+      DequantMatricesLibraryDef::DCT64X128(),
+      DequantMatricesLibraryDef::DCT256X256(),
+      DequantMatricesLibraryDef::DCT128X256(),
+  }};
+}
+
+const QuantEncoding* DequantMatrices::Library() {
+  // Built once, on first use, from the predefined default tables.
+  static const DequantLibraryInternal kLibrary = LibraryInit();
+  // The storage holds QuantEncodingInternal objects, but callers receive them
+  // as QuantEncoding. QuantEncoding adds no data members to its base, and
+  // callers only read members of the base class, so the pointer is
+  // reinterpreted rather than converted. This keeps the library entries free
+  // of the std::vector-backed QuantEncoding::RAW() machinery.
+  const auto* entries = kLibrary.data();
+  return reinterpret_cast<const QuantEncoding*>(entries);
+}
+
+DequantMatrices::DequantMatrices() {
+  encodings_.resize(size_t(QuantTable::kNum), QuantEncoding::Library(0));
+  // Start offset of every (kind, channel) table, packed in kind order.
+  size_t plane_start[kNum * 3];
+  size_t cursor = 0;
+  for (size_t kind = 0; kind < size_t(QuantTable::kNum); kind++) {
+    const size_t plane_size = required_size_[kind] * kDCTBlockSize;
+    for (size_t c = 0; c < 3; c++, cursor += plane_size) {
+      plane_start[3 * kind + c] = cursor;
+    }
+  }
+  // Each AC strategy takes over the offsets of the kind it is mapped to.
+  for (size_t s = 0; s < AcStrategy::kNumValidStrategies; s++) {
+    const size_t* from = &plane_start[kQuantTable[s] * 3];
+    for (size_t c = 0; c < 3; c++) table_offsets_[s * 3 + c] = from[c];
+  }
+}
+
+Status DequantMatrices::EnsureComputed(uint32_t acs_mask) {
+  // Computes the tables needed by the AC strategies set in `acs_mask`,
+  // skipping every table kind already covered by computed_mask_.
+  const QuantEncoding* library = Library();
+  // One allocation backs table_ (first half) and inv_table_ (second half).
+  if (!table_storage_) {
+    table_storage_ = hwy::AllocateAligned<float>(2 * kTotalTableSize);
+    table_ = table_storage_.get();
+    inv_table_ = table_storage_.get() + kTotalTableSize;
+  }
+  // offsets[3 * kind + c] is where channel c of `kind` starts; the extra last
+  // entry is the end of the final table.
+  size_t offsets[kNum * 3 + 1];
+  size_t pos = 0;
+  for (size_t kind = 0; kind < kNum; kind++) {
+    const size_t channel_size = required_size_[kind] * kDCTBlockSize;
+    for (size_t c = 0; c < 3; c++, pos += channel_size) {
+      offsets[3 * kind + c] = pos;
+    }
+  }
+  offsets[kNum * 3] = pos;
+  JXL_ASSERT(pos == kTotalTableSize);
+
+  // Translate both strategy masks into masks over table kinds.
+  uint32_t kind_mask = 0;
+  uint32_t computed_kind_mask = 0;
+  for (size_t s = 0; s < AcStrategy::kNumValidStrategies; s++) {
+    const uint32_t strategy_bit = 1u << s;
+    const uint32_t kind_bit = 1u << kQuantTable[s];
+    if (acs_mask & strategy_bit) kind_mask |= kind_bit;
+    if (computed_mask_ & strategy_bit) computed_kind_mask |= kind_bit;
+  }
+  for (size_t table = 0; table < kNum; table++) {
+    const uint32_t table_bit = 1u << table;
+    const bool already_done = (computed_kind_mask & table_bit) != 0;
+    const bool requested = (kind_mask & table_bit) != 0;
+    if (already_done || !requested) continue;
+    size_t pos = offsets[table * 3];
+    if (encodings_[table].mode != QuantEncoding::kQuantModeLibrary) {
+      JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ComputeQuantTable)(
+          encodings_[table], table_storage_.get(),
+          table_storage_.get() + kTotalTableSize, table, QuantTable(table),
+          &pos));
+    } else {
+      JXL_CHECK(HWY_DYNAMIC_DISPATCH(ComputeQuantTable)(
+          library[table], table_storage_.get(),
+          table_storage_.get() + kTotalTableSize, table, QuantTable(table),
+          &pos));
+    }
+    JXL_ASSERT(pos == offsets[table * 3 + 3]);
+  }
+  computed_mask_ |= acs_mask;
+  return true;
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights.h b/third_party/jpeg-xl/lib/jxl/quant_weights.h
new file mode 100644
index 0000000000..d76fc1d1e6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quant_weights.h
@@ -0,0 +1,448 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_QUANT_WEIGHTS_H_
+#define LIB_JXL_QUANT_WEIGHTS_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include <array>
+#include <hwy/aligned_allocator.h>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T, size_t N>
+constexpr T ArraySum(T (&a)[N], size_t i = N - 1) {  // sums a[0] .. a[i]
+  static_assert(N > 0, "Trying to compute the sum of an empty array");
+  return i > 0 ? a[i] + ArraySum(a, i - 1) : a[0];
+}
+
+static constexpr size_t kMaxQuantTableSize = AcStrategy::kMaxCoeffArea;
+static constexpr size_t kNumPredefinedTables = 1;  // library table sets
+static constexpr size_t kCeilLog2NumPredefinedTables = 0;  // ceil(log2(1))
+static constexpr size_t kLog2NumQuantModes = 3;  // log2(8 Mode values)
+
+struct DctQuantWeightParams {
+  static constexpr size_t kLog2MaxDistanceBands = 4;
+  static constexpr size_t kMaxDistanceBands = 1 + (1 << kLog2MaxDistanceBands);
+  typedef std::array<std::array<float, kMaxDistanceBands>, 3>
+      DistanceBandsArray;
+
+  size_t num_distance_bands = 0;
+  DistanceBandsArray distance_bands = {};
+
+  constexpr DctQuantWeightParams() : num_distance_bands(0) {}
+  constexpr DctQuantWeightParams(const DistanceBandsArray& dist_bands,
+                                 size_t num_dist_bands)
+      : num_distance_bands(num_dist_bands), distance_bands(dist_bands) {}
+  // Copies a [3][num_dist_bands] array; bands beyond that stay zero.
+  template <size_t num_dist_bands>
+  explicit DctQuantWeightParams(const float dist_bands[3][num_dist_bands]) {
+    static_assert(num_dist_bands <= kMaxDistanceBands, "Too many bands");
+    num_distance_bands = num_dist_bands;
+    for (size_t c = 0; c < 3; c++) {
+      memcpy(distance_bands[c].data(), dist_bands[c],
+             sizeof(float) * num_dist_bands);
+    }
+  }
+};
+
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct QuantEncodingInternal {
+  enum Mode {
+    kQuantModeLibrary,
+    kQuantModeID,
+    kQuantModeDCT2,
+    kQuantModeDCT4,
+    kQuantModeDCT4X8,
+    kQuantModeAFV,
+    kQuantModeDCT,
+    kQuantModeRAW,
+  };
+
+  template <Mode mode>
+  struct Tag {};  // selects the constructor overload for `mode`
+
+  typedef std::array<std::array<float, 3>, 3> IdWeights;
+  typedef std::array<std::array<float, 6>, 3> DCT2Weights;
+  typedef std::array<std::array<float, 2>, 3> DCT4Multipliers;
+  typedef std::array<std::array<float, 9>, 3> AFVWeights;
+  typedef std::array<float, 3> DCT4x8Multipliers;
+
+  static constexpr QuantEncodingInternal Library(uint8_t predefined) {
+    return ((predefined < kNumPredefinedTables) ||
+            JXL_ABORT("Assert predefined < kNumPredefinedTables")),
+           QuantEncodingInternal(Tag<kQuantModeLibrary>(), predefined);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeLibrary> /* tag */,
+                                  uint8_t predefined)
+      : mode(kQuantModeLibrary), predefined(predefined) {}
+
+  // Identity
+  // xybweights is an array of {xweights, yweights, bweights}.
+  static constexpr QuantEncodingInternal Identity(const IdWeights& xybweights) {
+    return QuantEncodingInternal(Tag<kQuantModeID>(), xybweights);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeID> /* tag */,
+                                  const IdWeights& xybweights)
+      : mode(kQuantModeID), idweights(xybweights) {}
+
+  // DCT2: xybweights holds 6 weights for each of the 3 channels.
+  static constexpr QuantEncodingInternal DCT2(const DCT2Weights& xybweights) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT2>(), xybweights);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT2> /* tag */,
+                                  const DCT2Weights& xybweights)
+      : mode(kQuantModeDCT2), dct2weights(xybweights) {}
+
+  // DCT4: band parameters plus 2 extra multipliers per channel.
+  static constexpr QuantEncodingInternal DCT4(
+      const DctQuantWeightParams& params, const DCT4Multipliers& xybmul) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT4>(), params, xybmul);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT4> /* tag */,
+                                  const DctQuantWeightParams& params,
+                                  const DCT4Multipliers& xybmul)
+      : mode(kQuantModeDCT4), dct_params(params), dct4multipliers(xybmul) {}
+
+  // DCT4x8: band parameters plus 1 extra multiplier per channel.
+  static constexpr QuantEncodingInternal DCT4X8(
+      const DctQuantWeightParams& params, const DCT4x8Multipliers& xybmul) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT4X8>(), params, xybmul);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT4X8> /* tag */,
+                                  const DctQuantWeightParams& params,
+                                  const DCT4x8Multipliers& xybmul)
+      : mode(kQuantModeDCT4X8), dct_params(params), dct4x8multipliers(xybmul) {}
+
+  // DCT: band parameters only.
+  static constexpr QuantEncodingInternal DCT(
+      const DctQuantWeightParams& params) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT>(), params);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT> /* tag */,
+                                  const DctQuantWeightParams& params)
+      : mode(kQuantModeDCT), dct_params(params) {}
+
+  // AFV: 4x8 and 4x4 band parameters plus 9 AFV weights per channel.
+  static constexpr QuantEncodingInternal AFV(
+      const DctQuantWeightParams& params4x8,
+      const DctQuantWeightParams& params4x4, const AFVWeights& weights) {
+    return QuantEncodingInternal(Tag<kQuantModeAFV>(), params4x8, params4x4,
+                                 weights);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeAFV> /* tag */,
+                                  const DctQuantWeightParams& params4x8,
+                                  const DctQuantWeightParams& params4x4,
+                                  const AFVWeights& weights)
+      : mode(kQuantModeAFV),
+        dct_params(params4x8),
+        afv_weights(weights),
+        dct_params_afv_4x4(params4x4) {}
+
+  // Not constexpr, so unusable in the constexpr cases above. Sets only `mode`;
+  // every other member keeps its default member initializer.
+  explicit QuantEncodingInternal(Mode mode) : mode(mode) {}
+
+  Mode mode;
+
+  // Weights for DCT4+ tables.
+  DctQuantWeightParams dct_params;
+
+  union {
+    // Weights for identity.
+    IdWeights idweights;
+
+    // Weights for DCT2.
+    DCT2Weights dct2weights;
+
+    // Extra multipliers for coefficients 01/10 and 11 for DCT4 and AFV.
+    DCT4Multipliers dct4multipliers;
+
+    // Weights for AFV. {0, 1} are used directly for coefficients (0, 1) and (1,
+    // 0); {2, 3, 4} are used directly for corner DC, (1,0) - (0,1) and (0, 1) +
+    // (1, 0) - (0, 0) inside the AFV block. Values from 5 to 8 are interpolated
+    // as in GetQuantWeights for DC and are used for other coefficients.
+    AFVWeights afv_weights = {};
+
+    // Extra multipliers for coefficients 01 or 10 for DCT4X8 and DCT8X4.
+    DCT4x8Multipliers dct4x8multipliers;
+
+    // Only used in kQuantModeRAW mode.
+    struct {
+      // explicit quantization table (like in JPEG)
+      std::vector<int>* qtable = nullptr;
+      float qtable_den = 1.f / (8 * 255);
+    } qraw;
+  };
+
+  // Weights for 4x4 sub-block in AFV.
+  DctQuantWeightParams dct_params_afv_4x4;
+
+  union {
+    // Which predefined table to use. Only used if mode is kQuantModeLibrary.
+    uint8_t predefined = 0;
+
+    // Which other quant table to copy. NOTE(review): the Mode enum above has
+    // no kQuantModeCopy value, so this member looks vestigial -- confirm.
+    uint8_t source;
+  };
+};
+
+// QuantEncodingInternal plus ownership of the heap-allocated qraw.qtable used
+// in kQuantModeRAW: copies clone the table, moves steal it and the destructor
+// frees it. Adds no data members (see the static_assert after the class).
+class QuantEncoding final : public QuantEncodingInternal {
+ public:
+  QuantEncoding(const QuantEncoding& other)
+      : QuantEncodingInternal(
+            static_cast<const QuantEncodingInternal&>(other)) {
+    if (mode == kQuantModeRAW && qraw.qtable) {
+      // Need to make a copy of the passed *qtable.
+      qraw.qtable = new std::vector<int>(*other.qraw.qtable);
+    }
+  }
+  QuantEncoding(QuantEncoding&& other) noexcept
+      : QuantEncodingInternal(
+            static_cast<const QuantEncodingInternal&>(other)) {
+    // Steal the qtable from the other object if any.
+    if (mode == kQuantModeRAW) {
+      other.qraw.qtable = nullptr;
+    }
+  }
+  QuantEncoding& operator=(const QuantEncoding& other) {
+    // Self-assignment would free the table and then copy from it; bail out.
+    if (this == &other) return *this;
+    if (mode == kQuantModeRAW) delete qraw.qtable;  // deleting nullptr is OK
+    *static_cast<QuantEncodingInternal*>(this) =
+        QuantEncodingInternal(static_cast<const QuantEncodingInternal&>(other));
+    if (mode == kQuantModeRAW && qraw.qtable) {
+      // Need to make a copy of the passed *qtable.
+      qraw.qtable = new std::vector<int>(*other.qraw.qtable);
+    }
+    return *this;
+  }
+
+  ~QuantEncoding() {
+    if (mode == kQuantModeRAW) delete qraw.qtable;  // deleting nullptr is OK
+  }
+
+  // Wrappers of the QuantEncodingInternal:: static functions that return a
+  // QuantEncoding instead. This is using the explicit and private cast from
+  // QuantEncodingInternal to QuantEncoding, which would be inlined anyway.
+  // In general, you should use these wrappers. The only reason to directly
+  // create a QuantEncodingInternal instance is if you need a constexpr version
+  // of this class. Note that RAW() is not supported in that case since it uses
+  // a std::vector.
+  static QuantEncoding Library(uint8_t predefined_arg) {
+    return QuantEncoding(QuantEncodingInternal::Library(predefined_arg));
+  }
+  static QuantEncoding Identity(const IdWeights& xybweights) {
+    return QuantEncoding(QuantEncodingInternal::Identity(xybweights));
+  }
+  static QuantEncoding DCT2(const DCT2Weights& xybweights) {
+    return QuantEncoding(QuantEncodingInternal::DCT2(xybweights));
+  }
+  static QuantEncoding DCT4(const DctQuantWeightParams& params,
+                            const DCT4Multipliers& xybmul) {
+    return QuantEncoding(QuantEncodingInternal::DCT4(params, xybmul));
+  }
+  static QuantEncoding DCT4X8(const DctQuantWeightParams& params,
+                              const DCT4x8Multipliers& xybmul) {
+    return QuantEncoding(QuantEncodingInternal::DCT4X8(params, xybmul));
+  }
+  static QuantEncoding DCT(const DctQuantWeightParams& params) {
+    return QuantEncoding(QuantEncodingInternal::DCT(params));
+  }
+  static QuantEncoding AFV(const DctQuantWeightParams& params4x8,
+                           const DctQuantWeightParams& params4x4,
+                           const AFVWeights& weights) {
+    return QuantEncoding(
+        QuantEncodingInternal::AFV(params4x8, params4x4, weights));
+  }
+
+  // RAW, note that this one is not a constexpr one.
+  static QuantEncoding RAW(const std::vector<int>& qtable, int shift = 0) {
+    QuantEncoding encoding(kQuantModeRAW);
+    encoding.qraw.qtable = new std::vector<int>(qtable);
+    encoding.qraw.qtable_den = (1 << shift) * (1.f / (8 * 255));
+    return encoding;
+  }
+
+ private:
+  explicit QuantEncoding(const QuantEncodingInternal& other)
+      : QuantEncodingInternal(other) {}
+
+  explicit QuantEncoding(QuantEncodingInternal::Mode mode_arg)
+      : QuantEncodingInternal(mode_arg) {}
+};
+
+// A constexpr QuantEncodingInternal instance is often downcast to the
+// QuantEncoding subclass even if the instance wasn't an instance of the
+// subclass. This is safe because the user will upcast to
+// QuantEncodingInternal to access any of its members.
+static_assert(sizeof(QuantEncoding) == sizeof(QuantEncodingInternal),
+              "Don't add any members to QuantEncoding");
+
+// Let's try to keep these 2**N for possible future simplicity.
+const float kInvDCQuant[3] = {
+    4096.0f,
+    512.0f,
+    256.0f,
+};
+
+const float kDCQuant[3] = {
+    1.0f / kInvDCQuant[0],
+    1.0f / kInvDCQuant[1],
+    1.0f / kInvDCQuant[2],
+};
+
+class ModularFrameEncoder;
+class ModularFrameDecoder;
+
+class DequantMatrices {
+ public:
+  enum QuantTable : size_t {
+    DCT = 0,
+    IDENTITY,
+    DCT2X2,
+    DCT4X4,
+    DCT16X16,
+    DCT32X32,
+    // DCT16X8
+    DCT8X16,
+    // DCT32X8
+    DCT8X32,
+    // DCT32X16
+    DCT16X32,
+    DCT4X8,
+    // DCT8X4
+    AFV0,
+    // AFV1
+    // AFV2
+    // AFV3
+    DCT64X64,
+    // DCT64X32,
+    DCT32X64,
+    DCT128X128,
+    // DCT128X64,
+    DCT64X128,
+    DCT256X256,
+    // DCT256X128,
+    DCT128X256,
+    kNum
+  };
+
+  static constexpr QuantTable kQuantTable[] = {
+      QuantTable::DCT,        QuantTable::IDENTITY,   QuantTable::DCT2X2,
+      QuantTable::DCT4X4,     QuantTable::DCT16X16,   QuantTable::DCT32X32,
+      QuantTable::DCT8X16,    QuantTable::DCT8X16,    QuantTable::DCT8X32,
+      QuantTable::DCT8X32,    QuantTable::DCT16X32,   QuantTable::DCT16X32,
+      QuantTable::DCT4X8,     QuantTable::DCT4X8,     QuantTable::AFV0,
+      QuantTable::AFV0,       QuantTable::AFV0,       QuantTable::AFV0,
+      QuantTable::DCT64X64,   QuantTable::DCT32X64,   QuantTable::DCT32X64,
+      QuantTable::DCT128X128, QuantTable::DCT64X128,  QuantTable::DCT64X128,
+      QuantTable::DCT256X256, QuantTable::DCT128X256, QuantTable::DCT128X256,
+  };
+  static_assert(AcStrategy::kNumValidStrategies ==
+                    sizeof(kQuantTable) / sizeof *kQuantTable,
+                "Update this array when adding or removing AC strategies.");
+
+  DequantMatrices();
+
+  static const QuantEncoding* Library();
+
+  typedef std::array<QuantEncodingInternal, kNumPredefinedTables * kNum>
+      DequantLibraryInternal;
+  // Return the array of library kNumPredefinedTables QuantEncoding entries as
+  // a constexpr array. Use Library() to obtain a pointer to the copy in the
+  // .cc file.
+  static DequantLibraryInternal LibraryInit();
+
+  // Returns aligned memory.
+  JXL_INLINE const float* Matrix(size_t quant_kind, size_t c) const {
+    JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies);
+    JXL_DASSERT((1 << quant_kind) & computed_mask_);
+    return &table_[table_offsets_[quant_kind * 3 + c]];
+  }
+
+  JXL_INLINE const float* InvMatrix(size_t quant_kind, size_t c) const {
+    JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies);
+    JXL_DASSERT((1 << quant_kind) & computed_mask_);
+    return &inv_table_[table_offsets_[quant_kind * 3 + c]];
+  }
+
+  // DC quants are used in modular mode for XYB multipliers.
+  JXL_INLINE float DCQuant(size_t c) const { return dc_quant_[c]; }
+  JXL_INLINE const float* DCQuants() const { return dc_quant_; }
+
+  JXL_INLINE float InvDCQuant(size_t c) const { return inv_dc_quant_[c]; }
+
+  // For encoder.
+  void SetEncodings(const std::vector<QuantEncoding>& encodings) {
+    encodings_ = encodings;
+    computed_mask_ = 0;
+  }
+
+  // For encoder.
+  void SetDCQuant(const float dc[3]) {
+    for (size_t c = 0; c < 3; c++) {
+      dc_quant_[c] = 1.0f / dc[c];
+      inv_dc_quant_[c] = dc[c];
+    }
+  }
+
+  Status Decode(BitReader* br,
+                ModularFrameDecoder* modular_frame_decoder = nullptr);
+  Status DecodeDC(BitReader* br);
+
+  const std::vector<QuantEncoding>& encodings() const { return encodings_; }
+
+  static constexpr size_t required_size_x[] = {1, 1, 1, 1, 2,  4, 1,  1, 2,
+                                               1, 1, 8, 4, 16, 8, 32, 16};
+  static_assert(kNum == sizeof(required_size_x) / sizeof(*required_size_x),
+                "Update this array when adding or removing quant tables.");
+
+  static constexpr size_t required_size_y[] = {1, 1, 1, 1, 2,  4,  2,  4, 4,
+                                               1, 1, 8, 8, 16, 16, 32, 32};
+  static_assert(kNum == sizeof(required_size_y) / sizeof(*required_size_y),
+                "Update this array when adding or removing quant tables.");
+
+  Status EnsureComputed(uint32_t acs_mask);
+
+ private:
+  static constexpr size_t required_size_[] = {
+      1, 1, 1, 1, 4, 16, 2, 4, 8, 1, 1, 64, 32, 256, 128, 1024, 512};
+  static_assert(kNum == sizeof(required_size_) / sizeof(*required_size_),
+                "Update this array when adding or removing quant tables.");
+  static constexpr size_t kTotalTableSize =
+      ArraySum(required_size_) * kDCTBlockSize * 3;
+
+  uint32_t computed_mask_ = 0;
+  // kTotalTableSize entries followed by kTotalTableSize for inv_table
+  hwy::AlignedFreeUniquePtr<float[]> table_storage_;
+  const float* table_;
+  const float* inv_table_;
+  float dc_quant_[3] = {kDCQuant[0], kDCQuant[1], kDCQuant[2]};
+  float inv_dc_quant_[3] = {kInvDCQuant[0], kInvDCQuant[1], kInvDCQuant[2]};
+  size_t table_offsets_[AcStrategy::kNumValidStrategies * 3];
+  std::vector<QuantEncoding> encodings_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_QUANT_WEIGHTS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc b/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc
new file mode 100644
index 0000000000..f0497948a7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc
@@ -0,0 +1,240 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+#include "lib/jxl/quant_weights.h"
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <hwy/base.h>  // HWY_ALIGN_MAX
+#include <hwy/tests/test_util-inl.h>
+#include <numeric>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dct_for_test.h"
+#include "lib/jxl/dec_transforms_testonly.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_transforms.h"
+
+namespace jxl {
+namespace {
+
+template <typename T>
+void CheckSimilar(T a, T b) {
+  EXPECT_EQ(a, b);
+}
+// minimum exponent = -15.
+template <>
+void CheckSimilar(float a, float b) {
+  float m = std::max(std::abs(a), std::abs(b));
+  // 10 bits of precision are used in the format. Relative error should be
+  // below 2^-10.
+  EXPECT_LE(std::abs(a - b), m / 1024.0f) << "a: " << a << " b: " << b;
+}
+
+TEST(QuantWeightsTest, DC) {
+  DequantMatrices mat;
+  float dc_quant[3] = {1e+5, 1e+3, 1e+1};
+  DequantMatricesSetCustomDC(&mat, dc_quant);
+  for (size_t c = 0; c < 3; c++) {
+    CheckSimilar(mat.InvDCQuant(c), dc_quant[c]);
+  }
+}
+
+void RoundtripMatrices(const std::vector<QuantEncoding>& encodings) {
+  ASSERT_TRUE(encodings.size() == DequantMatrices::kNum);
+  DequantMatrices mat;
+  CodecMetadata metadata;
+  FrameHeader frame_header(&metadata);
+  ModularFrameEncoder encoder(frame_header, CompressParams{});
+  DequantMatricesSetCustom(&mat, encodings, &encoder);
+  const std::vector<QuantEncoding>& encodings_dec = mat.encodings();
+  for (size_t i = 0; i < encodings.size(); i++) {
+    const QuantEncoding& e = encodings[i];
+    const QuantEncoding& d = encodings_dec[i];
+    // Check values roundtripped correctly.
+    EXPECT_EQ(e.mode, d.mode);
+    EXPECT_EQ(e.predefined, d.predefined);
+    EXPECT_EQ(e.source, d.source);
+
+    EXPECT_EQ(static_cast<uint64_t>(e.dct_params.num_distance_bands),
+              static_cast<uint64_t>(d.dct_params.num_distance_bands));
+    for (size_t c = 0; c < 3; c++) {
+      for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) {
+        CheckSimilar(e.dct_params.distance_bands[c][j],
+                     d.dct_params.distance_bands[c][j]);
+      }
+    }
+
+    if (e.mode == QuantEncoding::kQuantModeRAW) {
+      EXPECT_FALSE(!e.qraw.qtable);
+      EXPECT_FALSE(!d.qraw.qtable);
+      EXPECT_EQ(e.qraw.qtable->size(), d.qraw.qtable->size());
+      for (size_t j = 0; j < e.qraw.qtable->size(); j++) {
+        EXPECT_EQ((*e.qraw.qtable)[j], (*d.qraw.qtable)[j]);
+      }
+      EXPECT_NEAR(e.qraw.qtable_den, d.qraw.qtable_den, 1e-7f);
+    } else {
+      // modes different than kQuantModeRAW use one of the other fields used
+      // here, which all happen to be arrays of floats.
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t j = 0; j < 3; j++) {
+          CheckSimilar(e.idweights[c][j], d.idweights[c][j]);
+        }
+        for (size_t j = 0; j < 6; j++) {
+          CheckSimilar(e.dct2weights[c][j], d.dct2weights[c][j]);
+        }
+        for (size_t j = 0; j < 2; j++) {
+          CheckSimilar(e.dct4multipliers[c][j], d.dct4multipliers[c][j]);
+        }
+        CheckSimilar(e.dct4x8multipliers[c], d.dct4x8multipliers[c]);
+        for (size_t j = 0; j < 9; j++) {
+          CheckSimilar(e.afv_weights[c][j], d.afv_weights[c][j]);
+        }
+        for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) {
+          CheckSimilar(e.dct_params_afv_4x4.distance_bands[c][j],
+                       d.dct_params_afv_4x4.distance_bands[c][j]);
+        }
+      }
+    }
+  }
+}
+
+TEST(QuantWeightsTest, AllDefault) {
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::Library(0));
+  RoundtripMatrices(encodings);
+}
+
+void TestSingleQuantMatrix(DequantMatrices::QuantTable kind) {
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::Library(0));
+  encodings[kind] = DequantMatrices::Library()[kind];
+  RoundtripMatrices(encodings);
+}
+
+// Ensure we can reasonably represent default quant tables.
+TEST(QuantWeightsTest, DCT) { TestSingleQuantMatrix(DequantMatrices::DCT); }
+TEST(QuantWeightsTest, IDENTITY) {
+  TestSingleQuantMatrix(DequantMatrices::IDENTITY);
+}
+TEST(QuantWeightsTest, DCT2X2) {
+  TestSingleQuantMatrix(DequantMatrices::DCT2X2);
+}
+TEST(QuantWeightsTest, DCT4X4) {
+  TestSingleQuantMatrix(DequantMatrices::DCT4X4);
+}
+TEST(QuantWeightsTest, DCT16X16) {
+  TestSingleQuantMatrix(DequantMatrices::DCT16X16);
+}
+TEST(QuantWeightsTest, DCT32X32) {
+  TestSingleQuantMatrix(DequantMatrices::DCT32X32);
+}
+TEST(QuantWeightsTest, DCT8X16) {
+  TestSingleQuantMatrix(DequantMatrices::DCT8X16);
+}
+TEST(QuantWeightsTest, DCT8X32) {
+  TestSingleQuantMatrix(DequantMatrices::DCT8X32);
+}
+TEST(QuantWeightsTest, DCT16X32) {
+  TestSingleQuantMatrix(DequantMatrices::DCT16X32);
+}
+TEST(QuantWeightsTest, DCT4X8) {
+  TestSingleQuantMatrix(DequantMatrices::DCT4X8);
+}
+TEST(QuantWeightsTest, AFV0) { TestSingleQuantMatrix(DequantMatrices::AFV0); }
+TEST(QuantWeightsTest, RAW) {
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::Library(0));
+  std::vector<int> matrix(3 * 32 * 32);
+  Rng rng(0);
+  for (size_t i = 0; i < matrix.size(); i++) matrix[i] = rng.UniformI(1, 256);
+  encodings[DequantMatrices::kQuantTable[AcStrategy::DCT32X32]] =
+      QuantEncoding::RAW(matrix, 2);
+  RoundtripMatrices(encodings);
+}
+
+class QuantWeightsTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(QuantWeightsTargetTest);
+
+TEST_P(QuantWeightsTargetTest, DCTUniform) {
+  constexpr float kUniformQuant = 4;
+  float weights[3][2] = {{1.0f / kUniformQuant, 0},
+                         {1.0f / kUniformQuant, 0},
+                         {1.0f / kUniformQuant, 0}};
+  DctQuantWeightParams dct_params(weights);
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::DCT(dct_params));
+  DequantMatrices dequant_matrices;
+  CodecMetadata metadata;
+  FrameHeader frame_header(&metadata);
+  ModularFrameEncoder encoder(frame_header, CompressParams{});
+  DequantMatricesSetCustom(&dequant_matrices, encodings, &encoder);
+  JXL_CHECK(dequant_matrices.EnsureComputed(~0u));
+
+  const float dc_quant[3] = {1.0f / kUniformQuant, 1.0f / kUniformQuant,
+                             1.0f / kUniformQuant};
+  DequantMatricesSetCustomDC(&dequant_matrices, dc_quant);
+
+  HWY_ALIGN_MAX float scratch_space[16 * 16 * 2];
+
+  // DCT8
+  {
+    HWY_ALIGN_MAX float pixels[64];
+    std::iota(std::begin(pixels), std::end(pixels), 0);
+    HWY_ALIGN_MAX float coeffs[64];
+    const AcStrategy::Type dct = AcStrategy::DCT;
+    TransformFromPixels(dct, pixels, 8, coeffs, scratch_space);
+    HWY_ALIGN_MAX double slow_coeffs[64];
+    for (size_t i = 0; i < 64; i++) slow_coeffs[i] = pixels[i];
+    DCTSlow<8>(slow_coeffs);
+
+    for (size_t i = 0; i < 64; i++) {
+      // DCTSlow doesn't multiply/divide by 1/N, so we do it manually.
+      slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant;
+      coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) *
+                  dequant_matrices.Matrix(dct, 0)[i];
+    }
+    IDCTSlow<8>(slow_coeffs);
+    TransformToPixels(dct, coeffs, pixels, 8, scratch_space);
+    for (size_t i = 0; i < 64; i++) {
+      EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4);
+    }
+  }
+
+  // DCT16
+  {
+    HWY_ALIGN_MAX float pixels[64 * 4];
+    std::iota(std::begin(pixels), std::end(pixels), 0);
+    HWY_ALIGN_MAX float coeffs[64 * 4];
+    const AcStrategy::Type dct = AcStrategy::DCT16X16;
+    TransformFromPixels(dct, pixels, 16, coeffs, scratch_space);
+    HWY_ALIGN_MAX double slow_coeffs[64 * 4];
+    for (size_t i = 0; i < 64 * 4; i++) slow_coeffs[i] = pixels[i];
+    DCTSlow<16>(slow_coeffs);
+
+    for (size_t i = 0; i < 64 * 4; i++) {
+      slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant;
+      coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) *
+                  dequant_matrices.Matrix(dct, 0)[i];
+    }
+
+    IDCTSlow<16>(slow_coeffs);
+    TransformToPixels(dct, coeffs, pixels, 16, scratch_space);
+    for (size_t i = 0; i < 64 * 4; i++) {
+      EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4);
+    }
+  }
+
+  // Check that all matrices have the same DC quantization, i.e. that they all
+  // have the same scaling.
+  for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) {
+    EXPECT_NEAR(dequant_matrices.Matrix(i, 0)[0], kUniformQuant, 1e-6);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer-inl.h b/third_party/jpeg-xl/lib/jxl/quantizer-inl.h
new file mode 100644
index 0000000000..64d273c552
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer-inl.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_QUANTIZER_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_QUANTIZER_INL_H_
+#undef LIB_JXL_QUANTIZER_INL_H_
+#else
+#define LIB_JXL_QUANTIZER_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::ApproximateReciprocal;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::Xor;
+
+template <class DI>
+HWY_INLINE HWY_MAYBE_UNUSED Vec<Rebind<float, DI>> AdjustQuantBias(
+    DI di, const size_t c, const Vec<DI> quant_i,
+    const float* HWY_RESTRICT biases) {
+  const Rebind<float, DI> df;
+
+  const auto quant = ConvertTo(df, quant_i);
+
+  // Compare |quant|, keep sign bit for negating result.
+  const auto kSign = BitCast(df, Set(di, INT32_MIN));
+  const auto sign = And(quant, kSign);  // TODO(janwas): = abs ^ orig
+  const auto abs_quant = AndNot(kSign, quant);
+
+  // If |x| is 1, kZeroBias creates a different bias for each channel.
+  // We're implementing the following:
+  // if (quant == 0) return 0;
+  // if (quant == 1) return biases[c];
+  // if (quant == -1) return -biases[c];
+  // return quant - biases[3] / quant;
+
+  // Integer comparison is not helpful because Clang incurs bypass penalties
+  // from unnecessarily mixing integer and float.
+  const auto is_01 = Lt(abs_quant, Set(df, 1.125f));
+  const auto not_0 = Gt(abs_quant, Zero(df));
+
+  // Bitwise logic is faster than quant * biases[c].
+  const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign));
+
+  // About 2E-5 worse than ReciprocalNR or division.
+  const auto bias =
+      NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant);
+
+  return IfThenElse(is_01, one_bias, bias);
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_QUANTIZER_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer.cc b/third_party/jpeg-xl/lib/jxl/quantizer.cc
new file mode 100644
index 0000000000..153cf19b21
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer.cc
@@ -0,0 +1,156 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/quantizer.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+static const int32_t kDefaultQuant = 64;
+
+constexpr int32_t Quantizer::kQuantMax;
+
+Quantizer::Quantizer(const DequantMatrices* dequant)
+    : Quantizer(dequant, kDefaultQuant, kGlobalScaleDenom / kDefaultQuant) {}
+
+Quantizer::Quantizer(const DequantMatrices* dequant, int quant_dc,
+                     int global_scale)
+    : global_scale_(global_scale), quant_dc_(quant_dc), dequant_(dequant) {
+  JXL_ASSERT(dequant_ != nullptr);
+  // Also derives inv_quant_dc_ and the per-channel DC multipliers.
+  RecomputeFromGlobalScale();
+
+  memcpy(zero_bias_, kZeroBiasDefault, sizeof(kZeroBiasDefault));
+}
+
+void Quantizer::ComputeGlobalScaleAndQuant(float quant_dc, float quant_median,
+                                           float quant_median_absd) {
+  // The median of the quant field is steered towards this value.
+  const float kQuantFieldTarget = 5;
+  // The median is reduced by the median absolute deviation: this gives
+  // higher resolution on highly varying quant fields.
+  float scale = kGlobalScaleDenom * (quant_median - quant_median_absd) /
+                kQuantFieldTarget;
+  // Keep the candidate global scale within [1, 1 << 15].
+  if (scale < 1) scale = 1;
+  if (scale > (1 << 15)) scale = 1 << 15;
+  int candidate_scale = static_cast<int>(scale);
+  // Cap the global scale so that quant_dc_ will always be at least
+  // 0.625 * kGlobalScaleDenom/kGlobalScaleNumerator = 10.
+  const int dc_scale_cap =
+      static_cast<int>(quant_dc * kGlobalScaleNumerator * 1.6);
+  if (candidate_scale > dc_scale_cap) {
+    // A non-positive cap must still result in a positive global scale.
+    candidate_scale = dc_scale_cap <= 0 ? 1 : dc_scale_cap;
+  }
+  global_scale_ = candidate_scale;
+  // Refresh inv_global_scale_, which the DC computation below reads.
+  RecomputeFromGlobalScale();
+
+  // Round quant_dc to raw units, limited to at most 1 << 16.
+  float raw_dc = quant_dc * inv_global_scale_ + 0.5f;
+  raw_dc = std::min<float>(1 << 16, raw_dc);
+  quant_dc_ = static_cast<int>(raw_dc);
+
+  // quant_dc_ changed, so the derived values must be refreshed again.
+  RecomputeFromGlobalScale();
+}
+
+void Quantizer::SetQuantFieldRect(const ImageF& qf, const Rect& rect,
+                                  ImageI* JXL_RESTRICT raw_quant_field) const {
+  // Per pixel: scale by inv_global_scale_, round, clamp via ClampVal().
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    const float* JXL_RESTRICT row_in = rect.ConstRow(qf, y);
+    int32_t* JXL_RESTRICT row_out = rect.Row(raw_quant_field, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      row_out[x] = ClampVal(row_in[x] * inv_global_scale_ + 0.5f);
+    }
+  }
+}
+
+void Quantizer::SetQuantField(const float quant_dc, const ImageF& qf,
+                              ImageI* JXL_RESTRICT raw_quant_field) {
+  // Flatten the field row by row so order statistics can be computed.
+  std::vector<float> data(qf.xsize() * qf.ysize());
+  for (size_t y = 0; y < qf.ysize(); ++y) {
+    const float* JXL_RESTRICT row_qf = qf.Row(y);
+    std::copy(row_qf, row_qf + qf.xsize(), data.begin() + qf.xsize() * y);
+  }
+  // Median (the upper one for an even sample count).
+  const size_t mid = data.size() / 2;
+  std::nth_element(data.begin(), data.begin() + mid, data.end());
+  const float quant_median = data[mid];
+  // Median absolute deviation around quant_median.
+  std::vector<float> deviations(data.size());
+  for (size_t i = 0; i < data.size(); i++) {
+    deviations[i] = fabsf(data[i] - quant_median);
+  }
+  std::nth_element(deviations.begin(), deviations.begin() + mid,
+                   deviations.end());
+  const float quant_median_absd = deviations[mid];
+  ComputeGlobalScaleAndQuant(quant_dc, quant_median, quant_median_absd);
+  if (raw_quant_field) {
+    JXL_CHECK(SameSize(*raw_quant_field, qf));
+    SetQuantFieldRect(qf, Rect(qf), raw_quant_field);
+  }
+}
+
+void Quantizer::SetQuant(float quant_dc, float quant_ac,
+                         ImageI* JXL_RESTRICT raw_quant_field) {
+  // Uniform field: quant_ac is passed as the median, with zero deviation.
+  ComputeGlobalScaleAndQuant(quant_dc, quant_ac, 0);
+  FillImage(ClampVal(quant_ac * inv_global_scale_ + 0.5f), raw_quant_field);
+}
+
+Status QuantizerParams::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+      BitsOffset(11, 1), BitsOffset(11, 2049), BitsOffset(12, 4097),
+      BitsOffset(16, 8193), 1, &global_scale));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), BitsOffset(5, 1),
+                                         BitsOffset(8, 1), BitsOffset(16, 1), 1,
+                                         &quant_dc));
+  return true;
+}
+
+QuantizerParams Quantizer::GetParams() const {
+  QuantizerParams params;
+  params.global_scale = global_scale_;
+  params.quant_dc = quant_dc_;
+  return params;
+}
+
+Status Quantizer::Decode(BitReader* reader) {
+  QuantizerParams params;
+  JXL_RETURN_IF_ERROR(Bundle::Read(reader, &params));
+  global_scale_ = static_cast<int>(params.global_scale);
+  quant_dc_ = static_cast<int>(params.quant_dc);
+  RecomputeFromGlobalScale();
+  return true;
+}
+
+void Quantizer::DumpQuantizationMap(const ImageI& raw_quant_field) const {
+  printf("Global scale: %d (%.7f)\nDC quant: %d\n", global_scale_,
+         global_scale_ * 1.0 / kGlobalScaleDenom, quant_dc_);
+  printf("AC quantization Map:\n");
+  for (size_t y = 0; y < raw_quant_field.ysize(); ++y) {
+    for (size_t x = 0; x < raw_quant_field.xsize(); ++x) {
+      printf(" %3d", raw_quant_field.Row(y)[x]);
+    }
+    printf("\n");
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer.h b/third_party/jpeg-xl/lib/jxl/quantizer.h
new file mode 100644
index 0000000000..d78ba7b3fc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer.h
@@ -0,0 +1,182 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_QUANTIZER_H_
+#define LIB_JXL_QUANTIZER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quant_weights.h"
+
+// Quantizes DC and AC coefficients, with separate quantization tables according
+// to the quant_kind (which is currently computed from the AC strategy and the
+// block index inside that strategy).
+
+namespace jxl {
+
+static constexpr int kGlobalScaleDenom = 1 << 16;
+static constexpr int kGlobalScaleNumerator = 4096;
+
+// zero-biases for quantizing channels X, Y, B
+static constexpr float kZeroBiasDefault[3] = {0.5f, 0.5f, 0.5f};
+
+// Returns adjusted version of a quantized integer, such that its value is
+// closer to the expected value of the original.
+// The residuals of AC coefficients that we quantize are not uniformly
+// distributed. Numerical experiments show that they have a distribution with
+// the "shape" of 1/(1+x^2) [up to some coefficients]. This means that the
+// expected value of a coefficient that gets quantized to x will not be x
+// itself, but (at least with reasonable approximation):
+// - 0 if x is 0
+// - x * biases[c] if x is 1 or -1
+// - x - biases[3]/x otherwise
+// This follows from computing the distribution of the quantization bias, which
+// can be approximated fairly well by <constant>/x when |x| is at least two.
+static constexpr float kBiasNumerator = 0.145f;
+
+static constexpr float kDefaultQuantBias[4] = {
+    1.0f - 0.05465007330715401f,
+    1.0f - 0.07005449891748593f,
+    1.0f - 0.049935103337343655f,
+    0.145f,
+};
+
+struct QuantizerParams;
+
+class Quantizer {
+ public:
+  explicit Quantizer(const DequantMatrices* dequant);
+  Quantizer(const DequantMatrices* dequant, int quant_dc, int global_scale);
+
+  static constexpr int32_t kQuantMax = 256;
+
+  static JXL_INLINE int32_t ClampVal(float val) {
+    return static_cast<int32_t>(
+        std::max(1.0f, std::min<float>(val, kQuantMax)));
+  }
+
+  float ScaleGlobalScale(const float scale) {
+    int new_global_scale = static_cast<int>(global_scale_ * scale + 0.5f);
+    float scale_out = new_global_scale * 1.0f / global_scale_;
+    global_scale_ = new_global_scale;
+    RecomputeFromGlobalScale();
+    return scale_out;
+  }
+
+  // Recomputes other derived fields after global_scale_ has changed.
+  void RecomputeFromGlobalScale() {
+    global_scale_float_ = global_scale_ * (1.0 / kGlobalScaleDenom);
+    inv_global_scale_ = 1.0 * kGlobalScaleDenom / global_scale_;
+    inv_quant_dc_ = inv_global_scale_ / quant_dc_;
+    for (size_t c = 0; c < 3; c++) {
+      mul_dc_[c] = GetDcStep(c);
+      inv_mul_dc_[c] = GetInvDcStep(c);
+    }
+  }
+
+  // Returns scaling factor such that Scale() * (RawDC() or RawQuantField())
+  // pixels yields the same float values returned by GetQuantField.
+  JXL_INLINE float Scale() const { return global_scale_float_; }
+
+  // Reciprocal of Scale().
+  JXL_INLINE float InvGlobalScale() const { return inv_global_scale_; }
+
+  void SetQuantFieldRect(const ImageF& qf, const Rect& rect,
+                         ImageI* JXL_RESTRICT raw_quant_field) const;
+
+  void SetQuantField(float quant_dc, const ImageF& qf,
+                     ImageI* JXL_RESTRICT raw_quant_field);
+
+  void SetQuant(float quant_dc, float quant_ac,
+                ImageI* JXL_RESTRICT raw_quant_field);
+
+  // Returns the DC quantization base value, which is currently global (not
+  // adaptive). The actual scale factor used to dequantize pixels in channel c
+  // is: inv_quant_dc() * dequant_->DCQuant(c).
+  float inv_quant_dc() const { return inv_quant_dc_; }
+
+  // Dequantize by multiplying with this times dequant_matrix.
+  float inv_quant_ac(int32_t quant) const { return inv_global_scale_ / quant; }
+
+  QuantizerParams GetParams() const;
+
+  Status Decode(BitReader* reader);
+
+  void DumpQuantizationMap(const ImageI& raw_quant_field) const;
+
+  JXL_INLINE const float* DequantMatrix(size_t quant_kind, size_t c) const {
+    return dequant_->Matrix(quant_kind, c);
+  }
+
+  JXL_INLINE const float* InvDequantMatrix(size_t quant_kind, size_t c) const {
+    return dequant_->InvMatrix(quant_kind, c);
+  }
+
+  // Calculates DC quantization step.
+  JXL_INLINE float GetDcStep(size_t c) const {
+    return inv_quant_dc_ * dequant_->DCQuant(c);
+  }
+  JXL_INLINE float GetInvDcStep(size_t c) const {
+    return dequant_->InvDCQuant(c) * (global_scale_float_ * quant_dc_);
+  }
+
+  JXL_INLINE const float* MulDC() const { return mul_dc_; }
+  JXL_INLINE const float* InvMulDC() const { return inv_mul_dc_; }
+
+  JXL_INLINE void ClearDCMul() {
+    std::fill(mul_dc_, mul_dc_ + 4, 1.f);
+    std::fill(inv_mul_dc_, inv_mul_dc_ + 4, 1.f);
+  }
+
+  void ComputeGlobalScaleAndQuant(float quant_dc, float quant_median,
+                                  float quant_median_absd);
+
+ private:
+  float mul_dc_[4];
+  float inv_mul_dc_[4];
+
+  // These are serialized:
+  int global_scale_;
+  int quant_dc_;
+
+  // These are derived from global_scale_:
+  float inv_global_scale_;
+  float global_scale_float_;  // reciprocal of inv_global_scale_
+  float inv_quant_dc_;
+
+  float zero_bias_[3];
+  const DequantMatrices* dequant_;
+};
+
+struct QuantizerParams : public Fields {
+  QuantizerParams() { Bundle::Init(this); }
+  JXL_FIELDS_NAME(QuantizerParams)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  uint32_t global_scale;
+  uint32_t quant_dc;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_QUANTIZER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer_test.cc b/third_party/jpeg-xl/lib/jxl/quantizer_test.cc
new file mode 100644
index 0000000000..f9cf2c838e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer_test.cc
@@ -0,0 +1,81 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/quantizer.h"
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void TestEquivalence(int qxsize, int qysize, const Quantizer& quantizer1,
+                     const Quantizer& quantizer2) {
+  ASSERT_NEAR(quantizer1.inv_quant_dc(), quantizer2.inv_quant_dc(), 1e-7);
+}
+
+TEST(QuantizerTest, QuantizerParams) {
+  for (uint32_t i = 1; i < 10000; ++i) {
+    QuantizerParams p;
+    p.global_scale = i;
+    size_t extension_bits = 0, total_bits = 0;
+    EXPECT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+    EXPECT_GE(total_bits, 4u);
+  }
+}
+
+TEST(QuantizerTest, BitStreamRoundtripSameQuant) {
+  const int qxsize = 8;
+  const int qysize = 8;
+  DequantMatrices dequant;
+  Quantizer quantizer1(&dequant);
+  ImageI raw_quant_field(qxsize, qysize);
+  quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field);
+  BitWriter writer;
+  QuantizerParams params = quantizer1.GetParams();
+  EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  const size_t bits_written = writer.BitsWritten();
+  Quantizer quantizer2(&dequant);
+  BitReader reader(writer.GetSpan());
+  EXPECT_TRUE(quantizer2.Decode(&reader));
+  EXPECT_TRUE(reader.JumpToByteBoundary());
+  EXPECT_EQ(reader.TotalBitsConsumed(), bits_written);
+  EXPECT_TRUE(reader.Close());
+  TestEquivalence(qxsize, qysize, quantizer1, quantizer2);
+}
+
+TEST(QuantizerTest, BitStreamRoundtripRandomQuant) {
+  const int qxsize = 8;
+  const int qysize = 8;
+  DequantMatrices dequant;
+  Quantizer quantizer1(&dequant);
+  ImageI raw_quant_field(qxsize, qysize);
+  quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field);
+  float quant_dc = 0.17f;
+  ImageF qf(qxsize, qysize);
+  RandomFillImage(&qf, 0.0f, 1.0f);
+  quantizer1.SetQuantField(quant_dc, qf, &raw_quant_field);
+  BitWriter writer;
+  QuantizerParams params = quantizer1.GetParams();
+  EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  const size_t bits_written = writer.BitsWritten();
+  Quantizer quantizer2(&dequant);
+  BitReader reader(writer.GetSpan());
+  EXPECT_TRUE(quantizer2.Decode(&reader));
+  EXPECT_TRUE(reader.JumpToByteBoundary());
+  EXPECT_EQ(reader.TotalBitsConsumed(), bits_written);
+  EXPECT_TRUE(reader.Close());
+  TestEquivalence(qxsize, qysize, quantizer1, quantizer2);
+}
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h b/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h
new file mode 100644
index 0000000000..176e24092c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD evaluation of rational polynomials for approximating functions.
+
+#if defined(LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#undef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#else
+#define LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+// Primary template: true division, spelled Div() (more portable than n / d).
+template <typename T, class V>
+struct FastDivision {
+  HWY_INLINE V operator()(const V n, const V d) const { return Div(n, d); }
+};
+// Float specialization: can divide via a refined reciprocal (disabled below).
+template <class V>
+struct FastDivision<float, V> {
+  // Returns r * (2 - x * r), where r = ApproximateReciprocal(x).
+  static HWY_INLINE V ReciprocalNR(const V x) {
+    const V estimate = ApproximateReciprocal(x);
+    const V twice_estimate = Add(estimate, estimate);
+    const V residual = Mul(x, estimate);
+    return NegMulAdd(residual, estimate, twice_estimate);
+  }
+
+  V operator()(const V n, const V d) const {
+#if 1  // Faster on SKX
+    return Div(n, d);
+#else
+    return n * ReciprocalNR(d);
+#endif
+  }
+};
+
+// Approximates smooth functions via rational polynomials (i.e. dividing two
+// polynomials). Evaluates polynomials via Horner's scheme, which is faster than
+// Clenshaw recurrence for Chebyshev polynomials. p/q hold 1..8 coefficients in
+// ascending order, each replicated 4x so LoadDup128 works for any lane count.
+template <size_t NP, size_t NQ, class D, class V, typename T>
+HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
+                                                     const T (&p)[NP],
+                                                     const T (&q)[NQ]) {
+  constexpr size_t kDegP = NP / 4 - 1;
+  constexpr size_t kDegQ = NQ / 4 - 1;
+  // Only 7 Horner steps are unrolled below; NP/NQ < 4 wraps and fails as well.
+  static_assert(kDegP < 8, "p must hold 1..8 coefficients, each replicated 4x");
+  static_assert(kDegQ < 8, "q must hold 1..8 coefficients, each replicated 4x");
+  auto yp = LoadDup128(d, &p[kDegP * 4]);
+  auto yq = LoadDup128(d, &q[kDegQ * 4]);
+  // p + k instead of &p[k] avoids out-of-bounds warnings in dead branches.
+  HWY_FENCE;
+  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
+  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
+  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
+  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
+  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
+  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
+  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
+  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
+
+  return FastDivision<T, V>()(yp, yq);
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc b/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc
new file mode 100644
index 0000000000..13fc044a55
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc
@@ -0,0 +1,238 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <string>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/rational_polynomial_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/rational_polynomial-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+using T = float;  // required by EvalLog2
+using D = HWY_FULL(T);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Evaluates p(x)/q(x) with no range reduction; returns the first vector lane.
+struct EvalPoly {
+  template <size_t NP, size_t NQ>
+  T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const {
+    const D df;
+    // Broadcast the scalar so the SIMD evaluator can be used unchanged.
+    const auto ratio = EvalRationalPolynomial(df, Set(df, x), p, q);
+    return GetLane(ratio);
+  }
+};
+
+// Range reduction for log2: p/q sees the reduced input, exponent is added back.
+struct EvalLog2 {
+  template <size_t NP, size_t NQ>
+  T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const {
+    const D df;
+    const HWY_FULL(int32_t) di;
+
+    const auto input = Set(df, x);
+    const auto raw_bits = BitCast(di, input);
+    // Cannot handle negative numbers / NaN.
+    JXL_DASSERT(AllTrue(di, Eq(Abs(raw_bits), raw_bits)));
+
+    // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
+    const auto rebased = Sub(raw_bits, Set(di, 0x3f2aaaab));  // = 2/3
+    // Shifted exponent = log2; also used to clear mantissa.
+    const auto exponent = ShiftRight<23>(rebased);
+    // Removing the exponent bits leaves the value that is then centered on 0.
+    const auto reduced = BitCast(df, Sub(raw_bits, ShiftLeft<23>(exponent)));
+    const auto centered = Sub(reduced, Set(df, 1.0f));
+
+    const auto ratio = EvalRationalPolynomial(df, centered, p, q);
+    return GetLane(Add(ratio, ConvertTo(df, exponent)));
+  }
+};
+
+// Functions to approximate:
+
+T LinearToSrgb8Direct(T val) {  // Reference curve; output stays in [0, 255].
+  if (val < 0.0 || val >= 255.0) return val < 0.0 ? 0.0 : 255.0;  // Clamp.
+  if (val <= 10.0 / 12.92) return val * 12.92;  // Linear segment near zero.
+  const double normalized = val / 255.0;
+  return 255.0 * (std::pow(normalized, 1.0 / 2.4) * 1.055 - 0.055);
+}
+
+T SimpleGamma(T v) {
+  // Curve parameters; each literal is rounded to T exactly as before.
+  constexpr T kGamma = 0.387494322593;
+  constexpr T kOffset = 0.156775786057;
+  constexpr T kScale = 8.898059160493739;
+  constexpr T kKnee1 = 43.01745241042018;
+  constexpr T kSlope1 = 0.0383723643799;
+  constexpr T kKnee2 = 94.68634353321337;
+  constexpr T kSlope2 = 0.22885405968;
+
+  // Reduce the part of v above the first knee.
+  const T excess1 = v - kKnee1;
+  if (excess1 >= 0) v -= excess1 * kSlope1;
+  // The second knee is tested against the already-reduced value.
+  const T excess2 = v - kKnee2;
+  if (excess2 >= 0) v -= excess2 * kSlope2;
+
+  return kScale * (kOffset + pow(v, kGamma));
+}
+
+// Samples func_to_approx and eval(x, p, q) at ~10000 points of [x0, x1], prints
+// about 20 of them, and returns the largest absolute difference observed.
+template <size_t NP, size_t NQ, class Eval>
+T RunApproximation(T x0, T x1, const T (&p)[NP], const T (&q)[NQ],
+                   const Eval& eval, T func_to_approx(T)) {
+  float worst = 0;
+  T last_printed = 0;
+  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
+  for (T x = x0; x <= x1; x += (x1 - x0) / 10000.0) {
+    const T expected = func_to_approx(x);
+    const T actual = eval(x, p, q);
+    worst = std::max(fabsf(actual - expected), worst);
+    const bool should_print = x == x0 || x - last_printed > (x1 - x0) / 20.0;
+    if (!should_print) continue;
+    printf("x: %11.6f, f: %11.6f, g: %11.6f, e: %11.6f\n", x, expected, actual,
+           fabs(actual - expected));
+    last_printed = x;
+  }
+  return worst;
+}
+
+void TestSimpleGamma() {
+  // Numerator coefficients in ascending order of power, each replicated 4x.
+  const T num[4 * (6 + 1)] = {
+      HWY_REP4(-5.0646949363741811E-05), HWY_REP4(6.7369380528439771E-05),
+      HWY_REP4(8.9376652530412794E-05),  HWY_REP4(2.1153513301520462E-06),
+      HWY_REP4(-6.9130322970386449E-08), HWY_REP4(3.9424752749293728E-10),
+      HWY_REP4(1.2360288207619576E-13)};
+  // Denominator coefficients, same layout.
+  const T den[4 * (6 + 1)] = {
+      HWY_REP4(-6.6389733798591366E-06), HWY_REP4(1.3299859726565908E-05),
+      HWY_REP4(3.8538748358398873E-06),  HWY_REP4(-2.8707687262928236E-08),
+      HWY_REP4(-6.6897385800005434E-10), HWY_REP4(6.1428748869186003E-12),
+      HWY_REP4(-2.5475738169252870E-15)};
+  const T worst = RunApproximation(0.77, 274.579999999999984, num, den,
+                                   EvalPoly(), SimpleGamma);
+  EXPECT_LT(worst, 0.05);
+}
+
+void TestLinearToSrgb8Direct() {
+  // Degree-5 numerator; each coefficient is replicated 4x.
+  const T num[4 * (5 + 1)] = {
+      HWY_REP4(-9.5357499040105154E-05), HWY_REP4(4.6761186249798248E-04),
+      HWY_REP4(2.5708174333943594E-04),  HWY_REP4(1.5250087770436082E-05),
+      HWY_REP4(1.1946768008931187E-07),  HWY_REP4(5.9916446295972850E-11)};
+  // Degree-4 denominator, same layout.
+  const T den[4 * (4 + 1)] = {
+      HWY_REP4(1.8932479758079768E-05), HWY_REP4(2.7312342474687321E-05),
+      HWY_REP4(4.3901204783327006E-06), HWY_REP4(1.0417787306920273E-07),
+      HWY_REP4(3.0084206762140419E-10)};
+  const T worst = RunApproximation(0.77, 255, num, den, EvalPoly(),
+                                   LinearToSrgb8Direct);
+  EXPECT_LT(worst, 0.05);
+}
+
+void TestExp() {  // Degree-2/2 approximation of exp, sampled on [-1, 1].
+  const T num[4 * (2 + 1)] = {HWY_REP4(9.6266879665530902E-01),
+                              HWY_REP4(4.8961265681586763E-01),
+                              HWY_REP4(8.2619259189548433E-02)};
+  const T den[4 * (2 + 1)] = {HWY_REP4(9.6259895571622622E-01),
+                              HWY_REP4(-4.7272457588933831E-01),
+                              HWY_REP4(7.4802088567547664E-02)};
+  const auto exact = [](T x) { return T(exp(x)); };
+  const T worst = RunApproximation(-1, 1, num, den, EvalPoly(), exact);
+  EXPECT_LT(worst, 1E-4);
+}
+
+void TestNegExp() {
+  // 4,3 is the min required for monotonicity; max error in 0,10: 751 ppm
+  // no benefit for k>50.
+  const T num[4 * (4 + 1)] = {
+      HWY_REP4(5.9580258551150123E-02), HWY_REP4(-2.5073728806886408E-02),
+      HWY_REP4(4.1561830213689248E-03), HWY_REP4(-3.1815408488900372E-04),
+      HWY_REP4(9.3866690094906802E-06)};
+  const T den[4 * (3 + 1)] = {
+      HWY_REP4(5.9579108238812878E-02), HWY_REP4(3.4542074345478582E-02),
+      HWY_REP4(8.7263562483501714E-03), HWY_REP4(1.4095109143061216E-03)};
+
+  const auto exact = [](T x) { return T(exp(-x)); };
+  const T worst = RunApproximation(0, 10, num, den, EvalPoly(), exact);
+  EXPECT_LT(worst, sizeof(T) == 8 ? 2E-5 : 3E-5);
+}
+
+void TestSin() {  // Degree-6/5 approximation of sin, sampled on [0, 2*pi].
+  const T num[4 * (6 + 1)] = {
+      HWY_REP4(1.5518122109203780E-05),  HWY_REP4(2.3388958643675966E+00),
+      HWY_REP4(-8.6705520940849157E-01), HWY_REP4(-1.9702294764873535E-01),
+      HWY_REP4(1.2193404314472320E-01),  HWY_REP4(-1.7373966109788839E-02),
+      HWY_REP4(7.8829435883034796E-04)};
+  const T den[4 * (5 + 1)] = {
+      HWY_REP4(2.3394371422557279E+00), HWY_REP4(-8.7028221081288615E-01),
+      HWY_REP4(2.0052872219658430E-01), HWY_REP4(-3.2460335995264836E-02),
+      HWY_REP4(3.1546157932479282E-03), HWY_REP4(-1.6692542019380155E-04)};
+  const auto exact = [](T x) { return T(sin(x)); };
+  const T period = Pi<T>(1) * 2;
+  const T worst = RunApproximation(0, period, num, den, EvalPoly(), exact);
+  EXPECT_LT(worst, sizeof(T) == 8 ? 5E-4 : 7E-4);
+}
+
+void TestLog() {
+  HWY_ALIGN const T p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06),
+                                      HWY_REP4(1.4287160470083755E+00),
+                                      HWY_REP4(7.4245873327820566E-01)};
+  HWY_ALIGN const T q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01),
+                                      HWY_REP4(1.0096718572241148E+00),
+                                      HWY_REP4(1.7409343003366853E-01)};
+  const T err = RunApproximation(1E-6, 1000, p, q, EvalLog2(), std::log2);
+  printf("%E\n", err);
+}
+
+HWY_NOINLINE void TestRationalPolynomial() {  // Runs all six tests in order.
+  TestSimpleGamma();
+  TestLinearToSrgb8Direct();
+  TestExp();
+  TestNegExp();
+  TestSin();
+  TestLog();  // Only prints its error; it contains no expectation.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class RationalPolynomialTest : public hwy::TestWithParamTarget {};  // Fixture.
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(RationalPolynomialTest);
+// One registration per test function; presumably run once per Highway target.
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSimpleGamma);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLinearToSrgb8Direct);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestExp);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestNegExp);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSin);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLog);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
new file mode 100644
index 0000000000..db60a458db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
@@ -0,0 +1,865 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+
+#include <algorithm>
+#include <queue>
+#include <tuple>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+std::pair<size_t, size_t>
+LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
+    std::pair<size_t, size_t> in, size_t c, size_t stage) const {
+  // Scale up by base_color_shift_, then divide by 2^shift rounding up.
+  const std::pair<size_t, size_t>& shift = channel_shifts_[stage][c];
+  const auto convert = [this](size_t color_dim, size_t channel_shift) {
+    const size_t upsampled = color_dim << base_color_shift_;
+    return (upsampled + (1 << channel_shift) - 1) >> channel_shift;
+  };
+  return {convert(in.first, shift.first), convert(in.second, shift.second)};
+}
+
+std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
+    size_t c) const {
+  // group_border_ converted to channel `c` units, plus padding_[0][c].
+  const auto scaled = ColorDimensionsToChannelDimensions(group_border_, c, 0);
+  const auto& extra = padding_[0][c];
+  return {scaled.first + extra.first, scaled.second + extra.second};
+}
+// Stores the edge strips of one group's channel `c` data for later LoadBorders.
+void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
+                                          const ImageF& in) {
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+  // [x0, x1) x [y0, y1) is this group's area in channel `c` coordinates.
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;
+  size_t bordery_write = borders.second;
+  // Strip 2*g-1 holds the leading edge of group g, strip 2*g its trailing edge.
+  if (gy > 0) {  // Top rows; LoadBorders of the group above reads them.
+    Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gy + 1 < frame_dimensions_.ysize_groups) {  // Bottom rows.
+    Rect from(group_data_x_border_,
+              group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gx > 0) {  // Left columns; read by the group to the left.
+    Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
+              y1 - y0);
+    Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+  if (gx + 1 < frame_dimensions_.xsize_groups) {  // Right columns.
+    Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
+              group_data_y_border_, borderx_write, y1 - y0);
+    Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+}
+// Copies strips stored by SaveBorders into the margins of `out` around a group.
+void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
+                                          const Rect& r, ImageF* out) {
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  // Coordinates of the group in the image (in units of channel `c`).
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  size_t paddingx = padding_[0][c].first;
+  size_t paddingy = padding_[0][c].second;
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;
+  size_t bordery_write = borders.second;
+
+  // Limits of the area to copy from, in image coordinates: `r` converted to
+  // channel units and widened by the padding, except at the image's left edge.
+  JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
+  size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
+  if (x0src != 0) {
+    x0src -= paddingx;
+  }
+  // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
+  // right side of the image, so we use min() here.
+  size_t x1src =
+      DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
+  x1src = std::min(x1src + paddingx,
+                   DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+
+  // Similar computation for y.
+  JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
+  size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
+  if (y0src != 0) {
+    y0src -= paddingy;
+  }
+  size_t y1src =
+      DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
+  y1src = std::min(y1src + paddingy,
+                   DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  // Copy other groups' borders from the border storage.
+  if (y0src < y0) {  // Rows above: strip 2*gy-2, the bottom rows of row gy-1.
+    JXL_DASSERT(gy > 0);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0,
+             group_data_y_border_ - bordery_write, x1src - x0src,
+             bordery_write),
+        out);
+  }
+  if (y1src > y1) {  // Rows below: strip 2*gy+1, the top rows of row gy+1.
+    // When copying the bottom border we must not be on the bottom groups.
+    JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
+             x1src - x0src, bordery_write),
+        out);
+  }
+  if (x0src < x0) {  // Columns to the left: right columns of column gx-1.
+    JXL_DASSERT(gx > 0);
+    CopyImageTo(
+        Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
+        borders_vertical_[c],
+        Rect(group_data_x_border_ - borderx_write,
+             group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
+        out);
+  }
+  if (x1src > x1) {  // Columns to the right: left columns of column gx+1.
+    // When copying the right border we must not be on the rightmost groups.
+    JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
+    CopyImageTo(
+        Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
+        borders_vertical_[c],
+        Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
+             borderx_write, y1src - y0src),
+        out);
+  }
+}
+
+size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
+  // Group width scaled by base_color_shift_, then reduced by the channel shift.
+  const auto upsampled_dim = frame_dimensions_.group_dim << base_color_shift_;
+  return upsampled_dim >> channel_shifts_[0][c].first; }
+
+size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
+  // Group height scaled by base_color_shift_, then reduced by the channel shift.
+  const auto upsampled_dim = frame_dimensions_.group_dim << base_color_shift_;
+  return upsampled_dim >> channel_shifts_[0][c].second; }
+
+void LowMemoryRenderPipeline::EnsureBordersStorage() {
+  const auto& shifts = channel_shifts_[0];
+  const size_t num_channels = shifts.size();
+  if (borders_horizontal_.size() < num_channels) {
+    borders_horizontal_.resize(num_channels);
+    borders_vertical_.resize(num_channels);
+  }
+  for (size_t c = 0; c < num_channels; c++) {
+    const std::pair<size_t, size_t> border = BorderToStore(c);
+    // Each boundary between adjacent groups keeps one strip per side.
+    JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
+    const size_t strips_x = (frame_dimensions_.xsize_groups - 1) * 2;
+    JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
+    const size_t strips_y = (frame_dimensions_.ysize_groups - 1) * 2;
+    const size_t plane_xsize =
+        DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
+    const size_t plane_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
+                                       1 << shifts[c].second);
+    const Rect rows(0, 0, plane_xsize, border.second * strips_y);
+    if (!SameSize(rows, borders_horizontal_[c])) {
+      borders_horizontal_[c] = ImageF(rows.xsize(), rows.ysize());
+    }
+    const Rect cols(0, 0, border.first * strips_x, plane_ysize);
+    if (!SameSize(cols, borders_vertical_[c])) {
+      borders_vertical_[c] = ImageF(cols.xsize(), cols.ysize());
+    }
+  }
+}
+// Derives border sizes, stage classification and per-stage geometry/padding.
+void LowMemoryRenderPipeline::Init() {
+  group_border_ = {0, 0};
+  base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+                                      frame_dimensions_.xsize_padded);
+
+  const auto& shifts = channel_shifts_[0];
+
+  // Ensure that each channel has sufficiently many border pixels.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    group_border_.first =
+        std::max(group_border_.first,
+                 DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
+                         1 << base_color_shift_));
+    group_border_.second =
+        std::max(group_border_.second,
+                 DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
+                         1 << base_color_shift_));
+  }
+
+  // Ensure that all channels have an integer number of border pixels in the
+  // input.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    if (channel_shifts_[0][c].first >= base_color_shift_) {
+      group_border_.first =
+          RoundUpTo(group_border_.first,
+                    1 << (channel_shifts_[0][c].first - base_color_shift_));
+    }
+    if (channel_shifts_[0][c].second >= base_color_shift_) {
+      group_border_.second =
+          RoundUpTo(group_border_.second,
+                    1 << (channel_shifts_[0][c].second - base_color_shift_));
+    }
+  }
+  // Ensure that the X border on color channels is a multiple of kBlockDim or
+  // the vector size (required for EPF stages). Vectors on ARM NEON are never
+  // wider than 4 floats, so rounding to multiples of 4 is enough.
+#if JXL_ARCH_ARM
+  constexpr size_t kGroupXAlign = 4;
+#else
+  constexpr size_t kGroupXAlign = 16;
+#endif
+  group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
+  // Allocate borders in group images that are just enough for storing the
+  // borders to be copied in, plus any rounding to ensure alignment.
+  std::pair<size_t, size_t> max_border = {0, 0};
+  for (size_t c = 0; c < shifts.size(); c++) {
+    max_border.first = std::max(BorderToStore(c).first, max_border.first);
+    max_border.second = std::max(BorderToStore(c).second, max_border.second);
+  }
+  group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
+  group_data_y_border_ = max_border.second;
+
+  EnsureBordersStorage();
+  group_border_assigner_.Init(frame_dimensions_);
+  // first_trailing_stage_: one past the last stage that has a kInOut channel.
+  for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
+       first_trailing_stage_--) {
+    bool has_inout_c = false;
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
+          RenderPipelineChannelMode::kInOut) {
+        has_inout_c = true;
+      }
+    }
+    if (has_inout_c) {
+      break;
+    }
+  }
+  // Input sizes in frame coordinates, up to the stage that switches (if any).
+  first_image_dim_stage_ = stages_.size();
+  for (size_t i = 0; i < stages_.size(); i++) {
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
+                                 1 << channel_shifts_[i][c].first),
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[i][c].second));
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+    if (stages_[i]->SwitchToImageDimensions()) {
+      // We don't allow kInOut after switching to image dimensions.
+      JXL_ASSERT(i >= first_trailing_stage_);
+      first_image_dim_stage_ = i + 1;
+      stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
+                                     &frame_origin_);
+      break;
+    }
+  }
+  for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+    if (stages_[i]->SwitchToImageDimensions()) {
+      JXL_ABORT("Cannot switch to image dimensions multiple times");
+    }
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] = {full_image_xsize_, full_image_ysize_};
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+  }
+  // anyc_[i]: the highest channel index that stage i does not ignore.
+  anyc_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        anyc_[i] = c;
+      }
+    }
+  }
+  // Per stage and channel: index of the latest earlier kInOut stage, or -1.
+  stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
+      stages_.size(), std::vector<int32_t>(shifts.size()));
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int input = -1;
+    for (size_t i = 0; i < stages_.size(); i++) {
+      stage_input_for_channel_[i][c] = input;
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        input = i;
+      }
+    }
+  }
+  // Frame extent seen by each stage, using the shifts of channel anyc_[i].
+  image_rect_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].first);
+    size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].second);
+    image_rect_[i] = Rect(0, 0, x1, y1);
+  }
+  // Accumulate padding from the last stage backwards; keep the per-stage max.
+  virtual_ypadding_for_output_.resize(stages_.size());
+  xpadding_for_output_.resize(stages_.size());
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int ypad = 0;
+    int xpad = 0;
+    for (size_t i = stages_.size(); i-- > 0;) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        virtual_ypadding_for_output_[i] =
+            std::max(ypad, virtual_ypadding_for_output_[i]);
+        xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
+      }
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
+                stages_[i]->settings_.border_y)
+               << channel_shifts_[i][c].second;
+        xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
+               stages_[i]->settings_.border_x;
+      }
+    }
+  }
+}
+// Allocates group input buffers and, for each of `num` threads, stage buffers.
+void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                        bool use_group_ids) {
+  const auto& shifts = channel_shifts_[0];
+
+  use_group_ids_ = use_group_ids;
+  size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
+  for (size_t t = group_data_.size(); t < num_buffers; t++) {  // Grow only.
+    group_data_.emplace_back();
+    group_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,
+                                 GroupInputYSize(c) + group_data_y_border_ * 2);
+    }
+  }
+  // TODO(veluca): avoid reallocating buffers if not needed.
+  stage_data_.resize(num);
+  size_t upsampling = 1u << base_color_shift_;
+  size_t group_dim = frame_dimensions_.group_dim * upsampling;
+  size_t padding =
+      2 * group_data_x_border_ * upsampling +  // maximum size of a rect
+      2 * kRenderPipelineXOffset;              // extra padding for processing
+  size_t stage_buffer_xsize = group_dim + padding;
+  for (size_t t = 0; t < num; t++) {
+    stage_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      stage_data_[t][c].resize(stages_.size());
+      size_t next_y_border = 0;  // border_y of the next later kInOut stage.
+      for (size_t i = stages_.size(); i-- > 0;) {
+        if (stages_[i]->GetChannelMode(c) ==
+            RenderPipelineChannelMode::kInOut) {
+          size_t stage_buffer_ysize =
+              2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
+          stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
+          next_y_border = stages_[i]->settings_.border_y;
+          stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
+        }
+      }
+    }
+  }
+  if (first_image_dim_stage_ != stages_.size()) {  // Some stage switched.
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Intersection(full_image_rect);
+    if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
+      image_rect = RectT<ssize_t>(0, 0, 0, 0);
+    }
+    size_t left_padding = image_rect.x0();
+    size_t middle_padding = group_dim;
+    size_t right_padding = full_image_xsize_ - image_rect.x1();
+    size_t out_of_frame_xsize =
+        padding +
+        std::max(left_padding, std::max(middle_padding, right_padding));
+    out_of_frame_data_.resize(num);
+    for (size_t t = 0; t < num; t++) {  // One row per channel.
+      out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());
+    }
+  }
+}
+// Returns, for every channel, a group buffer plus the rect of it to be used.
+std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {
+  std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+    ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
+    ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
+                         GroupInputXSize(c), GroupInputYSize(c),
+                         DivCeil(frame_dimensions_.xsize_upsampled,
+                                 1 << channel_shifts_[0][c].first) -
+                             gx * GroupInputXSize(c) + group_data_x_border_,
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[0][c].second) -
+                             gy * GroupInputYSize(c) + group_data_y_border_);
+  }
+  return ret;
+}
+
+namespace {
+
+JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
+  // `y` is relative to group_y0; `image_y` is the same row in image coordinates.
+  const ssize_t image_y = y + group_y0;
+  const bool past_bottom = image_y >= image_ysize;
+  // With group_y0 == 0, out-of-range rows go through the general Mirror().
+  if (group_y0 == 0 && (y < 0 || past_bottom)) return Mirror(y, image_ysize);
+  // Here we know that the one mirroring step is sufficient.
+  if (past_bottom) return 2 * image_ysize - image_y - 1 - group_y0;
+  return y;
+}
+// Fills up to `borderx` pixels of `row` beyond the image edges by mirroring.
+JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
+                                ssize_t group_xsize, ssize_t image_xsize) {
+  if (image_xsize <= borderx) {  // Image not wider than the border: Mirror().
+    if (group_x0 == 0) {  // Group starts at the left image edge.
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] =
+            row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {  // Reaches right.
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
+            row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
+                group_x0];
+      }
+    }
+  } else {
+    // Here we know that the one mirroring step is sufficient.
+    if (group_x0 == 0) {  // Reflect about the left image edge.
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {  // Right edge.
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
+            row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
+      }
+    }
+  }
+}
+
+// Lookup table that maps (stage, channel, y) to the row where the *output* of
+// that stage is stored; slot 0 describes the pipeline input instead.
+class Rows {
+ public:
+  Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
+       const Rect data_max_color_channel_rect, int group_data_x_border,
+       int group_data_y_border,
+       const std::vector<std::pair<size_t, size_t>>& group_data_shift,
+       size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
+       std::vector<ImageF>& input_data) {
+    const size_t stage_count = stages.size();
+    const size_t channel_count = input_data.size();
+    JXL_ASSERT(thread_data.size() == channel_count);
+    JXL_ASSERT(group_data_shift.size() == channel_count);
+#if JXL_ENABLE_ASSERT
+    for (const auto& per_stage : thread_data) {
+      JXL_ASSERT(per_stage.size() == stage_count);
+    }
+#endif
+    rows_.resize(stage_count + 1, std::vector<RowInfo>(channel_count));
+
+    // Outputs of kInOut stages live in `thread_data`, indexed [channel][stage].
+    for (size_t s = 0; s < stage_count; s++) {
+      for (size_t c = 0; c < channel_count; c++) {
+        const auto mode = stages[s]->GetChannelMode(c);
+        if (mode != RenderPipelineChannelMode::kInOut) continue;
+        ImageF& buffer = thread_data[c][s];
+        RowInfo& info = rows_[s + 1][c];
+        info.ymod_minus_1 = buffer.ysize() - 1;
+        info.base_ptr = buffer.Row(0);
+        info.stride = buffer.PixelsPerRow();
+      }
+    }
+
+    // Input data; the final Translate moves x back by kRenderPipelineXOffset.
+    for (size_t c = 0; c < channel_count; c++) {
+      auto channel_rect =
+          data_max_color_channel_rect.As<ssize_t>()
+              .Translate(-group_data_x_border, -group_data_y_border)
+              .ShiftLeft(base_color_shift)
+              .CeilShiftRight(group_data_shift[c])
+              .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),
+                         group_data_y_border);
+      RowInfo& info = rows_[0][c];
+      info.base_ptr = channel_rect.Row(&input_data[c], 0);
+      info.stride = input_data[c].PixelsPerRow();
+      info.ymod_minus_1 = -1;  // All-ones mask: GetBuffer() keeps y as is.
+    }
+  }
+
+  // Returns the row `y` of channel `c`. Stage -1 refers to the input data; all
+  // other values must be nonnegative and refer to the output of that stage.
+  JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
+    JXL_DASSERT(stage >= -1);
+    const RowInfo& info = rows_[stage + 1][c];
+    const int wrapped_y = y & info.ymod_minus_1;
+    return info.base_ptr + ssize_t(info.stride) * wrapped_y;
+  }
+
+ private:
+  struct RowInfo {
+    float* base_ptr;   // Pointer to beginning of the first row.
+    int ymod_minus_1;  // ymod - 1, used as a mask (ymod is a power of 2).
+    size_t stride;     // Number of floats per row.
+  };
+  std::vector<std::vector<RowInfo>> rows_;
+};
+
+}  // namespace
+
+void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
+                                         std::vector<ImageF>& input_data,
+                                         Rect data_max_color_channel_rect,
+                                         Rect image_max_color_channel_rect) {
+  // Runs the pipeline stages, row by row, on `image_max_color_channel_rect`.
+  // group_rect[i] is the image area currently being processed, in the
+  // coordinates (i.e. with the scaling factor) of stage i.
+  std::vector<Rect> group_rect;
+  group_rect.resize(stages_.size());
+  Rect image_area_rect =
+      image_max_color_channel_rect.ShiftLeft(base_color_shift_)
+          .Crop(frame_dimensions_.xsize_upsampled,
+                frame_dimensions_.ysize_upsampled);
+  for (size_t i = 0; i < stages_.size(); i++) {
+    group_rect[i] =
+        image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
+  }
+
+  ssize_t frame_x0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
+  ssize_t frame_y0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
+  size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.xsize_upsampled
+                                : full_image_xsize_;
+  size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.ysize_upsampled
+                                : full_image_ysize_;
+
+  // Compute actual x-axis bounds for the current image area in the context of
+  // the full image this frame is part of. As the left boundary may be negative,
+  // we also create the x_pixels_skip value, defined as follows:
+  // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
+  // - full_image_x0 - x_pixels_skip is the position of the current frame area
+  //   in the full image.
+  ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
+  ssize_t x_pixels_skip = 0;
+  if (full_image_x0 < 0) {
+    x_pixels_skip = -full_image_x0;
+    full_image_x0 = 0;
+  }
+  ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
+  full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
+
+  // If the current image area is entirely outside of the visible image, there
+  // is no point in proceeding. Note: this uses the assumption that if there is
+  // a stage with observable effects (i.e. a kInput stage), it only appears
+  // after the stage that switches to image dimensions.
+  if (full_image_x1 <= full_image_x0) return;
+
+  // Data structures to hold information about input/output rows and their
+  // buffers.
+  Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
+            group_data_y_border_, channel_shifts_[0], base_color_shift_,
+            stage_data_[thread_id], input_data);
+
+  std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
+                                                       1);
+  for (size_t i = 0; i < first_trailing_stage_; i++) {
+    input_rows[i].resize(input_data.size());
+  }
+  input_rows[first_trailing_stage_].resize(input_data.size(),
+                                           std::vector<float*>(1));
+
+  // Maximum possible shift is 3, hence room for 1 << 3 == 8 output rows.
+  RenderPipelineStage::RowInfo output_rows(input_data.size(),
+                                           std::vector<float*>(8));
+
+  // Fills in input_rows and output_rows for stage `i` and a given y value
+  // (relative to the start of the group, in actual pixels at the appropriate
+  // vertical scaling factor), applying mirroring if necessary. Inefficient for
+  // trailing kInOut or kInput stages, where one fill of the input row would do.
+  // NOTE(review): mirrored_y is a size_t although GetMirroredY() returns int.
+  auto prepare_io_rows = [&](int y, size_t i) {
+    ssize_t bordery = stages_[i]->settings_.border_y;
+    size_t shifty = stages_[i]->settings_.shift_y;
+    auto make_row = [&](size_t c, ssize_t iy) {
+      size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
+                                       image_rect_[i].ysize());
+      input_rows[i][c][iy] =
+          rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
+      ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
+                      group_rect[i].x0(), group_rect[i].xsize(),
+                      image_rect_[i].xsize());
+    };
+    for (size_t c = 0; c < input_data.size(); c++) {
+      RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
+      if (mode == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      // If we already have rows from a previous iteration (the row vector
+      // already has its final size), shift them by 1 and insert the new one.
+      if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
+        for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
+          input_rows[i][c][iy] = input_rows[i][c][iy + 1];
+        }
+        make_row(c, bordery * 2);
+      } else {
+        input_rows[i][c].resize(2 * bordery + 1);
+        for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
+          make_row(c, iy);
+        }
+      }
+
+      // If necessary, get the output buffers (1 << shifty rows per input row).
+      if (mode == RenderPipelineChannelMode::kInOut) {
+        for (size_t iy = 0; iy < (1u << shifty); iy++) {
+          output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
+        }
+      }
+    }
+  };
+
+  // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
+  // as the final image.
+  // We call each such row a "virtual" row, because it may or may not correspond
+  // to an actual row of the current processing stage; actual processing happens
+  // when vy % (1<<vshift) == 0.
+
+  int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
+                                         virtual_ypadding_for_output_.end());
+
+  for (int vy = -num_extra_rows;
+       vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
+    for (size_t i = 0; i < first_trailing_stage_; i++) {
+      int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
+
+      if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
+        continue;
+      }
+
+      if (stage_vy < -virtual_ypadding_for_output_[i]) {
+        continue;
+      }
+
+      int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
+
+      ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
+      // Do not produce rows in out-of-bounds areas.
+      if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
+        continue;
+      }
+
+      // Get the input/output rows and potentially apply mirroring to the input.
+      prepare_io_rows(y, i);
+
+      // Produce output rows.
+      stages_[i]->ProcessRow(input_rows[i], output_rows,
+                             xpadding_for_output_[i], group_rect[i].xsize(),
+                             group_rect[i].x0(), image_y, thread_id);
+    }
+
+    // Process trailing stages, i.e. the final set of non-kInOut stages; they
+    // all have the same input buffer and no need to use any mirroring.
+
+    int y = vy - num_extra_rows;  // Relative to image_area_rect.y0().
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      // Advance by x_pixels_skip to skip pixels left of the final image area.
+      input_rows[first_trailing_stage_][c][0] =
+          rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
+                         c) +
+          x_pixels_skip;
+    }
+
+    // Check that we are not outside of the bounds for the current rendering
+    // rect. Not doing so might result in overwriting some rows that have been
+    // written (or will be written) by other threads.
+    if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
+      continue;
+    }
+
+    // Avoid running pipeline stages on pixels that are outside the full image
+    // area. As trailing stages have no borders, this is a free optimization
+    // (and may be necessary for correctness, as some stages assume coordinates
+    // are within bounds).
+    ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
+    if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
+      continue;
+    }
+
+    for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
+      // Before the first_image_dim_stage_, coordinates are relative to the
+      // current frame.
+      size_t x0 =
+          i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
+      size_t y =
+          i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
+      stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
+                             /*xextra=*/0, full_image_x1 - full_image_x0, x0, y,
+                             thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
+  const size_t xsize = rect.xsize();
+  if (xsize == 0) return;  // Nothing to render.
+  // One input row per channel: row c of this thread's out-of-frame image.
+  const size_t num_channels = channel_shifts_[0].size();
+  RenderPipelineStage::RowInfo in_rows(num_channels, std::vector<float*>(1));
+  for (size_t c = 0; c < num_channels; ++c) {
+    in_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
+  }
+  RenderPipelineStage::RowInfo no_out_rows;  // Stays empty.
+  // Per row: stage first_image_dim_stage_ - 1 pads, then later stages run.
+  for (size_t y = rect.y0(), y_end = y + rect.ysize(); y < y_end; ++y) {
+    stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(in_rows, xsize,
+                                                           rect.x0(), y);
+    for (size_t s = first_image_dim_stage_; s < stages_.size(); ++s) {
+      stages_[s]->ProcessRow(in_rows, no_out_rows, /*xextra=*/0, xsize,
+                             rect.x0(), y, thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
+                                             size_t thread_id) {
+  std::vector<ImageF>& input_data =
+      group_data_[use_group_ids_ ? group_id : thread_id];
+
+  // Copy the borders of this group, for every channel, to the border storage.
+  for (size_t c = 0; c < input_data.size(); c++) {
+    SaveBorders(group_id, c, input_data[c]);
+  }
+
+  size_t gy = group_id / frame_dimensions_.xsize_groups;  // Group row.
+  size_t gx = group_id % frame_dimensions_.xsize_groups;  // Group column.
+
+  if (first_image_dim_stage_ != stages_.size()) {  // A stage uses image dims.
+    size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
+    RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
+                              group_dim);
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Intersection(full_image_rect);
+    group_rect = group_rect.Intersection(image_rect);
+    size_t x0 = group_rect.x0();  // Bounds of the group rect after clipping.
+    size_t y0 = group_rect.y0();
+    size_t x1 = group_rect.x1();
+    size_t y1 = group_rect.y1();
+    JXL_DEBUG_V(6,
+                "Rendering padding for full image rect %s "
+                "outside group rect %s",
+                Description(full_image_rect).c_str(),
+                Description(group_rect).c_str());
+
+    if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
+      // If this frame does not intersect with the full image, we have to
+      // initialize the whole image area with RenderPadding (done by group 0).
+      RenderPadding(thread_id,
+                    Rect(0, 0, full_image_xsize_, full_image_ysize_));
+    }
+
+    // Render padding for groups that intersect with the full image. The case
+    // where no groups intersect was handled above.
+    if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
+      if (gx == 0 && gy == 0) {  // Top-left corner.
+        RenderPadding(thread_id, Rect(0, 0, x0, y0));
+      }
+      if (gy == 0) {  // Strip above the group.
+        RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
+      }
+      if (gx == 0) {  // Strip left of the group.
+        RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
+      }
+      if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {  // Bottom-left.
+        RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1));
+      }
+      if (gy + 1 == frame_dimensions_.ysize_groups) {  // Strip below the group.
+        RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1));
+      }
+      if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {  // Top-right.
+        RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0));
+      }
+      if (gx + 1 == frame_dimensions_.xsize_groups) {  // Strip right of group.
+        RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0));
+      }
+      if (gy + 1 == frame_dimensions_.ysize_groups &&
+          gx + 1 == frame_dimensions_.xsize_groups) {  // Bottom-right corner.
+        RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1,
+                                      full_image_ysize_ - y1));
+      }
+    }
+  }
+
+  Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
+  size_t num_ready_rects = 0;  // Set by GroupDone() below.
+  group_border_assigner_.GroupDone(group_id, group_border_.first,
+                                   group_border_.second, ready_rects,
+                                   &num_ready_rects);
+  for (size_t i = 0; i < num_ready_rects; i++) {
+    const Rect& image_max_color_channel_rect = ready_rects[i];
+    for (size_t c = 0; c < input_data.size(); c++) {
+      LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
+    }
+    Rect data_max_color_channel_rect(  // Same rect, relative to group_data_.
+        group_data_x_border_ + image_max_color_channel_rect.x0() -
+            gx * frame_dimensions_.group_dim,
+        group_data_y_border_ + image_max_color_channel_rect.y0() -
+            gy * frame_dimensions_.group_dim,
+        image_max_color_channel_rect.xsize(),
+        image_max_color_channel_rect.ysize());
+    RenderRect(thread_id, input_data, data_max_color_channel_rect,
+               image_max_color_channel_rect);
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
new file mode 100644
index 0000000000..b386f7c078
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A multithreaded, low-memory rendering pipeline that only allocates a minimal
+// amount of buffers. All members are private overrides or implementation state.
+class LowMemoryRenderPipeline final : public RenderPipeline {
+ private:
+  std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) override;
+
+  void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+  void ProcessBuffers(size_t group_id, size_t thread_id) override;
+
+  void ClearDone(size_t i) override { group_border_assigner_.ClearDone(i); }
+
+  void Init() override;
+
+  void EnsureBordersStorage();
+  size_t GroupInputXSize(size_t c) const;
+  size_t GroupInputYSize(size_t c) const;
+  void RenderRect(size_t thread_id, std::vector<ImageF>& input_data,
+                  Rect data_max_color_channel_rect,
+                  Rect image_max_color_channel_rect);
+  void RenderPadding(size_t thread_id, Rect rect);
+
+  void SaveBorders(size_t group_id, size_t c, const ImageF& in);
+  void LoadBorders(size_t group_id, size_t c, const Rect& r, ImageF* out);
+
+  std::pair<size_t, size_t> ColorDimensionsToChannelDimensions(
+      std::pair<size_t, size_t> in, size_t c, size_t stage) const;
+
+  std::pair<size_t, size_t> BorderToStore(size_t c) const;
+
+  bool use_group_ids_;  // Index group_data_ by group id instead of thread id.
+
+  // Storage for borders between groups. Borders of adjacent groups are stacked
+  // together, e.g. bottom border of current group is followed by top border
+  // of next group.
+  std::vector<ImageF> borders_horizontal_;
+  std::vector<ImageF> borders_vertical_;
+
+  // Manages the status of borders.
+  GroupBorderAssigner group_border_assigner_;
+
+  // Size (in color-channel-pixels) of the border around each group that might
+  // be assigned to that group.
+  std::pair<size_t, size_t> group_border_;
+  // base_color_shift_ defines the size of groups in terms of final image
+  // pixels.
+  size_t base_color_shift_;
+
+  // Buffer for decoded pixel data for a group, indexed by [thread][channel] or
+  // [group][channel] depending on `use_group_ids_`.
+  std::vector<std::vector<ImageF>> group_data_;
+
+  // Borders for storing group data.
+  size_t group_data_x_border_;
+  size_t group_data_y_border_;
+
+  // Buffers for intermediate rows for the various stages, indexed by
+  // [thread][channel][stage].
+  std::vector<std::vector<std::vector<ImageF>>> stage_data_;
+
+  // Buffers for out-of-frame data, indexed by [thread]; every row is a
+  // different channel.
+  std::vector<ImageF> out_of_frame_data_;
+
+  // For each stage, a non-kIgnored channel.
+  std::vector<int32_t> anyc_;
+
+  // Size of the image at each stage.
+  std::vector<Rect> image_rect_;
+
+  // For each stage, for each channel, keep track of the kInOut stage that
+  // produced the input to that stage (which corresponds to the buffer index
+  // containing the data). -1 if data comes from the original input.
+  std::vector<std::vector<int32_t>> stage_input_for_channel_;
+
+  // Number of (virtual) extra rows that must be processed at each stage
+  // to produce sufficient output for future stages.
+  std::vector<int> virtual_ypadding_for_output_;
+
+  // Same thing for columns, except these are real columns and not virtual ones.
+  std::vector<int> xpadding_for_output_;
+
+  // First stage that doesn't have any kInOut channel.
+  size_t first_trailing_stage_;
+
+  // Origin and size of the frame after switching to image dimensions.
+  FrameOrigin frame_origin_;
+  size_t full_image_xsize_;
+  size_t full_image_ysize_;
+  size_t first_image_dim_stage_;  // stages_.size() appears to mean "none".
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
new file mode 100644
index 0000000000..68b6ef613f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <algorithm>
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void RenderPipeline::Builder::AddStage(
+    std::unique_ptr<RenderPipelineStage> stage) {
+  stages_.emplace_back(std::move(stage));  // Takes ownership; appended last.
+}
+
+std::unique_ptr<RenderPipeline> RenderPipeline::Builder::Finalize(
+    FrameDimensions frame_dimensions) && {
+#if JXL_ENABLE_ASSERT
+  // There must be at least one stage, and the last stage must not be a kInOut
+  // stage for any channel.
+  JXL_ASSERT(!stages_.empty());
+  for (size_t c = 0; c < num_c_; c++) {
+    JXL_ASSERT(stages_.back()->GetChannelMode(c) !=
+               RenderPipelineChannelMode::kInOut);
+  }
+#endif
+  const size_t num_stages = stages_.size();
+  const auto is_in_out = [this](size_t stage, size_t c) {
+    return stages_[stage]->GetChannelMode(c) ==
+           RenderPipelineChannelMode::kInOut;
+  };
+  std::unique_ptr<RenderPipeline> pipeline;
+  if (use_simple_implementation_) {
+    pipeline = jxl::make_unique<SimpleRenderPipeline>();
+  } else {
+    pipeline = jxl::make_unique<LowMemoryRenderPipeline>();
+  }
+
+  // Cumulative padding, from the last stage (zero) backwards: a kInOut stage
+  // divides the next stage's padding by its scale and adds its own border.
+  auto& padding = pipeline->padding_;
+  padding.resize(num_stages);
+  for (size_t i = num_stages; i-- > 0;) {
+    padding[i].resize(num_c_);
+    if (i + 1 == num_stages) continue;
+    const auto& settings = stages_[i]->settings_;
+    for (size_t c = 0; c < num_c_; c++) {
+      if (!is_in_out(i, c)) {
+        padding[i][c] = padding[i + 1][c];
+        continue;
+      }
+      padding[i][c].first =
+          DivCeil(padding[i + 1][c].first, 1 << settings.shift_x) +
+          settings.border_x;
+      padding[i][c].second =
+          DivCeil(padding[i + 1][c].second, 1 << settings.shift_y) +
+          settings.border_y;
+    }
+  }
+
+  pipeline->frame_dimensions_ = frame_dimensions;
+  pipeline->group_completed_passes_.resize(frame_dimensions.num_groups);
+  // Input shifts of stage 0: the summed shifts of every kInOut stage/channel.
+  auto& shifts = pipeline->channel_shifts_;
+  shifts.resize(num_stages);
+  shifts[0].resize(num_c_);
+  for (size_t i = 0; i + 1 < num_stages; i++) {
+    const auto& settings = stages_[i]->settings_;
+    for (size_t c = 0; c < num_c_; c++) {
+      if (!is_in_out(i, c)) continue;
+      shifts[0][c].first += settings.shift_x;
+      shifts[0][c].second += settings.shift_y;
+    }
+  }
+  // Stage i sees the shifts of stage i - 1, minus what stage i - 1 undid.
+  for (size_t i = 1; i < num_stages; i++) {
+    const auto& settings = stages_[i - 1]->settings_;
+    shifts[i] = shifts[i - 1];
+    for (size_t c = 0; c < num_c_; c++) {
+      if (!is_in_out(i - 1, c)) continue;
+      shifts[i][c].first -= settings.shift_x;
+      shifts[i][c].second -= settings.shift_y;
+    }
+  }
+  pipeline->stages_ = std::move(stages_);
+  pipeline->Init();
+  return pipeline;
+}
+
+RenderPipelineInput RenderPipeline::GetInputBuffers(size_t group_id,
+                                                    size_t thread_id) {
+  JXL_DASSERT(group_id < group_completed_passes_.size());
+  RenderPipelineInput input;  // Done() on it ends up in InputReady().
+  input.pipeline_ = this;
+  input.group_id_ = group_id;
+  input.thread_id_ = thread_id;
+  input.buffers_ = PrepareBuffers(group_id, thread_id);
+  return input;
+}
+
+void RenderPipeline::InputReady(
+    size_t group_id, size_t thread_id,
+    const std::vector<std::pair<ImageF*, Rect>>& buffers) {
+  JXL_DASSERT(group_id < group_completed_passes_.size());
+  ++group_completed_passes_[group_id];  // Count this pass for the group.
+  // Presumably the check macro can expand to nothing, hence the `(void)c`.
+  for (size_t c = 0; c < buffers.size(); ++c) {
+    (void)c;
+    JXL_CHECK_PLANE_INITIALIZED(*buffers[c].first, buffers[c].second, c);
+  }
+  ProcessBuffers(group_id, thread_id);
+}
+
+Status RenderPipeline::PrepareForThreads(size_t num, bool use_group_ids) {
+  for (size_t i = 0; i < stages_.size(); ++i) {  // Stages come first.
+    JXL_RETURN_IF_ERROR(stages_[i]->PrepareForThreads(num));
+  }
+  PrepareForThreadsInternal(num, use_group_ids);  // Then the pipeline itself.
+  return true;
+}
+
+void RenderPipelineInput::Done() {
+  JXL_ASSERT(pipeline_);  // Null if default-constructed or moved-from.
+  pipeline_->InputReady(group_id_, thread_id_, buffers_);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
new file mode 100644
index 0000000000..bf3ad4975e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
@@ -0,0 +1,139 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Move-only handle to the input buffers of one group of the rendering
+// pipeline. All the data in the provided ImageF's Rects must have been
+// initialized by the time Done() hands the buffers over to the pipeline.
+class RenderPipelineInput {
+ public:
+  RenderPipelineInput(const RenderPipelineInput&) = delete;
+  RenderPipelineInput(RenderPipelineInput&& other) noexcept {
+    *this = std::move(other);
+  }
+  // Detaches `other` from its pipeline. Self-move leaves the object unchanged.
+  RenderPipelineInput& operator=(RenderPipelineInput&& other) noexcept {
+    if (this == &other) return *this;
+    pipeline_ = other.pipeline_;
+    group_id_ = other.group_id_;
+    thread_id_ = other.thread_id_;
+    buffers_ = std::move(other.buffers_);
+    other.pipeline_ = nullptr;
+    return *this;
+  }
+  RenderPipelineInput() = default;  // Not attached to any pipeline.
+  void Done();  // Passes the buffers to the pipeline; it must be attached.
+  const std::pair<ImageF*, Rect>& GetBuffer(size_t c) const {
+    JXL_ASSERT(c < buffers_.size());
+    return buffers_[c];
+  }
+
+ private:
+  RenderPipeline* pipeline_ = nullptr;
+  size_t group_id_ = 0;  // Initialized: moves copy it unconditionally.
+  size_t thread_id_ = 0;
+  std::vector<std::pair<ImageF*, Rect>> buffers_;
+  friend class RenderPipeline;
+};
+
+class RenderPipeline {  // Base class; instances come from Builder::Finalize().
+ public:
+  class Builder {
+   public:
+    explicit Builder(size_t num_c) : num_c_(num_c) { JXL_ASSERT(num_c > 0); }
+
+    // Adds a stage to the pipeline. Must be called at least once; the last
+    // added stage cannot have kInOut channels.
+    void AddStage(std::unique_ptr<RenderPipelineStage> stage);
+
+    // Enables using the simple (i.e. non-memory-efficient) implementation of
+    // the pipeline.
+    void UseSimpleImplementation() { use_simple_implementation_ = true; }
+
+    // Finalizes setup of the pipeline. Shifts for all channels should be 0 at
+    // this point. Rvalue-qualified: the builder's stages are moved out.
+    std::unique_ptr<RenderPipeline> Finalize(
+        FrameDimensions frame_dimensions) &&;
+
+   private:
+    std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+    size_t num_c_;  // Number of channels.
+    bool use_simple_implementation_ = false;
+  };
+
+  friend class Builder;
+
+  virtual ~RenderPipeline() = default;
+
+  Status IsInitialized() const {  // Returns the first stage error, if any.
+    for (const auto& stage : stages_) {
+      JXL_RETURN_IF_ERROR(stage->IsInitialized());
+    }
+    return true;
+  }
+
+  // Allocates storage to run with `num` threads. If `use_group_ids` is true,
+  // storage is allocated for each group, not each thread. The behaviour is
+  // undefined if calling this function multiple times with a different value
+  // for `use_group_ids`.
+  Status PrepareForThreads(size_t num, bool use_group_ids);
+
+  // Retrieves a buffer where input data should be stored by the callee. When
+  // input has been provided for all buffers, the pipeline will complete its
+  // processing. This method may be called multiple times concurrently from
+  // different threads, provided that a different `thread_id` is given.
+  RenderPipelineInput GetInputBuffers(size_t group_id, size_t thread_id);
+
+  size_t PassesWithAllInput() const {  // Minimum count over all groups.
+    return *std::min_element(group_completed_passes_.begin(),
+                             group_completed_passes_.end());  // UB if empty.
+  }
+
+  virtual void ClearDone(size_t i) {}  // No-op unless overridden.
+
+ protected:
+  std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+  // Shifts for every channel at the input of each stage.
+  std::vector<std::vector<std::pair<size_t, size_t>>> channel_shifts_;
+
+  // Amount of (cumulative) padding required by each stage and channel, in
+  // either direction.
+  std::vector<std::vector<std::pair<size_t, size_t>>> padding_;
+
+  FrameDimensions frame_dimensions_;
+
+  std::vector<uint8_t> group_completed_passes_;  // Per group; see InputReady.
+
+  friend class RenderPipelineInput;
+
+ private:
+  void InputReady(size_t group_id, size_t thread_id,
+                  const std::vector<std::pair<ImageF*, Rect>>& buffers);
+
+  virtual std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) = 0;
+
+  virtual void ProcessBuffers(size_t group_id, size_t thread_id) = 0;
+
+  // Note that this method may be called multiple times with different (or
+  // equal) `num`.
+  virtual void PrepareForThreadsInternal(size_t num, bool use_group_ids) = 0;
+
+  // Called once frame dimensions and stages are known.
+  virtual void Init() {}
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
new file mode 100644
index 0000000000..d1a0074161
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
@@ -0,0 +1,171 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/frame_header.h"
+
+namespace jxl {
+
+// The first pixel in the input to RenderPipelineStage will be located at
+// this position. Pixels before this position may be accessed as padding.
+// This should be at least the RoundUpTo(maximum padding / 2, maximum vector
+// size) times 2: this is realized when using Gaborish + EPF + upsampling +
+// chroma subsampling.
+#if JXL_ARCH_ARM
+constexpr size_t kRenderPipelineXOffset = 16;
+#else
+constexpr size_t kRenderPipelineXOffset = 32;
+#endif
+
+enum class RenderPipelineChannelMode {
+  // This channel is not modified by this stage.
+  kIgnored = 0,
+  // This channel is modified in-place.
+  kInPlace = 1,
+  // This channel is modified and written to a new buffer.
+  kInOut = 2,
+  // This channel is only read. Stages with such channels are the only ones
+  // assumed to have observable effects, i.e. calls to ProcessRow for other
+  // stages may be omitted if it can be shown they can't affect any kInput
+  // stage ProcessRow call that happens inside image boundaries.
+  kInput = 3,
+};
+
+class RenderPipeline;
+
+class RenderPipelineStage {
+ protected:
+  using Row = float*;
+  using ChannelRows = std::vector<Row>;
+
+ public:
+  using RowInfo = std::vector<ChannelRows>;
+  struct Settings {
+    // Amount of padding required in the various directions by all channels
+    // that have kInOut mode.
+    size_t border_x = 0;
+    size_t border_y = 0;
+
+    // Log2 of the number of columns/rows of output that this stage will produce
+    // for every input row for kInOut channels.
+    size_t shift_x = 0;
+    size_t shift_y = 0;
+
+    static Settings ShiftX(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_x = border;
+      settings.shift_x = shift;
+      return settings;
+    }
+
+    static Settings ShiftY(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_y = border;
+      settings.shift_y = shift;
+      return settings;
+    }
+
+    static Settings Symmetric(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_x = settings.border_y = border;
+      settings.shift_x = settings.shift_y = shift;
+      return settings;
+    }
+
+    static Settings SymmetricBorderOnly(size_t border) {
+      return Symmetric(0, border);
+    }
+  };
+
+  virtual ~RenderPipelineStage() = default;
+
+  // Processes one row of input, producing the appropriate number of rows of
+  // output. Input/output rows can be obtained by calls to
+  // `GetInputRow`/`GetOutputRow`. `xsize+2*xextra` represents the total number
+  // of pixels to be processed in the input row, where the first pixel is at
+  // position `kRenderPipelineXOffset-xextra`. All pixels in the
+  // `[kRenderPipelineXOffset-xextra-border_x,
+  // kRenderPipelineXOffset+xsize+xextra+border_x)` range are initialized and
+  // accessible. `xpos` and `ypos` represent the position of the first
+  // (non-extra, i.e. in position kRenderPipelineXOffset) pixel in the center
+  // row of the input in the full image. `xpos` is a multiple of
+  // `GroupBorderAssigner::kPaddingXRound`. NOTE(review): this comment used to
+  // mention a `temp` scratch buffer sized by `settings_.temp_buffer_size`;
+  // neither appears in this class, so no such buffer is provided here.
+  virtual void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                          size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                          size_t thread_id) const = 0;
+
+  // How each channel will be processed. Channels are numbered starting from
+  // color channels (always 3) and followed by all other channels.
+  virtual RenderPipelineChannelMode GetChannelMode(size_t c) const = 0;
+
+ protected:
+  explicit RenderPipelineStage(Settings settings) : settings_(settings) {}
+
+  virtual Status IsInitialized() const { return true; }
+
+  // Informs the stage about the total size of each channel. Few stages will
+  // actually need to use this information.
+  virtual void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) {}
+
+  virtual Status PrepareForThreads(size_t num_threads) { return true; }
+
+  // Returns a pointer to the input row of channel `c` with offset `offset`.
+  // `offset` must be in [-settings_.border_y, settings_.border_y]. `c` must be
+  // such that `GetChannelMode(c) != kIgnored`. The returned pointer points to
+  // the offset-ed row (i.e. kRenderPipelineXOffset has been applied).
+  float* GetInputRow(const RowInfo& input_rows, size_t c, int offset) const {
+    JXL_DASSERT(GetChannelMode(c) != RenderPipelineChannelMode::kIgnored);
+    JXL_DASSERT(-offset <= static_cast<int>(settings_.border_y));
+    JXL_DASSERT(offset <= static_cast<int>(settings_.border_y));
+    return input_rows[c][settings_.border_y + offset] + kRenderPipelineXOffset;
+  }
+  // Similar to `GetInputRow`, but can only be used if `GetChannelMode(c) ==
+  // kInOut`. `offset` must be less than `1<<settings_.shift_y`. The returned
+  // pointer points to the offset-ed row (i.e. kRenderPipelineXOffset has been
+  // applied).
+  float* GetOutputRow(const RowInfo& output_rows, size_t c,
+                      size_t offset) const {
+    JXL_DASSERT(GetChannelMode(c) == RenderPipelineChannelMode::kInOut);
+    JXL_DASSERT(offset < (1ul << settings_.shift_y));
+    return output_rows[c][offset] + kRenderPipelineXOffset;
+  }
+
+  // Indicates whether, from this stage on, the pipeline will operate on an
+  // image- rather than frame-sized buffer. Only one stage in the pipeline
+  // should return true, and it should implement ProcessPaddingRow below too.
+  // It is assumed that, if there is a SwitchToImageDimensions() == true stage,
+  // all kInput stages appear after it.
+  virtual bool SwitchToImageDimensions() const { return false; }
+
+  // If SwitchToImageDimensions returns true, then this should set xsize and
+  // ysize to the image size, and frame_origin to the location of the frame
+  // within the image. Otherwise, this is not called at all.
+  virtual void GetImageDimensions(size_t* xsize, size_t* ysize,
+                                  FrameOrigin* frame_origin) const {}
+
+  // Produces the appropriate output data outside of the frame dimensions. xpos
+  // and ypos are now relative to the full image.
+  virtual void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize,
+                                 size_t xpos, size_t ypos) const {}
+
+  virtual const char* GetName() const = 0;
+
+  Settings settings_;
+  friend class RenderPipeline;
+  friend class SimpleRenderPipeline;
+  friend class LowMemoryRenderPipeline;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
new file mode 100644
index 0000000000..f638807be9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
@@ -0,0 +1,562 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+// Test helper: reads the codestream headers from `file`, then decodes frames
+// into `io` up to the last one; `use_slow_pipeline` is passed to DecodeFrame.
+Status DecodeFile(const Span<const uint8_t> file, bool use_slow_pipeline,
+                  CodecInOut* io, ThreadPool* pool) {
+  Status ret = true;
+  {
+    BitReader reader(file);
+    BitReaderScopedCloser reader_closer(&reader, &ret);
+    auto& metadata = io->metadata;
+    JXL_RETURN_IF_ERROR(reader.ReadFixedBits<16>() == 0x0AFF);
+    JXL_RETURN_IF_ERROR(ReadSizeHeader(&reader, &metadata.size));
+    JXL_RETURN_IF_ERROR(ReadImageMetadata(&reader, &metadata.m));
+    metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+    JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &metadata.transform_data));
+    if (metadata.m.color_encoding.WantICC()) {
+      PaddedBytes icc;
+      JXL_RETURN_IF_ERROR(ReadICC(&reader, &icc));
+      JXL_RETURN_IF_ERROR(metadata.m.color_encoding.SetICC(std::move(icc)));
+    }
+    PassesDecoderState dec_state;
+    JXL_RETURN_IF_ERROR(
+        dec_state.output_encoding_info.SetFromMetadata(metadata));
+    JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary());
+    io->frames.clear();
+    do {
+      io->frames.emplace_back(&metadata.m);
+      auto& frame = io->frames.back();
+      // Keep decoding into the same slot until a displayed frame is reached.
+      do {
+        const size_t frame_offset = reader.TotalBitsConsumed() / kBitsPerByte;
+        JXL_RETURN_IF_ERROR(DecodeFrame(
+            &dec_state, pool, file.data() + frame_offset,
+            file.size() - frame_offset, &frame, metadata, use_slow_pipeline));
+        reader.SkipBits(frame.decoded_bytes() * kBitsPerByte);
+      } while (dec_state.shared->frame_header.frame_type !=
+                   FrameType::kRegularFrame &&
+               dec_state.shared->frame_header.frame_type !=
+                   FrameType::kSkipProgressive);
+    } while (!dec_state.shared->frame_header.is_last);
+    if (io->frames.empty()) return JXL_FAILURE("Not enough data.");
+    if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) {
+      return JXL_FAILURE("Reader position not at EOF.");
+    }
+    if (!reader.AllReadsWithinBounds()) {
+      return JXL_FAILURE("Reader out of bounds read.");
+    }
+    io->CheckMetadata();
+    // Leaving this scope runs `reader_closer`, which was handed `&ret`.
+  }
+  return ret;
+}
+
+TEST(RenderPipelineTest, Build) {
+  FrameDimensions dims;  // 1024x1024 frame.
+  dims.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+           /*max_hshift=*/0, /*max_vshift=*/0, /*modular_mode=*/false,
+           /*upsampling=*/1);
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  std::move(builder).Finalize(dims);  // Construction only; result is dropped.
+}
+
+TEST(RenderPipelineTest, CallAllGroups) {
+  FrameDimensions dims;
+  dims.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+           /*max_hshift=*/0, /*max_vshift=*/0, /*modular_mode=*/false,
+           /*upsampling=*/1);
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  auto pipeline = std::move(builder).Finalize(dims);
+  ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+  // Supply an all-zero channel 0 for every group, all from thread 0.
+  for (size_t group = 0; group < dims.num_groups; group++) {
+    auto input = pipeline->GetInputBuffers(group, /*thread_id=*/0);
+    FillPlane(0.0f, input.GetBuffer(0).first, input.GetBuffer(0).second);
+    input.Done();
+  }
+
+  EXPECT_EQ(pipeline->PassesWithAllInput(), 1);
+}
+
+TEST(RenderPipelineTest, BuildFast) {
+  FrameDimensions dims;
+  dims.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+           /*max_hshift=*/0, /*max_vshift=*/0, /*modular_mode=*/false,
+           /*upsampling=*/1);
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  std::move(builder).Finalize(dims);  // No UseSimpleImplementation() here.
+}
+
+TEST(RenderPipelineTest, CallAllGroupsFast) {
+  FrameDimensions dims;
+  dims.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+           /*max_hshift=*/0, /*max_vshift=*/0, /*modular_mode=*/false,
+           /*upsampling=*/1);
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  // NOTE(review): this call makes the test identical to CallAllGroups; the
+  // "Fast" variant presumably should omit it, as BuildFast does. Confirm.
+  builder.UseSimpleImplementation();
+  auto pipeline = std::move(builder).Finalize(dims);
+  ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+  for (size_t group = 0; group < dims.num_groups; group++) {
+    auto input = pipeline->GetInputBuffers(group, /*thread_id=*/0);
+    FillPlane(0.0f, input.GetBuffer(0).first, input.GetBuffer(0).second);
+    input.Done();
+  }
+
+  EXPECT_EQ(pipeline->PassesWithAllInput(), 1);
+}
+
+struct RenderPipelineTestInputSettings {
+  // Path of the input image, passed to jxl::test::ReadTestData.
+  std::string input_path;
+  // Size the decoded input is shrunk to; zero until a test case sets it.
+  size_t xsize = 0;
+  size_t ysize = 0;
+  bool jpeg_transcode = false;  // Read the input via jpeg::DecodeImageJPG.
+  // Encoding settings.
+  CompressParams cparams;
+  // Short name for the encoder settings; used in the generated test name.
+  std::string cparams_descr;
+  bool add_spot_color = false;  // Attach a synthetic spot-color extra channel.
+  Splines splines;              // Copied into the encoder state's splines.
+};
+
+// Value-parameterized fixture; each parameter describes one encode setup.
+class RenderPipelineTestParam
+    : public ::testing::TestWithParam<RenderPipelineTestInputSettings> {};
+
+// Encodes the configured input, decodes it once with each value of
+// `use_slow_pipeline`, and requires both decodes to agree within kMaxError.
+TEST_P(RenderPipelineTestParam, PipelineTest) {
+  const RenderPipelineTestInputSettings config = GetParam();
+#if JXL_HIGH_PRECISION
+  constexpr float kMaxError = 1e-5;
+#else
+  constexpr float kMaxError = 1e-4;
+#endif
+  // Use a parallel runner that randomly shuffles tasks to detect possible
+  // border handling bugs.
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+  const PaddedBytes input_bytes = jxl::test::ReadTestData(config.input_path);
+  const Span<const uint8_t> input(input_bytes);
+
+  CodecInOut io;
+  if (config.jpeg_transcode) {
+    ASSERT_TRUE(jpeg::DecodeImageJPG(input, &io));
+  } else {
+    ASSERT_TRUE(SetFromBytes(input, &io, &pool));
+  }
+  io.ShrinkTo(config.xsize, config.ysize);
+
+  if (config.add_spot_color) {
+    // Synthetic XOR-pattern plane used as a spot-color extra channel.
+    jxl::ImageF spot(config.xsize, config.ysize);
+    jxl::ZeroFillImage(&spot);
+    for (size_t y = 0; y < config.ysize; y++) {
+      float* JXL_RESTRICT row = spot.Row(y);
+      for (size_t x = 0; x < config.xsize; x++) {
+        row[x] = ((x ^ y) & 255) * (1.f / 255.f);
+      }
+    }
+    ExtraChannelInfo info;
+    info.bit_depth.bits_per_sample = 8;
+    info.dim_shift = 0;
+    info.type = jxl::ExtraChannel::kSpotColor;
+    const float kSpotColor[4] = {0.5f, 0.2f, 1.f, 0.5f};
+    for (size_t k = 0; k < 4; k++) info.spot_color[k] = kSpotColor[k];
+    io.metadata.m.extra_channel_info.push_back(info);
+    std::vector<jxl::ImageF> extra_channels;
+    extra_channels.push_back(std::move(spot));
+    io.frames[0].SetExtraChannels(std::move(extra_channels));
+  }
+
+  PassesEncoderState enc_state;
+  enc_state.shared.image_features.splines = config.splines;
+  PaddedBytes compressed;
+  ASSERT_TRUE(EncodeFile(config.cparams, &io, &enc_state, &compressed,
+                         GetJxlCms(), /*aux_out=*/nullptr, &pool));
+  const Span<const uint8_t> codestream(compressed);
+
+  CodecInOut io_default;
+  ASSERT_TRUE(DecodeFile(codestream, /*use_slow_pipeline=*/false, &io_default,
+                         &pool));
+  CodecInOut io_slow_pipeline;
+  ASSERT_TRUE(DecodeFile(codestream, /*use_slow_pipeline=*/true,
+                         &io_slow_pipeline, &pool));
+
+  ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+  for (size_t i = 0; i < io_default.frames.size(); i++) {
+    auto& fast_frame = io_default.frames[i];
+    auto& slow_frame = io_slow_pipeline.frames[i];
+    Image3F fast = std::move(*fast_frame.color());
+    Image3F slow = std::move(*slow_frame.color());
+    JXL_ASSERT_OK(VerifyRelativeError(slow, fast, kMaxError, kMaxError, _));
+    for (size_t ec = 0; ec < fast_frame.extra_channels().size(); ec++) {
+      JXL_ASSERT_OK(VerifyRelativeError(slow_frame.extra_channels()[ec],
+                                        fast_frame.extra_channels()[ec],
+                                        kMaxError, kMaxError, _));
+    }
+  }
+}
+
+// Builds a Splines object holding one quantized test spline.
+Splines CreateTestSplines() {
+  const std::vector<Spline::Point> points{{9, 54},  {118, 159}, {97, 3},
+                                          {10, 40}, {150, 25},  {120, 300}};
+  const Spline test_spline{
+      points,
+      /*color_dct=*/
+      {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}},
+      /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}};
+  const ColorCorrelationMap cmap;
+  std::vector<QuantizedSpline> quantized;
+  std::vector<Spline::Point> starts;
+  for (const Spline& s : std::vector<Spline>{test_spline}) {
+    quantized.emplace_back(s, /*quantization_adjustment=*/0, cmap.YtoXRatio(0),
+                           cmap.YtoBRatio(0));
+    starts.push_back(s.control_points.front());
+  }
+  return Splines(/*quantization_adjustment=*/0, std::move(quantized),
+                 std::move(starts));
+}
+
+std::vector<RenderPipelineTestInputSettings> GeneratePipelineTests() {
+  std::vector<RenderPipelineTestInputSettings> all_tests;
+  // Builds the parameter list for PipelineTest. Sizes are (xsize, ysize).
+  std::pair<size_t, size_t> sizes[] = {
+      {3, 8}, {128, 128}, {256, 256}, {258, 258}, {533, 401}, {777, 777},
+  };
+
+  for (auto size : sizes) {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/flower/flower.png";
+    settings.xsize = size.first;
+    settings.ysize = size.second;
+
+    // Base settings: distance 1.0, XYB, patches/dots/gaborish/EPF all off.
+    settings.cparams.butteraugli_distance = 1.0;
+    settings.cparams.patches = Override::kOff;
+    settings.cparams.dots = Override::kOff;
+    settings.cparams.gaborish = Override::kOff;
+    settings.cparams.epf = 0;
+    settings.cparams.color_transform = ColorTransform::kXYB;
+    // Each scope below copies the base settings, tweaks them and adds a test.
+    {
+      auto s = settings;
+      s.cparams_descr = "NoGabNoEpfNoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.color_transform = ColorTransform::kNone;
+      s.cparams_descr = "NoGabNoEpfNoPatchesNoXYB";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.gaborish = Override::kOn;
+      s.cparams_descr = "GabNoEpfNoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 1;
+      s.cparams_descr = "NoGabEpf1NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 2;
+      s.cparams_descr = "NoGabEpf2NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 3;
+      s.cparams_descr = "NoGabEpf3NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.gaborish = Override::kOn;
+      s.cparams.epf = 3;
+      s.cparams_descr = "GabEpf3NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "Splines";
+      s.splines = CreateTestSplines();
+      all_tests.push_back(s);
+    }
+    // Resampling factors, alone and combined with EPF and Gaborish.
+    for (size_t ups : {2, 4, 8}) {
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams_descr = "Ups" + std::to_string(ups);
+        all_tests.push_back(s);
+      }
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams.epf = 1;
+        s.cparams_descr = "Ups" + std::to_string(ups) + "EPF1";
+        all_tests.push_back(s);
+      }
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams.gaborish = Override::kOn;
+        s.cparams.epf = 1;
+        s.cparams_descr = "Ups" + std::to_string(ups) + "GabEPF1";
+        all_tests.push_back(s);
+      }
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "Noise";
+      s.cparams.photon_noise_iso = 3200;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "NoiseUps";
+      s.cparams.photon_noise_iso = 3200;
+      s.cparams.resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ModularLossless";
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 0;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ProgressiveDC";
+      s.cparams.progressive_dc = 1;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ModularLossy";
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 1.f;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaVarDCT";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaVarDCTUpsamplingEPF";
+      s.cparams.epf = 1;
+      s.cparams.ec_resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 0;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaLossless";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaDownsample";
+      s.cparams.ec_resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "SpotColor";
+      s.add_spot_color = true;
+      all_tests.push_back(s);
+    }
+  }
+  // JPEG transcode inputs; file names suggest 444/420/422/440 subsampling.
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  for (const char* input : {"jxl/flower/flower.png.im_q85_444.jpg",
+                            "jxl/flower/flower.png.im_q85_420.jpg",
+                            "jxl/flower/flower.png.im_q85_422.jpg",
+                            "jxl/flower/flower.png.im_q85_440.jpg"}) {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = input;
+    settings.jpeg_transcode = true;
+    settings.xsize = 2268;
+    settings.ysize = 1512;
+    settings.cparams_descr = "Default";
+    all_tests.push_back(settings);
+  }
+
+#endif
+  // grayscale_patches.png with default, photon-noise and resampling settings.
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams_descr = "Patches";
+    all_tests.push_back(settings);
+  }
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams.photon_noise_iso = 1000;
+    settings.cparams_descr = "PatchesAndNoise";
+    all_tests.push_back(settings);
+  }
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams.resampling = 2;
+    settings.cparams_descr = "PatchesAndUps2";
+    all_tests.push_back(settings);
+  }
+
+  return all_tests;
+}
+
+// Streams a short identifier for `c`: the input file name (the part after the
+// last '/') with every non-alphanumeric character replaced by '_', followed
+// by an optional "JPEG_" tag, the size and the encoder settings name.
+std::ostream& operator<<(std::ostream& os,
+                         const RenderPipelineTestInputSettings& c) {
+  const size_t pos = c.input_path.find_last_of('/');
+  std::string filename =
+      pos == std::string::npos ? c.input_path : c.input_path.substr(pos + 1);
+  // isalnum() is undefined for negative `char` values, so go via unsigned char.
+  std::replace_if(
+      filename.begin(), filename.end(),
+      [](unsigned char ch) { return !isalnum(ch); }, '_');
+  os << filename << "_" << (c.jpeg_transcode ? "JPEG_" : "") << c.xsize << "x"
+     << c.ysize << "_" << c.cparams_descr;
+  return os;
+}
+
+std::string PipelineTestDescription(
+    const testing::TestParamInfo<RenderPipelineTestParam::ParamType>& info) {
+  std::stringstream stream;  // Filled through the parameter's operator<<.
+  stream << info.param;
+  return stream.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RenderPipelineTest, RenderPipelineTestParam,
+                                   testing::ValuesIn(GeneratePipelineTests()),
+                                   PipelineTestDescription);
+
+TEST(RenderPipelineDecodingTest, Animation) {
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+  // Same comparison as PipelineTest, but on a pre-encoded test file.
+  PaddedBytes compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+  // Decode the same bytes once per value of `use_slow_pipeline`.
+  CodecInOut io_default;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/false, &io_default, &pool));
+  CodecInOut io_slow_pipeline;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+  ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+  for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+    constexpr float kMaxError = 1e-5;
+#else
+    constexpr float kMaxError = 1e-4;
+#endif
+    // Color planes first, then every extra channel, must match within bounds.
+    Image3F fast_pipeline = std::move(*io_default.frames[i].color());
+    Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color());
+    JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError,
+                                      kMaxError, _));
+    for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+         ec++) {
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
new file mode 100644
index 0000000000..4495288860
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
@@ -0,0 +1,266 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+
+#include <hwy/base.h>
+
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void SimpleRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                     bool use_group_ids) {
+  if (!channel_data_.empty()) {
+    return;  // Channel buffers were already allocated by an earlier call.
+  }
+  auto ch_size = [](size_t frame_size, size_t shift) {  // Padded channel size.
+    return DivCeil(frame_size, 1 << shift) + kRenderPipelineXOffset * 2;
+  };
+  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+    channel_data_.push_back(ImageF(
+        ch_size(frame_dimensions_.xsize_upsampled, channel_shifts_[0][c].first),
+        ch_size(frame_dimensions_.ysize_upsampled,
+                channel_shifts_[0][c].second)));
+    msan::PoisonImage(channel_data_.back());
+  }
+}
+
+Rect SimpleRenderPipeline::MakeChannelRect(size_t group_id, size_t channel) {
+  size_t base_color_shift =
+      CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+                      frame_dimensions_.xsize_padded);  // ceil(log2(ratio))
+  // Group ids are laid out row-major, xsize_groups groups per row.
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t xgroupdim = (frame_dimensions_.group_dim << base_color_shift) >>
+                     channel_shifts_[0][channel].first;
+  size_t ygroupdim = (frame_dimensions_.group_dim << base_color_shift) >>
+                     channel_shifts_[0][channel].second;
+  return Rect(
+      kRenderPipelineXOffset + gx * xgroupdim,
+      kRenderPipelineXOffset + gy * ygroupdim, xgroupdim, ygroupdim,
+      kRenderPipelineXOffset + DivCeil(frame_dimensions_.xsize_upsampled,
+                                       1 << channel_shifts_[0][channel].first),
+      kRenderPipelineXOffset +
+          DivCeil(frame_dimensions_.ysize_upsampled,
+                  1 << channel_shifts_[0][channel].second));
+}
+
+std::vector<std::pair<ImageF*, Rect>> SimpleRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {  // thread_id is not used here.
+  std::vector<std::pair<ImageF*, Rect>> ret;  // One (image, rect) per channel.
+  for (size_t c = 0; c < channel_data_.size(); c++) {
+    ret.emplace_back(&channel_data_[c], MakeChannelRect(group_id, c));
+  }
+  return ret;
+}
+
+void SimpleRenderPipeline::ProcessBuffers(size_t group_id, size_t thread_id) {
+  for (size_t c = 0; c < channel_data_.size(); c++) {
+    Rect r = MakeChannelRect(group_id, c);
+    (void)r;
+    JXL_CHECK_PLANE_INITIALIZED(channel_data_[c], r, c);
+  }
+  // Proceed only if PassesWithAllInput() exceeds the passes already processed.
+  if (PassesWithAllInput() <= processed_passes_) return;
+  processed_passes_++;
+
+  for (size_t stage_id = 0; stage_id < stages_.size(); stage_id++) {
+    const auto& stage = stages_[stage_id];
+    // Prepare buffers for kInOut channels.
+    std::vector<ImageF> new_channels(channel_data_.size());
+    std::vector<ImageF*> output_channels(channel_data_.size());
+    // Per-channel input size, excluding the kRenderPipelineXOffset border.
+    std::vector<std::pair<size_t, size_t>> input_sizes(channel_data_.size());
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(channel_data_[c].xsize() - kRenderPipelineXOffset * 2,
+                         channel_data_[c].ysize() - kRenderPipelineXOffset * 2);
+    }
+
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      // Ensure that the newly allocated channels are large enough to avoid
+      // problems with padding.
+      new_channels[c] =
+          ImageF(frame_dimensions_.xsize_upsampled_padded +
+                     kRenderPipelineXOffset * 2 + hwy::kMaxVectorSize * 8,
+                 frame_dimensions_.ysize_upsampled_padded +
+                     kRenderPipelineXOffset * 2);
+      new_channels[c].ShrinkTo(
+          (input_sizes[c].first << stage->settings_.shift_x) +
+              kRenderPipelineXOffset * 2,
+          (input_sizes[c].second << stage->settings_.shift_y) +
+              kRenderPipelineXOffset * 2);
+      output_channels[c] = &new_channels[c];
+    }
+    // Row pointer to pixel (0, y) of channel c, skipping the border offset.
+    auto get_row = [&](size_t c, int64_t y) {
+      return channel_data_[c].Row(kRenderPipelineXOffset + y) +
+             kRenderPipelineXOffset;
+    };
+
+    // Add mirrored pixels to all kInOut channels.
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      // Horizontal mirroring.
+      for (size_t y = 0; y < input_sizes[c].second; y++) {
+        float* row = get_row(c, y);
+        for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+          *(row - ix - 1) = row[Mirror(-ssize_t(ix) - 1, input_sizes[c].first)];
+        }
+        for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+          *(row + ix + input_sizes[c].first) =
+              row[Mirror(ix + input_sizes[c].first, input_sizes[c].first)];
+        }
+      }
+      // Vertical mirroring.
+      for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+        memcpy(get_row(c, -y - 1) - stage->settings_.border_x,
+               get_row(c, Mirror(-ssize_t(y) - 1, input_sizes[c].second)) -
+                   stage->settings_.border_x,
+               sizeof(float) *
+                   (input_sizes[c].first + 2 * stage->settings_.border_x));
+      }
+      for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+        memcpy(
+            get_row(c, input_sizes[c].second + y) - stage->settings_.border_x,
+            get_row(c,
+                    Mirror(input_sizes[c].second + y, input_sizes[c].second)) -
+                stage->settings_.border_x,
+            sizeof(float) *
+                (input_sizes[c].first + 2 * stage->settings_.border_x));
+      }
+    }
+    // The stage iterates over the largest extent among non-ignored channels.
+    size_t ysize = 0;
+    size_t xsize = 0;
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      ysize = std::max(input_sizes[c].second, ysize);
+      xsize = std::max(input_sizes[c].first, xsize);
+    }
+
+    JXL_ASSERT(ysize != 0);
+    JXL_ASSERT(xsize != 0);
+
+    RenderPipelineStage::RowInfo input_rows(channel_data_.size());
+    RenderPipelineStage::RowInfo output_rows(channel_data_.size());
+
+    // Run the pipeline.
+    {
+      stage->SetInputSizes(input_sizes);
+      int border_y = stage->settings_.border_y;
+      for (size_t y = 0; y < ysize; y++) {
+        // Prepare input rows.
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+            continue;
+          }
+          input_rows[c].resize(2 * border_y + 1);
+          for (int iy = -border_y; iy <= border_y; iy++) {
+            input_rows[c][iy + border_y] =
+                channel_data_[c].Row(y + kRenderPipelineXOffset + iy);
+          }
+        }
+        // Prepare output rows.
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          if (!output_channels[c]) continue;
+          output_rows[c].resize(1 << stage->settings_.shift_y);
+          for (size_t iy = 0; iy < output_rows[c].size(); iy++) {
+            output_rows[c][iy] = output_channels[c]->Row(
+                (y << stage->settings_.shift_y) + iy + kRenderPipelineXOffset);
+          }
+        }
+        stage->ProcessRow(input_rows, output_rows, /*xextra=*/0, xsize,
+                          /*xpos=*/0, y, thread_id);
+      }
+    }
+
+    // Move new channels to current channels.
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      channel_data_[c] = std::move(new_channels[c]);
+    }
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      size_t next_stage = std::min(stage_id + 1, channel_shifts_.size() - 1);
+      size_t xsize = DivCeil(frame_dimensions_.xsize_upsampled,
+                             1 << channel_shifts_[next_stage][c].first);
+      size_t ysize = DivCeil(frame_dimensions_.ysize_upsampled,
+                             1 << channel_shifts_[next_stage][c].second);
+      channel_data_[c].ShrinkTo(xsize + 2 * kRenderPipelineXOffset,
+                                ysize + 2 * kRenderPipelineXOffset);
+      JXL_CHECK_PLANE_INITIALIZED(
+          channel_data_[c],
+          Rect(kRenderPipelineXOffset, kRenderPipelineXOffset, xsize, ysize),
+          c);
+    }
+    // Re-base the buffers from frame coordinates to full-image coordinates.
+    if (stage->SwitchToImageDimensions()) {
+      size_t image_xsize, image_ysize;
+      FrameOrigin frame_origin;
+      stage->GetImageDimensions(&image_xsize, &image_ysize, &frame_origin);
+      frame_dimensions_.Set(image_xsize, image_ysize, 0, 0, 0, false, 1);
+      std::vector<ImageF> old_channels = std::move(channel_data_);
+      channel_data_.clear();
+      channel_data_.reserve(old_channels.size());
+      for (size_t c = 0; c < old_channels.size(); c++) {
+        channel_data_.emplace_back(2 * kRenderPipelineXOffset + image_xsize,
+                                   2 * kRenderPipelineXOffset + image_ysize);
+      }
+      for (size_t y = 0; y < image_ysize; ++y) {
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          output_rows[c].resize(1);
+          output_rows[c][0] = channel_data_[c].Row(kRenderPipelineXOffset + y);
+        }
+        // TODO(sboukortt): consider doing this only on the parts of the
+        // background that won't be occluded.
+        stage->ProcessPaddingRow(output_rows, image_xsize, 0, y);
+      }
+      ssize_t x0 = frame_origin.x0;  // Clip the frame to the image bounds.
+      ssize_t y0 = frame_origin.y0;
+      size_t x0_fg = 0;  // Top-left of the visible part within the frame.
+      size_t y0_fg = 0;
+      if (x0 < 0) {
+        xsize += x0;
+        x0_fg -= x0;  // Skip the frame columns left of the image.
+        x0 = 0;
+      }
+      if (x0 + xsize > image_xsize) {
+        xsize = image_xsize - x0;
+      }
+      if (y0 < 0) {
+        ysize += y0;
+        y0_fg -= y0;  // Skip the frame rows that lie above the image.
+        y0 = 0;
+      }
+      if (y0 + ysize > image_ysize) {
+        ysize = image_ysize - y0;
+      }
+      const Rect rect_fg_relative_to_image =
+          Rect(x0, y0, xsize, ysize)
+              .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+      const Rect rect_fg =
+          Rect(x0_fg, y0_fg, xsize, ysize)
+              .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+      for (size_t c = 0; c < channel_data_.size(); c++) {
+        CopyImageTo(rect_fg, old_channels[c], rect_fg_relative_to_image,
+                    &channel_data_[c]);
+      }
+    }
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
new file mode 100644
index 0000000000..10f4505912
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A RenderPipeline that is "obviously correct"; it may use potentially large
+// amounts of memory and be slow. It is intended to be used mostly for testing
+// purposes.
+class SimpleRenderPipeline : public RenderPipeline {
+  std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) override;
+  // Runs every stage over the full-frame buffers (see processed_passes_).
+  void ProcessBuffers(size_t group_id, size_t thread_id) override;
+  // Allocates channel_data_ on the first call; later calls do nothing.
+  void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+  // Full frame buffers. Both X and Y dimensions are padded by
+  // kRenderPipelineXOffset.
+  std::vector<ImageF> channel_data_;
+  size_t processed_passes_ = 0;  // Passes already run through the stages.
+
+ private:
+  Rect MakeChannelRect(size_t group_id, size_t channel);
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
new file mode 100644
index 0000000000..b6668c5625
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
@@ -0,0 +1,247 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_blending.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_blending.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/blending.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class BlendingStage : public RenderPipelineStage {
+ public:
+  explicit BlendingStage(const PassesDecoderState* dec_state,
+                         const ColorEncoding& frame_color_encoding)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        state_(*dec_state->shared) {
+    image_xsize_ = state_.frame_header.nonserialized_metadata->xsize();
+    image_ysize_ = state_.frame_header.nonserialized_metadata->ysize();
+    extra_channel_info_ =
+        &state_.frame_header.nonserialized_metadata->m.extra_channel_info;
+    info_ = state_.frame_header.blending_info;
+    const std::vector<BlendingInfo>& ec_info =
+        state_.frame_header.extra_channel_blending_info;
+    const ImageBundle& bg = state_.reference_frames[info_.source].frame;
+    bg_ = &bg;
+    if (bg.xsize() == 0 || bg.ysize() == 0) {
+      zeroes_.resize(image_xsize_, 0.f);
+    } else if (state_.reference_frames[info_.source].ib_is_in_xyb) {
+      initialized_ = JXL_FAILURE(
+          "Trying to blend XYB reference frame %i and non-XYB frame",
+          info_.source);
+      return;
+    } else if (std::any_of(ec_info.begin(), ec_info.end(),
+                           [this](const BlendingInfo& info) {
+                             const ImageBundle& bg =
+                                 state_.reference_frames[info.source].frame;
+                             return bg.xsize() == 0 || bg.ysize() == 0;
+                           })) {
+      zeroes_.resize(image_xsize_, 0.f);
+    }
+    // A non-empty background must cover the whole image and sit at (0, 0).
+    auto verify_bg_size = [&](const ImageBundle& bg) -> Status {
+      if (bg.xsize() != 0 && bg.ysize() != 0 &&
+          (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ ||
+           bg.origin.x0 != 0 || bg.origin.y0 != 0)) {
+        return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS
+                           " crop as a background",
+                           bg.xsize(), bg.ysize());
+      }
+      return true;
+    };
+    // Check the color background and every extra channel's background.
+    Status ok = verify_bg_size(bg);
+    for (const auto& info : ec_info) {
+      const ImageBundle& bg = state_.reference_frames[info.source].frame;
+      if (!!ok) ok = verify_bg_size(bg);
+    }
+    if (!ok) {
+      initialized_ = ok;
+      return;
+    }
+
+    if (state_.metadata->m.xyb_encoded) {
+      if (!dec_state->output_encoding_info.color_encoding_is_original) {
+        initialized_ = JXL_FAILURE("Blending in unsupported color space");
+        return;
+      }
+    }
+    // Entry 0 blends color; entry 1 + i blends extra channel i.
+    blending_info_.resize(ec_info.size() + 1);
+    auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) {
+      pb->alpha_channel = info.alpha_channel;
+      pb->clamp = info.clamp;
+      switch (info.mode) {
+        case BlendMode::kReplace: {
+          pb->mode = PatchBlendMode::kReplace;
+          break;
+        }
+        case BlendMode::kAdd: {
+          pb->mode = PatchBlendMode::kAdd;
+          break;
+        }
+        case BlendMode::kMul: {
+          pb->mode = PatchBlendMode::kMul;
+          break;
+        }
+        case BlendMode::kBlend: {
+          pb->mode = PatchBlendMode::kBlendAbove;
+          break;
+        }
+        case BlendMode::kAlphaWeightedAdd: {
+          pb->mode = PatchBlendMode::kAlphaWeightedAddAbove;
+          break;
+        }
+        default: {
+          JXL_ABORT("Invalid blend mode");  // should have failed to decode
+        }
+      }
+    };
+    make_blending(info_, &blending_info_[0]);
+    for (size_t i = 0; i < ec_info.size(); i++) {
+      make_blending(ec_info[i], &blending_info_[1 + i]);
+    }
+  }
+
+  Status IsInitialized() const override { return initialized_; }
+  // Blends the part of this row that overlaps the image with the background.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Blend");
+    JXL_ASSERT(initialized_);
+    const FrameOrigin& frame_origin = state_.frame_header.frame_origin;
+    ssize_t bg_xpos = frame_origin.x0 + static_cast<ssize_t>(xpos);
+    ssize_t bg_ypos = frame_origin.y0 + static_cast<ssize_t>(ypos);
+    int offset = 0;
+    if (bg_xpos + static_cast<ssize_t>(xsize) <= 0 ||
+        frame_origin.x0 >= static_cast<ssize_t>(image_xsize_) || bg_ypos < 0 ||
+        bg_ypos >= static_cast<ssize_t>(image_ysize_)) {
+      return;  // The row does not intersect the image.
+    }
+    if (bg_xpos < 0) {  // Clip the part that lies left of the image.
+      offset -= bg_xpos;
+      xsize += bg_xpos;
+      bg_xpos = 0;
+    }
+    if (bg_xpos + xsize > image_xsize_) {  // Clip at the right image edge.
+      xsize =
+          std::max<ssize_t>(0, static_cast<ssize_t>(image_xsize_) - bg_xpos);
+    }
+    std::vector<const float*> bg_row_ptrs_(input_rows.size());
+    std::vector<float*> fg_row_ptrs_(input_rows.size());
+    size_t num_c = std::min(input_rows.size(), extra_channel_info_->size() + 3);
+    for (size_t c = 0; c < num_c; ++c) {
+      fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset;
+      if (c < 3) {
+        bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0
+                              ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos
+                              : zeroes_.data();
+      } else {
+        const ImageBundle& ec_bg =
+            state_
+                .reference_frames[state_.frame_header
+                                      .extra_channel_blending_info[c - 3]
+                                      .source]
+                .frame;
+        bg_row_ptrs_[c] =
+            ec_bg.xsize() != 0 && ec_bg.ysize() != 0
+                ? ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos
+                : zeroes_.data();
+      }
+    }
+    PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(),
+                    fg_row_ptrs_.data(), 0, xsize, blending_info_[0],
+                    blending_info_.data() + 1, *extra_channel_info_);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  bool SwitchToImageDimensions() const override { return true; }
+
+  void GetImageDimensions(size_t* xsize, size_t* ysize,
+                          FrameOrigin* frame_origin) const override {
+    *xsize = image_xsize_;
+    *ysize = image_ysize_;
+    *frame_origin = state_.frame_header.frame_origin;
+  }
+  // Writes the background (or zeros when it is empty) into `output_rows`.
+  void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, size_t xpos,
+                         size_t ypos) const override {
+    if (bg_->xsize() == 0 || bg_->ysize() == 0) {  // No color background.
+      for (size_t c = 0; c < 3; ++c) {
+        memset(GetInputRow(output_rows, c, 0), 0, xsize * sizeof(float));
+      }
+    } else {
+      for (size_t c = 0; c < 3; ++c) {
+        memcpy(GetInputRow(output_rows, c, 0),
+               bg_->color().ConstPlaneRow(c, ypos) + xpos,
+               xsize * sizeof(float));
+      }
+    }
+    for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) {
+      const ImageBundle& ec_bg =
+          state_
+              .reference_frames
+                  [state_.frame_header.extra_channel_blending_info[ec].source]
+              .frame;
+      if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) {
+        memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float));
+      } else {
+        memcpy(GetInputRow(output_rows, 3 + ec, 0),
+               ec_bg.extra_channels()[ec].ConstRow(ypos) + xpos,
+               xsize * sizeof(float));
+      }
+    }
+  }
+
+  const char* GetName() const override { return "Blending"; }
+
+ private:
+  const PassesSharedState& state_;
+  BlendingInfo info_;
+  const ImageBundle* bg_;
+  Status initialized_ = true;
+  size_t image_xsize_;
+  size_t image_ysize_;
+  std::vector<PatchBlending> blending_info_;
+  const std::vector<ExtraChannelInfo>* extra_channel_info_;
+  std::vector<float> zeroes_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding) {  // Compiled per target.
+  return jxl::make_unique<BlendingStage>(dec_state, frame_color_encoding);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetBlendingStage);
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding) {  // Forwards via HWY dispatch.
+  return HWY_DYNAMIC_DISPATCH(GetBlendingStage)(dec_state,
+                                                frame_color_encoding);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
new file mode 100644
index 0000000000..c8db7490cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+
+#include <utility>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Applies blending if applicable.
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
new file mode 100644
index 0000000000..9b73ee91f1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_chroma_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class HorizontalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit HorizontalChromaUpsamplingStage(size_t channel)
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(
+            /*shift=*/1, /*border=*/1)),
+        c_(channel) {}
+  // Each input sample yields two outputs: 0.75*cur + 0.25*{prev, next}.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("HorizontalChromaUpsampling");
+    HWY_FULL(float) df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    auto threefour = Set(df, 0.75f);
+    auto onefour = Set(df, 0.25f);
+    const float* row_in = GetInputRow(input_rows, c_, 0);
+    float* row_out = GetOutputRow(output_rows, c_, 0);
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      auto current = Mul(LoadU(df, row_in + x), threefour);
+      auto prev = LoadU(df, row_in + x - 1);
+      auto next = LoadU(df, row_in + x + 1);
+      auto left = MulAdd(onefour, prev, current);  // 0.25*prev + 0.75*cur
+      auto right = MulAdd(onefour, next, current);  // 0.25*next + 0.75*cur
+      StoreInterleaved(df, left, right, row_out + x * 2);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "HChromaUps"; }
+
+ private:
+  size_t c_;  // Index of the only channel this stage processes.
+};
+
+class VerticalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit VerticalChromaUpsamplingStage(size_t channel)
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(
+            /*shift=*/1, /*border=*/1)),
+        c_(channel) {}
+  // Each input row yields two output rows: 0.75*mid + 0.25*{top, bottom}.
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("VerticalChromaUpsampling");
+    HWY_FULL(float) df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    auto threefour = Set(df, 0.75f);
+    auto onefour = Set(df, 0.25f);
+    const float* row_top = GetInputRow(input_rows, c_, -1);
+    const float* row_mid = GetInputRow(input_rows, c_, 0);
+    const float* row_bot = GetInputRow(input_rows, c_, 1);
+    float* row_out0 = GetOutputRow(output_rows, c_, 0);
+    float* row_out1 = GetOutputRow(output_rows, c_, 1);
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      auto it = LoadU(df, row_top + x);
+      auto im = LoadU(df, row_mid + x);
+      auto ib = LoadU(df, row_bot + x);
+      auto im_scaled = Mul(im, threefour);
+      Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
+      Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "VChromaUps"; }
+
+ private:
+  size_t c_;  // Index of the only channel this stage processes.
+};
+
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal) {
+  if (horizontal) {  // Upsample along x; otherwise along y.
+    return jxl::make_unique<HorizontalChromaUpsamplingStage>(channel);
+  } else {
+    return jxl::make_unique<VerticalChromaUpsamplingStage>(channel);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetChromaUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal) {
+  return HWY_DYNAMIC_DISPATCH(GetChromaUpsamplingStage)(channel, horizontal);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
new file mode 100644
index 0000000000..b8bfc15f5f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies simple upsampling, either horizontal or vertical, to the given
+// channel.
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
new file mode 100644
index 0000000000..d59c497843
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
@@ -0,0 +1,524 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_epf.h"
+
+#include "lib/jxl/epf.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_epf.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+// TODO(veluca): In principle, the vectors need not be capped, provided we
+// handle two different sigma values within a single vector.
+using DF = HWY_CAPPED(float, 8);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::VFromD;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+JXL_INLINE Vec<DF> Weight(Vec<DF> sad, Vec<DF> inv_sigma, Vec<DF> thres) {
+  // Clamped linear falloff: max(0, 1 + sad * inv_sigma). `thres` is unused.
+  return ZeroIfNegative(MulAdd(sad, inv_sigma, Set(DF(), 1.0f)));
+}
+
+// First EPF pass: 12 neighbours (5x5 diamond minus centre), each weighted by a
+// 5-sample plus-shaped SAD, so up to 3 border pixels are read (7x7 support).
+class EPF0Stage : public RenderPipelineStage {
+ public:
+  EPF0Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/3)),
+        lf_(lf),
+        sigma_(&sigma) {}
+
+  template <bool aligned>  // selects Load (true) or LoadU (false) below
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][7], ssize_t x,
+                           Vec<DF> sad, Vec<DF> inv_sigma,
+                           Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+                           Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][3 + row] + x)
+                      : LoadU(DF(), rows[0][3 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][3 + row] + x)
+                      : LoadU(DF(), rows[1][3 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][3 + row] + x)
+                      : LoadU(DF(), rows[2][3 + row] + x);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+    *w = Add(*w, weight);  // running total, used to normalize the result
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+
+    using V = decltype(Zero(df));
+    V t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA, tB;
+    V* sads[12] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA, &tB};
+
+    xextra = RoundUpTo(xextra, Lanes(df));  // only whole vectors are processed
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = lf_.epf_pass0_sigma_scale * 1.65;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+    float* JXL_RESTRICT rows[3][7];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 7; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 3);
+      }
+    }
+
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;  // column within the block
+
+      if (row_sigma[bx] < kMinSigma) {  // filter off here: copy through
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][3 + 0] + x);
+          StoreU(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+      for (size_t i = 0; i < 12; i++) *sads[i] = Zero(df);
+      constexpr std::array<int, 2> sads_off[12] = {
+          {{-2, 0}}, {{-1, -1}}, {{-1, 0}}, {{-1, 1}}, {{0, -2}}, {{0, -1}},
+          {{0, 1}},  {{0, 2}},   {{1, -1}}, {{1, 0}},  {{1, 1}},  {{2, 0}},
+      };
+
+      // Per neighbour: channel-scaled SAD of its plus patch vs the centre's.
+      // TODO(veluca): consider unrolling and optimizing this.
+      for (size_t c = 0; c < 3; c++) {
+        auto scale = Set(df, lf_.epf_channel_scale[c]);
+        for (size_t i = 0; i < 12; i++) {
+          auto sad = Zero(df);
+          constexpr std::array<int, 2> plus_off[] = {
+              {{0, 0}}, {{-1, 0}}, {{0, -1}}, {{1, 0}}, {{0, 1}}};
+          for (size_t j = 0; j < 5; j++) {
+            const auto r11 =
+                LoadU(df, rows[c][3 + plus_off[j][0]] + x + plus_off[j][1]);
+            const auto c11 =
+                LoadU(df, rows[c][3 + sads_off[i][0] + plus_off[j][0]] + x +
+                              sads_off[i][1] + plus_off[j][1]);
+            sad = Add(sad, AbsDiff(r11, c11));
+          }
+          *sads[i] = MulAdd(sad, scale, *sads[i]);
+        }
+      }
+      const auto x_cc = Load(df, rows[0][3 + 0] + x);
+      const auto y_cc = Load(df, rows[1][3 + 0] + x);
+      const auto b_cc = Load(df, rows[2][3 + 0] + x);
+
+      auto w = Set(df, 1);  // the centre pixel enters with weight 1
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      for (size_t i = 0; i < 12; i++) {
+        AddPixel</*aligned=*/false>(/*row=*/sads_off[i][0], rows,
+                                    x + sads_off[i][1], *sads[i], inv_sigma, &X,
+                                    &Y, &B, &w);
+      }
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      StoreU(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      StoreU(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      StoreU(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF0"; }
+
+ private:
+  LoopFilter lf_;  // copied at construction
+  const ImageF* sigma_;  // not owned; must outlive this stage
+};
+
+// Second EPF pass: the 4 direct neighbours, each weighted by a 5-sample
+// plus-shaped SAD, so up to 2 border pixels are read (5x5 support).
+class EPF1Stage : public RenderPipelineStage {
+ public:
+  EPF1Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/2)),
+        lf_(lf),
+        sigma_(&sigma) {}
+
+  template <bool aligned>  // selects Load (true) or LoadU (false) below
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][5], ssize_t x,
+                           Vec<DF> sad, Vec<DF> inv_sigma,
+                           Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+                           Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][2 + row] + x)
+                      : LoadU(DF(), rows[0][2 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][2 + row] + x)
+                      : LoadU(DF(), rows[1][2 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][2 + row] + x)
+                      : LoadU(DF(), rows[2][2 + row] + x);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+    *w = Add(*w, weight);  // running total, used to normalize the result
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    xextra = RoundUpTo(xextra, Lanes(df));  // only whole vectors are processed
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = 1.65f;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+
+    float* JXL_RESTRICT rows[3][5];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 5; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 2);
+      }
+    }
+
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;  // column within the block
+
+      if (row_sigma[bx] < kMinSigma) {  // filter off here: copy through
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][2 + 0] + x);
+          Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+      auto sad0 = Zero(df);
+      auto sad1 = Zero(df);
+      auto sad2 = Zero(df);
+      auto sad3 = Zero(df);
+
+      // Hand-unrolled SADs; sad0..3 = neighbour above, left, right, below.
+      for (size_t c = 0; c < 3; c++) {
+        // pXY = window column X, row Y (0..4); centre is p22, above is p21.
+        auto t = Undefined(df);
+
+        const auto p20 = Load(df, rows[c][2 + -2] + x);
+        const auto p21 = Load(df, rows[c][2 + -1] + x);
+        auto sad0c = AbsDiff(p20, p21);  // SAD 2, 1
+
+        const auto p11 = LoadU(df, rows[c][2 + -1] + x - 1);
+        auto sad1c = AbsDiff(p11, p21);  // SAD 1, 2
+
+        const auto p31 = LoadU(df, rows[c][2 + -1] + x + 1);
+        auto sad2c = AbsDiff(p31, p21);  // SAD 3, 2
+
+        const auto p02 = LoadU(df, rows[c][2 + 0] + x - 2);
+        const auto p12 = LoadU(df, rows[c][2 + 0] + x - 1);
+        sad1c = Add(sad1c, AbsDiff(p02, p12));  // SAD 1, 2
+        sad0c = Add(sad0c, AbsDiff(p11, p12));  // SAD 2, 1
+
+        const auto p22 = LoadU(df, rows[c][2 + 0] + x);
+        t = AbsDiff(p12, p22);
+        sad1c = Add(sad1c, t);  // SAD 1, 2
+        sad2c = Add(sad2c, t);  // SAD 3, 2
+        t = AbsDiff(p22, p21);
+        auto sad3c = t;  // SAD 2, 3
+        sad0c = Add(sad0c, t);  // SAD 2, 1
+
+        const auto p32 = LoadU(df, rows[c][2 + 0] + x + 1);
+        sad0c = Add(sad0c, AbsDiff(p31, p32));  // SAD 2, 1
+        t = AbsDiff(p22, p32);
+        sad1c = Add(sad1c, t);  // SAD 1, 2
+        sad2c = Add(sad2c, t);  // SAD 3, 2
+
+        const auto p42 = LoadU(df, rows[c][2 + 0] + x + 2);
+        sad2c = Add(sad2c, AbsDiff(p42, p32));  // SAD 3, 2
+
+        const auto p13 = LoadU(df, rows[c][2 + 1] + x - 1);
+        sad3c = Add(sad3c, AbsDiff(p13, p12));  // SAD 2, 3
+
+        const auto p23 = Load(df, rows[c][2 + 1] + x);
+        t = AbsDiff(p22, p23);
+        sad0c = Add(sad0c, t);                  // SAD 2, 1
+        sad3c = Add(sad3c, t);                  // SAD 2, 3
+        sad1c = Add(sad1c, AbsDiff(p13, p23));  // SAD 1, 2
+
+        const auto p33 = LoadU(df, rows[c][2 + 1] + x + 1);
+        sad2c = Add(sad2c, AbsDiff(p33, p23));  // SAD 3, 2
+        sad3c = Add(sad3c, AbsDiff(p33, p32));  // SAD 2, 3
+
+        const auto p24 = Load(df, rows[c][2 + 2] + x);
+        sad3c = Add(sad3c, AbsDiff(p24, p23));  // SAD 2, 3
+
+        auto scale = Set(df, lf_.epf_channel_scale[c]);
+        sad0 = MulAdd(sad0c, scale, sad0);
+        sad1 = MulAdd(sad1c, scale, sad1);
+        sad2 = MulAdd(sad2c, scale, sad2);
+        sad3 = MulAdd(sad3c, scale, sad3);
+      }
+      const auto x_cc = Load(df, rows[0][2 + 0] + x);
+      const auto y_cc = Load(df, rows[1][2 + 0] + x);
+      const auto b_cc = Load(df, rows[2][2 + 0] + x);
+
+      auto w = Set(df, 1);  // the centre pixel enters with weight 1
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      // Neighbour above.
+      AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, sad0, inv_sigma, &X, &Y,
+                                 &B, &w);
+      // Left and right neighbours (x -/+ 1, so unaligned loads).
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, sad1, inv_sigma, &X,
+                                  &Y, &B, &w);
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, sad2, inv_sigma, &X,
+                                  &Y, &B, &w);
+      // Neighbour below.
+      AddPixel</*aligned=*/true>(/*row=*/1, rows, x, sad3, inv_sigma, &X, &Y,
+                                 &B, &w);
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF1"; }
+
+ private:
+  LoopFilter lf_;  // copied at construction
+  const ImageF* sigma_;  // not owned; must outlive this stage
+};
+
+// Third EPF pass: the 4 direct neighbours, weighted by the single-pixel
+// channel-scaled difference to the centre, so 1 border pixel is read (3x3).
+class EPF2Stage : public RenderPipelineStage {
+ public:
+  EPF2Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/1)),
+        lf_(lf),
+        sigma_(&sigma) {}
+
+  template <bool aligned>  // selects Load (true) or LoadU (false) below
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][3], ssize_t x,
+                           Vec<DF> rx, Vec<DF> ry, Vec<DF> rb,
+                           Vec<DF> inv_sigma, Vec<DF>* JXL_RESTRICT X,
+                           Vec<DF>* JXL_RESTRICT Y, Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][1 + row] + x)
+                      : LoadU(DF(), rows[0][1 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][1 + row] + x)
+                      : LoadU(DF(), rows[1][1 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][1 + row] + x)
+                      : LoadU(DF(), rows[2][1 + row] + x);
+
+    auto sad = Mul(AbsDiff(cx, rx), Set(DF(), lf_.epf_channel_scale[0]));
+    sad = MulAdd(AbsDiff(cy, ry), Set(DF(), lf_.epf_channel_scale[1]), sad);
+    sad = MulAdd(AbsDiff(cb, rb), Set(DF(), lf_.epf_channel_scale[2]), sad);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass2_zeroflush));
+
+    *w = Add(*w, weight);  // running total, used to normalize the result
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    xextra = RoundUpTo(xextra, Lanes(df));  // only whole vectors are processed
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = lf_.epf_pass2_sigma_scale * 1.65;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+
+    float* JXL_RESTRICT rows[3][3];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 3; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 1);
+      }
+    }
+
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;  // column within the block
+
+      if (row_sigma[bx] < kMinSigma) {  // filter off here: copy through
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][1 + 0] + x);
+          Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+      const auto x_cc = Load(df, rows[0][1 + 0] + x);
+      const auto y_cc = Load(df, rows[1][1 + 0] + x);
+      const auto b_cc = Load(df, rows[2][1 + 0] + x);
+
+      auto w = Set(df, 1);  // the centre pixel enters with weight 1
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      // Neighbour above.
+      AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, x_cc, y_cc, b_cc,
+                                 inv_sigma, &X, &Y, &B, &w);
+      // Left and right neighbours (x -/+ 1, so unaligned loads).
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc,
+                                  inv_sigma, &X, &Y, &B, &w);
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc,
+                                  inv_sigma, &X, &Y, &B, &w);
+      // Neighbour below.
+      AddPixel</*aligned=*/true>(/*row=*/1, rows, x, x_cc, y_cc, b_cc,
+                                 inv_sigma, &X, &Y, &B, &w);
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF2"; }
+
+ private:
+  LoopFilter lf_;  // copied at construction
+  const ImageF* sigma_;  // not owned; must outlive this stage
+};
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage0(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF0Stage>(lf, sigma);  // first pass, border 3
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage1(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF1Stage>(lf, sigma);  // second pass, border 2
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage2(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF2Stage>(lf, sigma);  // third pass, border 1
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetEPFStage0);
+HWY_EXPORT(GetEPFStage1);
+HWY_EXPORT(GetEPFStage2);
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+                                                 const ImageF& sigma,
+                                                 size_t epf_stage) {
+  // The loop filter must request at least one EPF iteration.
+  JXL_ASSERT(lf.epf_iters != 0);
+  // Each pass has its own per-target factory; select it by pass index.
+  if (epf_stage == 0) {
+    return HWY_DYNAMIC_DISPATCH(GetEPFStage0)(lf, sigma);
+  } else if (epf_stage == 1) {
+    return HWY_DYNAMIC_DISPATCH(GetEPFStage1)(lf, sigma);
+  } else if (epf_stage == 2) {
+    return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma);
+  }
+  JXL_ABORT("Invalid EPF stage");
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
new file mode 100644
index 0000000000..c9d0d0c785
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies the `epf_stage`-th EPF step with the given settings and `sigma`.
+// `sigma` will be accessed with an offset of (kSigmaPadding, kSigmaPadding),
+// and should have (kSigmaBorder, kSigmaBorder) mirrored sigma values available
+// around the main image. See also filters.(h|cc)
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+                                                 const ImageF& sigma,
+                                                 size_t epf_stage);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
new file mode 100644
index 0000000000..c7b22c663b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
@@ -0,0 +1,191 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_from_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_from_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+// Lifts a single-value op to one that updates r, g and b independently.
+template <typename Op>
+struct PerChannelOp {
+  explicit PerChannelOp(Op op) : op(op) {}
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    T* const channels[3] = {r, g, b};
+    for (T* channel : channels) *channel = op.Transform(d, *channel);
+  }
+
+  Op op;
+};
+template <typename Op>  // Deduction helper: wraps `op` in a PerChannelOp.
+auto MakePerChannelOp(Op&& op) -> PerChannelOp<Op> {
+  return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+// Identity op for linear output: values pass through untouched.
+struct OpLinear {
+  // The tag argument is unused; it keeps the signature uniform across ops.
+  template <typename D, typename T>
+  T Transform(D /*d*/, const T& linear) const { return linear; }
+};
+
+struct OpRgb {  // sRGB: FastLinearToSRGB unless JXL_HIGH_PRECISION is set.
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+#if !JXL_HIGH_PRECISION
+    return FastLinearToSRGB(d, linear);
+#else
+    return TF_SRGB().EncodedFromDisplay(d, linear);
+#endif
+  }
+};
+
+struct OpPq {  // Encodes each value with TF_PQ.
+  template <typename Tag, typename V>
+  V Transform(Tag tag, const V& linear) const {
+    return TF_PQ().EncodedFromDisplay(tag, linear);
+  }
+};
+
+// HLG output: joint OOTF on (r, g, b), then per-channel TF_HLG encoding.
+struct OpHlg {
+  explicit OpHlg(const float luminances[3], const float intensity_target)
+      : hlg_ootf_(HlgOOTF::ToSceneLight(/*display_luminance=*/intensity_target,
+                                        luminances)) {}
+
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    hlg_ootf_.Apply(r, g, b);
+    T* const channels[3] = {r, g, b};
+    for (T* ch : channels) *ch = TF_HLG().EncodedFromDisplay(d, *ch);
+  }
+  HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {  // Encodes each value with TF_709.
+  template <typename Tag, typename V>
+  V Transform(Tag tag, const V& linear) const {
+    return TF_709().EncodedFromDisplay(tag, linear);
+  }
+};
+
+struct OpGamma {  // FastPowf(linear, inverse_gamma); inputs <= 1e-5 map to 0.
+  const float inverse_gamma;
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    const auto tiny = Le(linear, Set(d, 1e-5f));
+    return IfThenZeroElse(tiny, FastPowf(d, linear, Set(d, inverse_gamma)));
+  }
+};
+
+// In-place stage: runs `Op::Transform` on channels 0..2, one vector at a time.
+template <typename Op>
+class FromLinearStage : public RenderPipelineStage {
+ public:
+  explicit FromLinearStage(Op op)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        op_(std::move(op)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("FromLinear");
+    const HWY_FULL(float) d;
+    float* JXL_RESTRICT ch0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT ch1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT ch2 = GetInputRow(input_rows, 2, 0);
+    // The ops are lane-wise, but some may branch on values (e.g. NearestInt),
+    // so the part of the last vector past `xsize` is unpoisoned while we run.
+    const size_t tail = sizeof(float) * (RoundUpTo(xsize, Lanes(d)) - xsize);
+    msan::UnpoisonMemory(ch0 + xsize, tail);
+    msan::UnpoisonMemory(ch1 + xsize, tail);
+    msan::UnpoisonMemory(ch2 + xsize, tail);
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto v0 = LoadU(d, ch0 + x);
+      auto v1 = LoadU(d, ch1 + x);
+      auto v2 = LoadU(d, ch2 + x);
+      op_.Transform(d, &v0, &v1, &v2);
+      StoreU(v0, d, ch0 + x);
+      StoreU(v1, d, ch1 + x);
+      StoreU(v2, d, ch2 + x);
+    }
+    msan::PoisonMemory(ch0 + xsize, tail);
+    msan::PoisonMemory(ch1 + xsize, tail);
+    msan::PoisonMemory(ch2 + xsize, tail);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c >= 3) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  const char* GetName() const override { return "FromLinear"; }
+
+ private:
+  Op op_;
+};
+
+template <typename Op>  // Deduction helper; Op is taken from the argument.
+auto MakeFromLinearStage(Op&& op) -> std::unique_ptr<FromLinearStage<Op>> {
+  return jxl::make_unique<FromLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  // The op is chosen from the transfer function of the output encoding.
+  const auto& tf = output_encoding_info.color_encoding.tf;
+  if (tf.IsLinear()) return MakeFromLinearStage(MakePerChannelOp(OpLinear()));
+  if (tf.IsSRGB()) return MakeFromLinearStage(MakePerChannelOp(OpRgb()));
+  if (tf.IsPQ()) return MakeFromLinearStage(MakePerChannelOp(OpPq()));
+  if (tf.IsHLG()) {
+    // OpHlg transforms r, g and b together, so it needs no PerChannelOp.
+    return MakeFromLinearStage(
+        OpHlg(output_encoding_info.luminances,
+              output_encoding_info.desired_intensity_target));
+  }
+  if (tf.Is709()) return MakeFromLinearStage(MakePerChannelOp(Op709()));
+  if (tf.IsGamma() || tf.IsDCI()) {
+    // Gamma and DCI share OpGamma, driven by the precomputed inverse gamma.
+    return MakeFromLinearStage(
+        MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma}));
+  }
+  // Every supported transfer function was handled above; reaching this
+  // point is a programming error.
+  JXL_ABORT("Invalid target encoding");
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetFromLinearStage);
+// Forwards to the per-target implementation via Highway's dynamic dispatch.
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetFromLinearStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
new file mode 100644
index 0000000000..548ab50b8c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from linear to the specified output encoding.
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
new file mode 100644
index 0000000000..fc90acb476
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
@@ -0,0 +1,122 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_gaborish.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_gaborish.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+// Symmetric 3x3 convolution over each of the first three channels, using
+// per-channel weights derived from the LoopFilter and normalized to sum to 1.
+class GaborishStage : public RenderPipelineStage {
+ public:
+  explicit GaborishStage(const LoopFilter& lf)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/1)) {
+    // Stores {centre, edge, corner} weights for channel c, scaled so that
+    // centre + 4 * edge + 4 * corner == 1. The unscaled centre weight is 1.
+    const auto set_channel = [this](size_t c, float edge, float corner) {
+      const float div = 1 + 4 * (edge + corner);
+      const float mul = 1.0f / div;
+      weights_[3 * c + 0] = mul;
+      weights_[3 * c + 1] = edge * mul;
+      weights_[3 * c + 2] = corner * mul;
+    };
+    set_channel(0, lf.gab_x_weight1, lf.gab_x_weight2);
+    set_channel(1, lf.gab_y_weight1, lf.gab_y_weight2);
+    set_channel(2, lf.gab_b_weight1, lf.gab_b_weight2);
+  }
+
+  // Filters one row: every output pixel is the weighted sum of its 3x3
+  // neighbourhood, which needs one extra row above and below (border 1).
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Gaborish");
+
+    const HWY_FULL(float) d;
+// Group data need only be aligned to a block; for >=512 bit vectors, this may
+// result in unaligned loads.
+#if HWY_CAP_GE512
+#define LoadMaybeU LoadU
+#else
+#define LoadMaybeU Load
+#endif
+    // Since GetInputRow(input_rows, c, {-1, 0, 1}) is aligned, rounding
+    // xextra up to Lanes(d) doesn't access anything problematic.
+    const ssize_t x_begin = -RoundUpTo(xextra, Lanes(d));
+    const ssize_t x_end = static_cast<ssize_t>(xsize + xextra);
+    for (size_t c = 0; c < 3; c++) {
+      float* JXL_RESTRICT above = GetInputRow(input_rows, c, -1);
+      float* JXL_RESTRICT middle = GetInputRow(input_rows, c, 0);
+      float* JXL_RESTRICT below = GetInputRow(input_rows, c, 1);
+      float* JXL_RESTRICT out = GetOutputRow(output_rows, c, 0);
+      const auto w_centre = Set(d, weights_[3 * c + 0]);
+      const auto w_edge = Set(d, weights_[3 * c + 1]);
+      const auto w_corner = Set(d, weights_[3 * c + 2]);
+      for (ssize_t x = x_begin; x < x_end; x += Lanes(d)) {
+        const auto t = LoadMaybeU(d, above + x);
+        const auto tl = LoadU(d, above + x - 1);
+        const auto tr = LoadU(d, above + x + 1);
+        const auto m = LoadMaybeU(d, middle + x);
+        const auto l = LoadU(d, middle + x - 1);
+        const auto r = LoadU(d, middle + x + 1);
+        const auto b = LoadMaybeU(d, below + x);
+        const auto bl = LoadU(d, below + x - 1);
+        const auto br = LoadU(d, below + x + 1);
+        // Group the taps by weight class before the fused multiply-adds.
+        const auto edges = Add(Add(l, r), Add(t, b));
+        const auto corners = Add(Add(tl, tr), Add(bl, br));
+        const auto acc = MulAdd(edges, w_edge, Mul(m, w_centre));
+        Store(MulAdd(corners, w_corner, acc), d, out + x);
+      }
+    }
+  }
+#undef LoadMaybeU
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c >= 3) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInOut;
+  }
+
+  const char* GetName() const override { return "Gab"; }
+
+ private:
+  // Normalized {centre, edge, corner} weights; channel c uses 3c .. 3c + 2.
+  float weights_[9];
+};
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+  return jxl::make_unique<GaborishStage>(lf);  // weights are derived from lf
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetGaborishStage);
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+  JXL_ASSERT(lf.gab == 1);  // callers must only request this when gab is on
+  return HWY_DYNAMIC_DISPATCH(GetGaborishStage)(lf);  // per-target factory
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
new file mode 100644
index 0000000000..761800f668
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies decoder-side Gaborish with the given settings. `lf.gab` must be 1.
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
new file mode 100644
index 0000000000..187095cf61
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
@@ -0,0 +1,311 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_noise.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind<int32_t, D>;
+using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
+
+// Clamps each lane of `x` to [0, max_value]; `d` is unused.
+template <class D, class V>
+static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) {
+  // Upper bound first, then negative lanes are replaced by zero.
+  return ZeroIfNegative(Min(x, max_value));
+}
+
+// x is in [0+delta, 1+delta], delta ~= 0.06; the result is clamped to [0, 1].
+template <class StrengthEval>
+typename StrengthEval::V NoiseStrength(const StrengthEval& eval,
+                                       const typename StrengthEval::V x) {
+  return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f));
+}
+
+// Linear interpolation over NoiseParams::lut. TODO(veluca): SIMD-fy.
+class StrengthEvalLut {
+ public:
+  using V = Vec<D>;
+
+  explicit StrengthEvalLut(const NoiseParams& noise_params)
+#if HWY_TARGET == HWY_SCALAR
+      : noise_params_(noise_params)
+#endif
+  {
+#if HWY_TARGET != HWY_SCALAR
+    uint32_t lut[8];  // raw 32-bit patterns of noise_params.lut
+    memcpy(lut, noise_params.lut, sizeof(lut));
+    for (size_t i = 0; i < 8; i++) {
+      low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
+      low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
+      high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
+      high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
+    }
+#endif
+  }
+
+  V operator()(const V vx) const {
+    constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
+    auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale)));
+    auto floor_x = Floor(scaled_vx);
+    auto frac_x = Sub(scaled_vx, floor_x);  // interpolation weight
+    floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale),
+                         floor_x);  // past the end: clamp to the last segment
+    frac_x =
+        IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x);
+    auto floor_x_int = ConvertTo(DI(), floor_x);
+#if HWY_TARGET == HWY_SCALAR
+    auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
+    auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
+#else
+    // Set each lane's bytes to {0, 0, 2x+1, 2x}.
+    auto floorx_indices_low =
+        Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100));
+    // Set each lane's bytes to {2x+1, 2x, 0, 0}.
+    auto floorx_indices_hi =
+        Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000));
+    // Load both byte tables; the masks keep only the half each one provides.
+    auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
+    auto lowm = Set(DI(), 0xFFFF);
+    auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
+    auto him = Set(DI(), 0xFFFF0000);
+    // low = noise_params.lut[floor_x]
+    auto low =
+        BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+                        And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+    // hi = noise_params.lut[floor_x+1]
+    floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202));
+    floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000));
+    auto hi =
+        BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+                        And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+#endif
+    return MulAdd(Sub(hi, low), frac_x, low);
+  }
+
+ private:
+#if HWY_TARGET != HWY_SCALAR
+  // noise_params.lut split bytewise: low and high 16 bits of each entry.
+  HWY_ALIGN uint8_t high16_lut[16];
+  HWY_ALIGN uint8_t low16_lut[16];
+#else
+  const NoiseParams& noise_params_;  // not owned
+#endif
+};
+
+template <class D>
+void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r,
+                   const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor,
+                   const Vec<D> noise_strength_g, const Vec<D> noise_strength_r,
+                   float ytox, float ytob, float* JXL_RESTRICT out_x,
+                   float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
+  // Each channel mixes 1/128 of its own sample with 127/128 of the shared
+  // (correlated) sample, then scales by that channel's noise strength.
+  const auto w_own = Set(d, 0.0078125f);
+  const auto w_shared = Set(d, 0.9921875f);
+
+  const auto shared = Mul(w_shared, rnd_noise_cor);
+  const auto noise_r =
+      Mul(noise_strength_r, MulAdd(w_own, rnd_noise_r, shared));
+  const auto noise_g =
+      Mul(noise_strength_g, MulAdd(w_own, rnd_noise_g, shared));
+  const auto noise_sum = Add(noise_r, noise_g);
+  const auto noise_diff = Sub(noise_r, noise_g);
+
+  // All three rows are read before any of them is written back.
+  const auto x_in = LoadU(d, out_x);
+  const auto y_in = LoadU(d, out_y);
+  const auto b_in = LoadU(d, out_b);
+
+  // out_x += (r - g) + ytox * (r + g); out_y += r + g; out_b += ytob * (r + g).
+  StoreU(Add(MulAdd(Set(d, ytox), noise_sum, noise_diff), x_in), d, out_x);
+  StoreU(Add(y_in, noise_sum), d, out_y);
+  StoreU(MulAdd(Set(d, ytob), noise_sum, b_in), d, out_b);
+}
+
+// Adds noise in place to channels 0-2; channels first_c..first_c+2 supply the
+// random values, and the strength is derived from the current pixel values.
+class AddNoiseStage : public RenderPipelineStage {
+ public:
+  // Keeps references to `noise_params` and `cmap`; both must outlive the stage.
+  AddNoiseStage(const NoiseParams& noise_params,
+                const ColorCorrelationMap& cmap, size_t first_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/0)),
+        noise_params_(noise_params),
+        cmap_(cmap),
+        first_c_(first_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Noise apply");
+
+    // Leave the row untouched when NoiseParams::HasAny() is false.
+    if (!noise_params_.HasAny()) return;
+    const StrengthEvalLut strength_lut(noise_params_);
+    D d;
+    const auto one_half = Set(d, 0.5f);
+    // With the prior subtract-random Laplacian approximation, rnd_* ranges were
+    // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
+    // normalizer is half of what it was before (0.5).
+    const auto rnd_scale = Set(d, 0.22f);
+    const float ytox = cmap_.YtoXRatio(0);
+    const float ytob = cmap_.YtoBRatio(0);
+
+    float* JXL_RESTRICT px = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT py = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT pb = GetInputRow(input_rows, 2, 0);
+    const float* JXL_RESTRICT rnd_r = GetInputRow(input_rows, first_c_ + 0, 0);
+    const float* JXL_RESTRICT rnd_g = GetInputRow(input_rows, first_c_ + 1, 0);
+    const float* JXL_RESTRICT rnd_c = GetInputRow(input_rows, first_c_ + 2, 0);
+
+    // xsize rounded up to whole vectors; the loop touches lanes past xsize.
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    const size_t tail_bytes = (xsize_v - xsize) * sizeof(float);
+    // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
+    // shuffles are otherwise done on the data, so this is safe.
+    msan::UnpoisonMemory(px + xsize, tail_bytes);
+    msan::UnpoisonMemory(py + xsize, tail_bytes);
+    for (size_t x = 0; x < xsize_v; x += Lanes(d)) {
+      const auto in_x = LoadU(d, px + x);
+      const auto in_y = LoadU(d, py + x);
+      // Strength is looked up at (y - x) / 2 and (y + x) / 2 respectively.
+      const auto strength_g =
+          NoiseStrength(strength_lut, Mul(Sub(in_y, in_x), one_half));
+      const auto strength_r =
+          NoiseStrength(strength_lut, Mul(Add(in_y, in_x), one_half));
+      const auto noise_r = Mul(LoadU(d, rnd_r + x), rnd_scale);
+      const auto noise_g = Mul(LoadU(d, rnd_g + x), rnd_scale);
+      const auto noise_c = Mul(LoadU(d, rnd_c + x), rnd_scale);
+      AddNoiseToRGB(D(), noise_r, noise_g, noise_c, strength_g, strength_r,
+                    ytox, ytob, px + x, py + x, pb + x);
+    }
+    msan::PoisonMemory(px + xsize, tail_bytes);
+    msan::PoisonMemory(py + xsize, tail_bytes);
+    msan::PoisonMemory(pb + xsize, tail_bytes);
+  }
+
+  // Random channels are read-only inputs; channels 0-2 are modified in place.
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c >= first_c_) return RenderPipelineChannelMode::kInput;
+    if (c < 3) return RenderPipelineChannelMode::kInPlace;
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "AddNoise"; }
+
+ private:
+  const NoiseParams& noise_params_;
+  const ColorCorrelationMap& cmap_;
+  // Index of the first of the three random-noise channels.
+  size_t first_c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start) {  // The stage keeps both references alive-by-caller.
+  return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start);
+}
+
+class ConvolveNoiseStage : public RenderPipelineStage {
+ public:
+  explicit ConvolveNoiseStage(size_t first_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/2)),
+        first_c_(first_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Noise convolve");
+
+    const HWY_FULL(float) d;
+    // 4 * (1 - box kernel): 0.16 on the 24 neighbours, -3.84 on the center.
+    const auto w_rest = Set(d, 0.16f);
+    const auto w_center = Set(d, -3.84f);
+    const ssize_t x_begin = -static_cast<ssize_t>(RoundUpTo(xextra, Lanes(d)));
+    const ssize_t x_end = static_cast<ssize_t>(xsize + xextra);
+    for (size_t c = first_c_; c < first_c_ + 3; c++) {
+      float* JXL_RESTRICT rows[5];
+      for (size_t i = 0; i < 5; i++) rows[i] = GetInputRow(input_rows, c, i - 2);
+      float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+      for (ssize_t x = x_begin; x < x_end; x += Lanes(d)) {
+        // TODO(eustas): sum loaded values to reduce the calculation chain
+        auto others = Zero(d);
+        for (ssize_t dx = -2; dx <= 2; dx++) {
+          others = Add(others, LoadU(d, rows[0] + x + dx));
+          others = Add(others, LoadU(d, rows[1] + x + dx));
+          others = Add(others, LoadU(d, rows[3] + x + dx));
+          others = Add(others, LoadU(d, rows[4] + x + dx));
+        }
+        others = Add(others, LoadU(d, rows[2] + x - 2));
+        others = Add(others, LoadU(d, rows[2] + x - 1));
+        others = Add(others, LoadU(d, rows[2] + x + 1));
+        others = Add(others, LoadU(d, rows[2] + x + 2));
+        const auto center = LoadU(d, rows[2] + x);
+        StoreU(MulAdd(others, w_rest, Mul(center, w_center)), d, row_out + x);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < first_c_) return RenderPipelineChannelMode::kIgnored;
+    return RenderPipelineChannelMode::kInOut;
+  }
+
+  const char* GetName() const override { return "ConvNoise"; }
+
+ private:
+  size_t first_c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start) {  // First of the three channels to convolve.
+  return jxl::make_unique<ConvolveNoiseStage>(noise_c_start);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetAddNoiseStage);
+HWY_EXPORT(GetConvolveNoiseStage);
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start) {  // Public entry point: runtime target dispatch.
+  return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap,
+                                                noise_c_start);
+}
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start) {  // Public entry point: runtime target dispatch.
+  return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
new file mode 100644
index 0000000000..bd7797f991
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Adds noise to color channels.
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start);
+
+// Applies a 5x5 subtract-box-filter convolution to the noise input channels.
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
new file mode 100644
index 0000000000..527be03839
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_patches.h"
+
+namespace jxl {
+namespace {
+class PatchDictionaryStage : public RenderPipelineStage {
+ public:
+  PatchDictionaryStage(const PatchDictionary* patches, size_t num_channels)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        patches_(*patches),
+        num_channels_(num_channels) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("RenderPatches");
+    JXL_ASSERT(xpos == 0 || xpos >= xextra);
+    const size_t left = (xpos == 0) ? 0 : xextra;  // no left border at xpos 0
+    std::vector<float*> rows(num_channels_);
+    for (size_t c = 0; c < num_channels_; c++) {
+      rows[c] = GetInputRow(input_rows, c, 0) - left;
+    }
+    patches_.AddOneRow(rows.data(), ypos, xpos - left, xsize + xextra + left);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < num_channels_) return RenderPipelineChannelMode::kInPlace;
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Patches"; }
+
+ private:
+  const PatchDictionary& patches_;
+  const size_t num_channels_;
+};
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+    const PatchDictionary* patches, size_t num_channels) {  // must be non-null
+  return jxl::make_unique<PatchDictionaryStage>(patches, num_channels);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
new file mode 100644
index 0000000000..b35abdc2eb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+
+#include <utility>
+
+#include "lib/jxl/patch_dictionary_internal.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Draws patches if applicable.
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+    const PatchDictionary* patches, size_t num_channels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
new file mode 100644
index 0000000000..d97d97e5f2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_splines.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_splines.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class SplineStage : public RenderPipelineStage {
+ public:
+  explicit SplineStage(const Splines* splines)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        splines_(*splines) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("RenderSplines");
+    const Rect row_rect(xpos, ypos, xsize, 1);
+    splines_.AddToRow(GetInputRow(input_rows, 0, 0),
+                      GetInputRow(input_rows, 1, 0),
+                      GetInputRow(input_rows, 2, 0), row_rect);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < 3) return RenderPipelineChannelMode::kInPlace;
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Splines"; }
+
+ private:
+  const Splines& splines_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+  return jxl::make_unique<SplineStage>(splines);  // `splines` is dereferenced.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetSplineStage);
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+  return HWY_DYNAMIC_DISPATCH(GetSplineStage)(splines);  // per-target factory
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
new file mode 100644
index 0000000000..363af393ec
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Draws splines if applicable.
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
new file mode 100644
index 0000000000..d4f6152994
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_spot.h"
+
+namespace jxl {
+class SpotColorStage : public RenderPipelineStage {
+ public:
+  explicit SpotColorStage(size_t spot_c, const float* spot_color)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        spot_c_(spot_c),
+        spot_color_(spot_color) {
+    JXL_ASSERT(spot_c_ >= 3);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    // TODO(veluca): add SIMD.
+    PROFILER_ZONE("RenderSpotColors");
+    const float scale = spot_color_[3];
+    const float* JXL_RESTRICT spot = GetInputRow(input_rows, spot_c_, 0);
+    for (size_t c = 0; c < 3; c++) {
+      float* JXL_RESTRICT dst = GetInputRow(input_rows, c, 0);
+      for (ssize_t x = -ssize_t(xextra); x < ssize_t(xsize + xextra); x++) {
+        const float mix = scale * spot[x];
+        dst[x] = mix * spot_color_[c] + (1.0f - mix) * dst[x];
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < 3) return RenderPipelineChannelMode::kInPlace;
+    if (c == spot_c_) return RenderPipelineChannelMode::kInput;
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Spot"; }
+
+ private:
+  size_t spot_c_;
+  const float* spot_color_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(
+    size_t spot_c, const float* spot_color) {  // pointer kept; [0..3] are read
+  return jxl::make_unique<SpotColorStage>(spot_c, spot_color);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
new file mode 100644
index 0000000000..3e79c75823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Render the spot color channels.
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(size_t spot_c,
+                                                       const float* spot_color);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
new file mode 100644
index 0000000000..9f5b2b73dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
@@ -0,0 +1,202 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_to_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_to_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {  // Runs a one-value `Op` on r, g and b independently.
+  explicit PerChannelOp(Op op) : op(op) {}
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    for (T* channel : {r, g, b}) {
+      *channel = op.Transform(d, *channel);
+    }
+  }
+
+  Op op;
+};
+template <typename Op>
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {  // lvalue => Op is a reference
+  return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {  // Identity: the value is returned unchanged.
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return encoded;
+  }
+};
+
+struct OpRgb {  // Decodes with TF_SRGB (encoded -> display).
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_SRGB().DisplayFromEncoded(encoded);
+  }
+};
+
+struct OpPq {  // Decodes with TF_PQ (encoded -> display).
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_PQ().DisplayFromEncoded(d, encoded);
+  }
+};
+
+struct OpHlg {  // HLG decode per channel, then the OOTF on (r, g, b).
+  explicit OpHlg(const float luminances[3], const float intensity_target)
+      : hlg_ootf_(HlgOOTF::FromSceneLight(
+            /*display_luminance=*/intensity_target, luminances)) {}
+
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    for (T* val : {r, g, b}) {
+      HWY_ALIGN float vals[MaxLanes(d)];  // scratch for the scalar decode
+      Store(*val, d, vals);
+      for (size_t i = 0; i < Lanes(d); ++i) {
+        vals[i] = TF_HLG().DisplayFromEncoded(vals[i]);  // one lane at a time
+      }
+      *val = Load(d, vals);
+    }
+    hlg_ootf_.Apply(r, g, b);  // takes all three channels at once
+  }
+  HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {  // Decodes with TF_709 (encoded -> display).
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_709().DisplayFromEncoded(d, encoded);
+  }
+};
+
+struct OpGamma {  // FastPowf(encoded, gamma), forced to 0 for inputs <= 1e-5.
+  const float gamma;  // exponent applied to the encoded value
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return IfThenZeroElse(Le(encoded, Set(d, 1e-5f)),
+                          FastPowf(d, encoded, Set(d, gamma)));
+  }
+};
+
+struct OpInvalid {  // No-op used when no transfer function matches.
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {}
+};
+
+template <typename Op>
+class ToLinearStage : public RenderPipelineStage {
+ public:
+  explicit ToLinearStage(Op op)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        op_(std::move(op)) {}
+
+  explicit ToLinearStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings()), valid_(false) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("ToLinear");
+
+    const HWY_FULL(float) d;
+    float* JXL_RESTRICT row_r = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row_g = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0);
+    // Lane-wise math may still be value-dependent (e.g. NearestInt), so the
+    // tail of the last vector is unpoisoned for the duration of the loop.
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    const size_t tail_bytes = sizeof(float) * (xsize_v - xsize);
+    msan::UnpoisonMemory(row_r + xsize, tail_bytes);
+    msan::UnpoisonMemory(row_g + xsize, tail_bytes);
+    msan::UnpoisonMemory(row_b + xsize, tail_bytes);
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row_r + x);
+      auto g = LoadU(d, row_g + x);
+      auto b = LoadU(d, row_b + x);
+      op_.Transform(d, &r, &g, &b);
+      StoreU(r, d, row_r + x);
+      StoreU(g, d, row_g + x);
+      StoreU(b, d, row_b + x);
+    }
+    msan::PoisonMemory(row_r + xsize, tail_bytes);
+    msan::PoisonMemory(row_g + xsize, tail_bytes);
+    msan::PoisonMemory(row_b + xsize, tail_bytes);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < 3) return RenderPipelineChannelMode::kInPlace;
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ToLinear"; }
+
+ private:
+  Status IsInitialized() const override { return valid_; }
+
+  Op op_;
+  bool valid_ = true;
+};
+
+template <typename Op>
+std::unique_ptr<ToLinearStage<Op>> MakeToLinearStage(Op&& op) {  // pass rvalues
+  return jxl::make_unique<ToLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  // Pick the decoding op from the transfer function of the output encoding;
+  // the first matching predicate wins.
+  const auto& tf = output_encoding_info.color_encoding.tf;
+  if (tf.IsLinear()) return MakeToLinearStage(MakePerChannelOp(OpLinear()));
+  if (tf.IsSRGB()) return MakeToLinearStage(MakePerChannelOp(OpRgb()));
+  if (tf.IsPQ()) return MakeToLinearStage(MakePerChannelOp(OpPq()));
+  if (tf.IsHLG()) {
+    return MakeToLinearStage(OpHlg(output_encoding_info.luminances,
+                                   output_encoding_info.orig_intensity_target));
+  }
+  if (tf.Is709()) return MakeToLinearStage(MakePerChannelOp(Op709()));
+  if (tf.IsGamma() || tf.IsDCI()) {
+    // OpGamma takes the reciprocal of the stored inverse gamma as exponent.
+    const float gamma = 1.f / output_encoding_info.inverse_gamma;
+    return MakeToLinearStage(MakePerChannelOp(OpGamma{gamma}));
+  }
+  // No known transfer function: this stage reports IsInitialized() == false.
+  return jxl::make_unique<ToLinearStage<OpInvalid>>();
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToLinearStage);
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {  // runtime dispatch
+  return HWY_DYNAMIC_DISPATCH(GetToLinearStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
new file mode 100644
index 0000000000..ccee7b09f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from `output_encoding_info.color_encoding` to
+// linear.
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
new file mode 100644
index 0000000000..7609534a5b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
@@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_tone_mapping.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_tone_mapping.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class ToneMappingStage : public RenderPipelineStage {  // In-place, channels 0-2.
+ public:
+  explicit ToneMappingStage(OutputEncodingInfo output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        output_encoding_info_(std::move(output_encoding_info)) {
+    if (output_encoding_info_.desired_intensity_target ==
+        output_encoding_info_.orig_intensity_target) {
+      // No tone mapping requested; both mappers stay null (see IsNeeded()).
+      return;
+    }
+    if (output_encoding_info_.orig_color_encoding.tf.IsPQ() &&
+        output_encoding_info_.desired_intensity_target <
+            output_encoding_info_.orig_intensity_target) {
+      tone_mapper_ = jxl::make_unique<ToneMapper>(
+          /*source_range=*/std::pair<float, float>(
+              0, output_encoding_info_.orig_intensity_target),
+          /*target_range=*/
+          std::pair<float, float>(
+              0, output_encoding_info_.desired_intensity_target),
+          output_encoding_info_.luminances);
+    } else if (output_encoding_info_.orig_color_encoding.tf.IsHLG() &&
+               !output_encoding_info_.color_encoding.tf.IsHLG()) {
+      hlg_ootf_ = jxl::make_unique<HlgOOTF>(
+          /*source_luminance=*/output_encoding_info_.orig_intensity_target,
+          /*target_luminance=*/output_encoding_info_.desired_intensity_target,
+          output_encoding_info_.luminances);
+    }
+
+    if (output_encoding_info_.color_encoding.tf.IsPQ() &&
+        (tone_mapper_ || hlg_ootf_)) {
+      to_intensity_target_ =
+          10000.f / output_encoding_info_.orig_intensity_target;
+      from_desired_intensity_target_ =
+          output_encoding_info_.desired_intensity_target / 10000.f;
+    }
+  }
+
+  bool IsNeeded() const { return tone_mapper_ || hlg_ootf_; }  // Any mapper set?
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("ToneMapping");
+
+    if (!(tone_mapper_ || hlg_ootf_)) return;  // Nothing to apply.
+
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporarily unpoison the
+    // tail of the last vector; it is poisoned again after the loop.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      if (tone_mapper_ || hlg_ootf_) {  // Always true after the early return.
+        r = Mul(r, Set(d, to_intensity_target_));
+        g = Mul(g, Set(d, to_intensity_target_));
+        b = Mul(b, Set(d, to_intensity_target_));
+        if (tone_mapper_) {
+          tone_mapper_->ToneMap(&r, &g, &b);
+        } else {
+          JXL_ASSERT(hlg_ootf_);
+          hlg_ootf_->Apply(&r, &g, &b);
+        }
+        if (tone_mapper_ || hlg_ootf_->WarrantsGamutMapping()) {  // One is set.
+          GamutMap(&r, &g, &b, output_encoding_info_.luminances);
+        }
+        r = Mul(r, Set(d, from_desired_intensity_target_));
+        g = Mul(g, Set(d, from_desired_intensity_target_));
+        b = Mul(b, Set(d, from_desired_intensity_target_));
+      }
+      StoreU(r, d, row0 + x);
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ToneMapping"; }
+
+ private:
+  using ToneMapper = Rec2408ToneMapper<HWY_FULL(float)>;
+  OutputEncodingInfo output_encoding_info_;
+  std::unique_ptr<ToneMapper> tone_mapper_;  // Set for PQ input, darker target.
+  std::unique_ptr<HlgOOTF> hlg_ootf_;  // Set for HLG input, non-HLG output.
+  // When the target colorspace is PQ, 1 represents 10000 nits instead of
+  // orig_intensity_target. These factors rescale the samples before and after
+  // the mapping in that case; otherwise both stay at 1 (no rescaling).
+  float to_intensity_target_ = 1.f;
+  float from_desired_intensity_target_ = 1.f;
+};
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  auto stage = jxl::make_unique<ToneMappingStage>(output_encoding_info);
+  if (!stage->IsNeeded()) return nullptr;  // Neither mapper was created.
+  return stage;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToneMappingStage);
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetToneMappingStage)(output_encoding_info);
+}  // Runtime-dispatched entry point; may return nullptr (no stage needed).
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
new file mode 100644
index 0000000000..99824f8511
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Tone maps the image if appropriate. It must be in linear space and
+// `output_encoding_info.luminances` must contain the luminance for the
+// primaries of that space. It must also be encoded such that (1, 1, 1)
+// represents `output_encoding_info.orig_intensity_target` nits, unless
+// `output_encoding_info.color_encoding.tf.IsPQ()`, in which case (1, 1, 1) must
+// represent 10000 nits. This corresponds to what XYBStage outputs. After this
+// stage, (1, 1, 1) will represent
+// `output_encoding_info.desired_intensity_target` nits, except in the PQ
+// special case in which it remains 10000.
+//
+// Returns nullptr if no tone mapping is necessary.
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
new file mode 100644
index 0000000000..a75e259865
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
@@ -0,0 +1,187 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class UpsamplingStage : public RenderPipelineStage {  // Upsamples channel c_.
+ public:
+  explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c,
+                           size_t shift)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/shift, /*border=*/2)),
+        c_(c) {
+    const float* weights = shift == 1   ? ups_factors.upsampling2_weights
+                           : shift == 2 ? ups_factors.upsampling4_weights
+                                        : ups_factors.upsampling8_weights;
+    size_t N = 1 << (shift - 1);  // Half of the upsampling factor.
+    for (size_t i = 0; i < 5 * N; i++) {
+      for (size_t j = 0; j < 5 * N; j++) {
+        size_t y = std::min(i, j);  // Symmetric: only y <= x is stored.
+        size_t x = std::max(i, j);
+        kernel_[j / 5][i / 5][j % 5][i % 5] =
+            weights[5 * N * y - y * (y - 1) / 2 + x - y];  // Row y, column x.
+      }
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Upsampling");
+    static HWY_FULL(float) df;
+    size_t shift = settings_.shift_x;
+    size_t N = 1 << shift;  // Upsampling factor: 2, 4 or 8.
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(df));
+    for (ssize_t iy = -2; iy <= 2; iy++) {  // The five input rows that are read.
+      msan::UnpoisonMemory(GetInputRow(input_rows, c_, iy) + xsize + 2,
+                           sizeof(float) * (xsize_v - xsize));
+    }
+    JXL_ASSERT(xextra == 0);
+    ssize_t x0 = 0;
+    ssize_t x1 = xsize;
+    if (N == 2) {
+      ProcessRowImpl<2>(input_rows, output_rows, x0, x1);
+    }
+    if (N == 4) {
+      ProcessRowImpl<4>(input_rows, output_rows, x0, x1);
+    }
+    if (N == 8) {
+      ProcessRowImpl<8>(input_rows, output_rows, x0, x1);
+    }
+    for (size_t oy = 0; oy < N; oy++) {  // Poison what was written past xsize * N.
+      float* dst_row = GetOutputRow(output_rows, c_, oy);
+      msan::PoisonMemory(dst_row + xsize * N,
+                         sizeof(float) * (xsize_v - xsize) * N);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Upsample"; }
+
+ private:
+  template <size_t N>
+  JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const {
+    ix += 2;  // Map the tap offsets from [-2, 2] to [0, 4].
+    iy += 2;
+    if (N == 2) {  // Odd output positions use the mirrored taps.
+      return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix];
+    }
+    if (N == 4) {  // Positions 2..3 mirror positions 1..0.
+      return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2]
+                    [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy]
+                    [x % 4 < 2 ? ix : 4 - ix];
+    }
+    if (N == 8) {  // Positions 4..7 mirror positions 3..0.
+      return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4]
+                    [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy]
+                    [x % 8 < 4 ? ix : 4 - ix];
+    }
+    JXL_ABORT("Invalid upsample");
+  }
+
+  template <ssize_t N>
+  void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows,
+                      ssize_t x0, ssize_t x1) const {
+    static HWY_FULL(float) df;
+    using V = hwy::HWY_NAMESPACE::Vec<HWY_FULL(float)>;
+    V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7;
+    (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7;
+    V* ups[N];  // ups[ox] points at the vector for output column phase ox.
+    if (N >= 2) {
+      ups[0] = &ups0;
+      ups[1] = &ups1;
+    }
+    if (N >= 4) {
+      ups[2] = &ups2;
+      ups[3] = &ups3;
+    }
+    if (N == 8) {
+      ups[4] = &ups4;
+      ups[5] = &ups5;
+      ups[6] = &ups6;
+      ups[7] = &ups7;
+    }
+    for (size_t oy = 0; oy < N; oy++) {
+      float* dst_row = GetOutputRow(output_rows, c_, oy);
+      for (ssize_t x = x0; x < x1; x += Lanes(df)) {
+        for (size_t ox = 0; ox < N; ox++) {
+          auto result = Zero(df);
+          auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x);
+          auto max = min;
+          for (ssize_t iy = -2; iy <= 2; iy++) {
+            for (ssize_t ix = -2; ix <= 2; ix++) {
+              auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix);
+              result = MulAdd(Set(df, Kernel<N>(ox, oy, ix, iy)), v, result);
+              min = Min(v, min);
+              max = Max(v, max);
+            }
+          }
+          // Avoid overshooting: clamp to the range of the 5x5 input window.
+          *ups[ox] = Clamp(result, min, max);
+        }
+        if (N == 2) {
+          StoreInterleaved(df, ups0, ups1, dst_row + x * N);
+        }
+        if (N == 4) {
+          StoreInterleaved(df, ups0, ups1, ups2, ups3, dst_row + x * N);
+        }
+        if (N == 8) {
+          StoreInterleaved(df, ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7,
+                           dst_row + x * N);
+        }
+      }
+    }
+  }
+
+  size_t c_;  // Index of the channel this stage upsamples.
+  float kernel_[4][4][5][5];  // [y phase][x phase][iy + 2][ix + 2]; see Kernel().
+};
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift) {
+  return jxl::make_unique<UpsamplingStage>(ups_factors, c, shift);
+}  // Per-target factory; `shift` is not validated here.
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift) {
+  JXL_ASSERT(shift != 0);
+  JXL_ASSERT(shift <= 3);  // Only 2x, 4x and 8x are implemented.
+  return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
new file mode 100644
index 0000000000..7d5defd23c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Returns a stage upsampling channel `c` by `1 << shift`; `shift` is 1, 2 or 3.
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
new file mode 100644
index 0000000000..902fc33b7e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
@@ -0,0 +1,601 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_write.h"
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_write.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+
+class WriteToOutputStage : public RenderPipelineStage {
+ public:
+  WriteToOutputStage(const ImageOutput& main_output, size_t width,
+                     size_t height, bool has_alpha, bool unpremul_alpha,
+                     size_t alpha_c, Orientation undo_orientation,
+                     const std::vector<ImageOutput>& extra_output)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        width_(width),
+        height_(height),
+        main_(main_output),
+        num_color_(main_.num_channels_ < 3 ? 1 : 3),
+        want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4),
+        has_alpha_(has_alpha),
+        unpremul_alpha_(unpremul_alpha),
+        alpha_c_(alpha_c),
+        flip_x_(ShouldFlipX(undo_orientation)),
+        flip_y_(ShouldFlipY(undo_orientation)),
+        transpose_(ShouldTranspose(undo_orientation)),
+        opaque_alpha_(kMaxPixelsPerCall, 1.0f) {
+    for (size_t ec = 0; ec < extra_output.size(); ++ec) {
+      if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) {
+        Output extra(extra_output[ec]);  // Only extra channels with a sink.
+        extra.channel_index_ = 3 + ec;  // Pipeline channel read for this output.
+        extra_channels_.push_back(extra);
+      }
+    }
+  }
+
+  WriteToOutputStage(const WriteToOutputStage&) = delete;
+  WriteToOutputStage& operator=(const WriteToOutputStage&) = delete;
+  WriteToOutputStage(WriteToOutputStage&&) = delete;
+  WriteToOutputStage& operator=(WriteToOutputStage&&) = delete;
+
+  ~WriteToOutputStage() override {
+    if (main_.run_opaque_) {  // Non-null only if the callback's Init() ran.
+      main_.pixel_callback_.destroy(main_.run_opaque_);
+    }
+    for (auto& extra : extra_channels_) {
+      if (extra.run_opaque_) {
+        extra.pixel_callback_.destroy(extra.run_opaque_);
+      }
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    JXL_DASSERT(xextra == 0);
+    JXL_DASSERT(main_.run_opaque_ || main_.buffer_);
+    if (ypos >= height_) return;  // Row is outside the output.
+    if (xpos >= width_) return;
+    if (flip_y_) {
+      ypos = height_ - 1u - ypos;  // Mirrored row index.
+    }
+    size_t limit = std::min(xsize, width_ - xpos);  // Clip to the output width.
+    for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) {
+      size_t xstart = xpos + x0;
+      size_t len = std::min<size_t>(kMaxPixelsPerCall, limit - x0);
+
+      const float* line_buffers[4];  // Color rows, then alpha at [num_color_].
+      for (size_t c = 0; c < num_color_; c++) {
+        line_buffers[c] = GetInputRow(input_rows, c, 0) + x0;
+      }
+      if (has_alpha_) {
+        line_buffers[num_color_] = GetInputRow(input_rows, alpha_c_, 0) + x0;
+      } else {
+        // No alpha channel: use opaque_alpha_, a constant row of 1.0f values.
+        line_buffers[num_color_] = opaque_alpha_.data();
+      }
+      if (has_alpha_ && want_alpha_ && unpremul_alpha_) {
+        UnpremulAlpha(thread_id, len, line_buffers);
+      }
+      OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers);
+      for (const auto& extra : extra_channels_) {
+        line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0;
+        OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < num_color_ || (has_alpha_ && c == alpha_c_)) {
+      return RenderPipelineChannelMode::kInput;  // Color or alpha channel.
+    }
+    for (const auto& extra : extra_channels_) {
+      if (c == extra.channel_index_) {
+        return RenderPipelineChannelMode::kInput;  // Requested extra channel.
+      }
+    }
+    return RenderPipelineChannelMode::kIgnored;  // Not read by this stage.
+  }
+
+  const char* GetName() const override { return "WritePixelCB"; }
+
+ private:
+  struct Output {  // One output sink: either a pixel callback or a buffer.
+    Output(const ImageOutput& image_out)
+        : pixel_callback_(image_out.callback),
+          buffer_(image_out.buffer),
+          buffer_size_(image_out.buffer_size),
+          stride_(image_out.stride),
+          num_channels_(image_out.format.num_channels),
+          swap_endianness_(SwapEndianness(image_out.format.endianness)),
+          data_type_(image_out.format.data_type),
+          bits_per_sample_(image_out.bits_per_sample) {}
+
+    Status PrepareForThreads(size_t num_threads) {
+      if (pixel_callback_.IsPresent()) {
+        run_opaque_ =
+            pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall);
+        JXL_RETURN_IF_ERROR(run_opaque_ != nullptr);
+      } else {
+        JXL_RETURN_IF_ERROR(buffer_ != nullptr);  // Need a callback or a buffer.
+      }
+      return true;
+    }
+
+    PixelCallback pixel_callback_;
+    void* run_opaque_ = nullptr;  // Result of pixel_callback_.Init(), if any.
+    void* buffer_ = nullptr;
+    size_t buffer_size_;
+    size_t stride_;  // Byte distance between rows of buffer_ (see WriteToOutput).
+    size_t num_channels_;
+    bool swap_endianness_;
+    JxlDataType data_type_;
+    size_t bits_per_sample_;
+    size_t channel_index_;  // set only for extra_channels_; else uninitialized
+  };
+
+  Status PrepareForThreads(size_t num_threads) override {
+    JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads));
+    for (auto& extra : extra_channels_) {
+      JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads));
+    }
+    temp_out_.resize(num_threads);  // One interleaved output buffer per thread.
+    for (CacheAlignedUniquePtr& temp : temp_out_) {
+      temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall *
+                           main_.num_channels_);
+    }
+    if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) {
+      temp_in_.resize(num_threads * main_.num_channels_);  // Per thread, channel.
+      for (CacheAlignedUniquePtr& temp : temp_in_) {
+        temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall);
+      }
+    }
+    return true;
+  }
+  static bool ShouldFlipX(Orientation undo_orientation) {  // Value of flip_x_.
+    return (undo_orientation == Orientation::kFlipHorizontal ||
+            undo_orientation == Orientation::kRotate180 ||
+            undo_orientation == Orientation::kRotate270 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+  static bool ShouldFlipY(Orientation undo_orientation) {  // Value of flip_y_.
+    return (undo_orientation == Orientation::kFlipVertical ||
+            undo_orientation == Orientation::kRotate180 ||
+            undo_orientation == Orientation::kRotate90 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+  static bool ShouldTranspose(Orientation undo_orientation) {  // For transpose_.
+    return (undo_orientation == Orientation::kTranspose ||
+            undo_orientation == Orientation::kRotate90 ||
+            undo_orientation == Orientation::kRotate270 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+
+  void UnpremulAlpha(size_t thread_id, size_t len,
+                     const float** line_buffers) const {
+    const HWY_FULL(float) d;
+    auto one = Set(d, 1.0f);
+    float* temp_in[4];
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;  // This thread's row c.
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);  // Work on a copy.
+    }
+    auto small_alpha = Set(d, kSmallAlpha);
+    for (size_t ix = 0; ix < len; ix += Lanes(d)) {
+      auto alpha = LoadU(d, temp_in[num_color_] + ix);
+      auto mul = Div(one, Max(small_alpha, alpha));  // Divisor >= kSmallAlpha.
+      for (size_t c = 0; c < num_color_; ++c) {
+        auto val = LoadU(d, temp_in[c] + ix);
+        StoreU(Mul(val, mul), d, temp_in[c] + ix);
+      }
+    }
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];  // Callers continue with the copies.
+    }
+  }
+
+  void OutputBuffers(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, const float* input[4]) const {
+    if (flip_x_) {  // Mirrors the samples and updates xstart accordingly.
+      FlipX(out, thread_id, len, &xstart, input);
+    }
+    if (out.data_type_ == JXL_TYPE_UINT8) {
+      uint8_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint8_t*>(temp_out_[thread_id].get());
+      StoreUnsignedRow(out, input, len, temp);
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_UINT16 ||
+               out.data_type_ == JXL_TYPE_FLOAT16) {
+      uint16_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint16_t*>(temp_out_[thread_id].get());
+      if (out.data_type_ == JXL_TYPE_UINT16) {
+        StoreUnsignedRow(out, input, len, temp);
+      } else {
+        StoreFloat16Row(out, input, len, temp);
+      }
+      if (out.swap_endianness_) {  // Swap the two bytes of every 16-bit sample.
+        const HWY_FULL(uint16_t) du;
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; j += Lanes(du)) {
+          auto v = LoadU(du, temp + j);
+          auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+          StoreU(vswap, du, temp + j);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_FLOAT) {
+      float* JXL_RESTRICT temp =
+          reinterpret_cast<float*>(temp_out_[thread_id].get());
+      StoreFloatRow(out, input, len, temp);
+      if (out.swap_endianness_) {
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; ++j) {
+          temp[j] = BSwapFloat(temp[j]);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    }  // Any other data type: nothing is written.
+  }
+
+  void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart,
+             const float** line_buffers) const {
+    float* temp_in[4];
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;  // This thread's row c.
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      if (temp_in[c] != line_buffers[c]) {  // Skip if already in the temp row.
+        memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+      }
+    }
+    size_t last = (len - 1u);
+    size_t num = (len / 2);
+    for (size_t i = 0; i < num; ++i) {  // Reverse each row in place.
+      for (size_t c = 0; c < out.num_channels_; ++c) {
+        std::swap(temp_in[c][i], temp_in[c][last - i]);
+      }
+    }
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];
+    }
+    *xstart = width_ - *xstart - len;  // Start column of the mirrored span.
+  }
+
+  template <typename T>
+  void StoreUnsignedRow(const Output& out, const float* input[4], size_t len,
+                        T* output) const {
+    const HWY_FULL(float) d;
+    auto zero = Zero(d);
+    auto one = Set(d, 1.0f);
+    auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1);  // Largest code value.
+    const Rebind<T, decltype(d)> du;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;  // Lanes past len.
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        auto v3 = Mul(Clamp(zero, LoadU(d, &input[3][i]), one), mul);
+        StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)),
+                          DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,  // Output past len.
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloat16Row(const Output& out, const float* input[4], size_t len,
+                       uint16_t* output) const {
+    const HWY_FULL(float) d;
+    const Rebind<uint16_t, decltype(d)> du;
+    const Rebind<hwy::float16_t, decltype(d)> df16;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;  // Lanes past len.
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]);  // f16 bits.
+      }
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        auto v3 = LoadU(d, &input[3][i]);
+        StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)),
+                          BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,  // Output past len.
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloatRow(const Output& out, const float* input[4], size_t len,
+                     float* output) const {
+    const HWY_FULL(float) d;
+    if (out.num_channels_ == 1) {  // Single channel: nothing to interleave.
+      memcpy(output, input[0], len * sizeof(output[0]));
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d,
+                          &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), d, &output[3 * i]);
+      }
+    } else {  // Any other channel count is written as four channels.
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), LoadU(d, &input[3][i]), d,
+                          &output[4 * i]);
+      }
+    }
+  }
+
+  template <typename T>  // T is the sample type of the interleaved row.
+  void WriteToOutput(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, T* output) const {
+    if (transpose_) {  // One pixel per step, with x and y exchanged.
+      // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD.
+      if (out.run_opaque_) {  // Deliver through the pixel callback.
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i,
+                                  1, output + j);
+        }
+      } else {  // Copy straight into out.buffer_.
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);
+        const size_t offset = xstart * out.stride_ + ypos * pixel_stride;
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          const size_t ix = offset + i * out.stride_;
+          JXL_DASSERT(ix + pixel_stride <= out.buffer_size_);
+          memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + ix, output + j,
+                 pixel_stride);
+        }
+      }
+    } else {  // The len pixels are contiguous in the destination row.
+      if (out.run_opaque_) {
+        out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len,
+                                output);
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);  // Bytes.
+        const size_t offset = ypos * out.stride_ + xstart * pixel_stride;
+        JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_);
+        memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + offset, output,
+               len * pixel_stride);
+      }
+    }
+  }
+
+  static constexpr size_t kMaxPixelsPerCall = 1024;
+  size_t width_;
+  size_t height_;
+  Output main_;  // color + alpha
+  size_t num_color_;
+  bool want_alpha_;
+  bool has_alpha_;
+  bool unpremul_alpha_;
+  size_t alpha_c_;
+  bool flip_x_;
+  bool flip_y_;
+  bool transpose_;
+  std::vector<Output> extra_channels_;
+  std::vector<float> opaque_alpha_;
+  std::vector<CacheAlignedUniquePtr> temp_in_;
+  std::vector<CacheAlignedUniquePtr> temp_out_;
+};
+
+constexpr size_t WriteToOutputStage::kMaxPixelsPerCall;  // Out-of-class def.
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {  // Per-target factory.
+  return jxl::make_unique<WriteToOutputStage>(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+
+HWY_EXPORT(GetWriteToOutputStage);
+
+namespace {
+class WriteToImageBundleStage : public RenderPipelineStage {
+ public:  // Stage that copies pipeline rows into an ImageBundle.
+  explicit WriteToImageBundleStage(ImageBundle* image_bundle,
+                                   ColorEncoding color_encoding)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        image_bundle_(image_bundle),
+        color_encoding_(std::move(color_encoding)) {}
+
+  void SetInputSizes(  // (Re)allocates the color image and extra channels.
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);  // Three color channels required.
+    for (size_t c = 1; c < input_sizes.size(); c++) {
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    // TODO(eustas): what should we do in the case of "want only ECs"?
+    image_bundle_->SetFromImage(
+        Image3F(input_sizes[0].first, input_sizes[0].second), color_encoding_);
+    // TODO(veluca): consider not reallocating ECs if not needed.
+    image_bundle_->extra_channels().clear();
+    for (size_t c = 3; c < input_sizes.size(); c++) {  // One plane per extra.
+      image_bundle_->extra_channels().emplace_back(input_sizes[c].first,
+                                                   input_sizes[c].second);
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {  // Copies row ypos.
+    for (size_t c = 0; c < 3; c++) {
+      memcpy(image_bundle_->color()->PlaneRow(c, ypos) + xpos - xextra,
+             GetInputRow(input_rows, c, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));  // Borders included.
+    }
+    for (size_t ec = 0; ec < image_bundle_->extra_channels().size(); ec++) {
+      JXL_ASSERT(image_bundle_->extra_channels()[ec].xsize() >=
+                 xpos + xsize + xextra);
+      memcpy(image_bundle_->extra_channels()[ec].Row(ypos) + xpos - xextra,
+             GetInputRow(input_rows, 3 + ec, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInput;  // Same for every channel.
+  }
+
+  const char* GetName() const override { return "WriteIB"; }
+
+ private:
+  ImageBundle* image_bundle_;  // Destination; stored as a raw pointer.
+  ColorEncoding color_encoding_;  // Forwarded to SetFromImage().
+};
+
+class WriteToImage3FStage : public RenderPipelineStage {
+ public:  // Stage that copies the three color channels into an Image3F.
+  explicit WriteToImage3FStage(Image3F* image)
+      : RenderPipelineStage(RenderPipelineStage::Settings()), image_(image) {}
+
+  void SetInputSizes(  // Replaces *image_ with a newly sized Image3F.
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);
+    for (size_t c = 1; c < 3; ++c) {  // Only color channel sizes are checked.
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    *image_ = Image3F(input_sizes[0].first, input_sizes[0].second);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {  // Copies row ypos.
+    for (size_t c = 0; c < 3; c++) {
+      memcpy(image_->PlaneRow(c, ypos) + xpos - xextra,
+             GetInputRow(input_rows, c, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));  // Borders included.
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInput
+                 : RenderPipelineChannelMode::kIgnored;  // Extras skipped.
+  }
+
+  const char* GetName() const override { return "WriteI3F"; }
+
+ private:
+  Image3F* image_;  // Destination; stored as a raw pointer.
+};
+
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+    ImageBundle* image_bundle, ColorEncoding color_encoding) {  // Factory.
+  return jxl::make_unique<WriteToImageBundleStage>(image_bundle,
+                                                   std::move(color_encoding));
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image) {
+  return jxl::make_unique<WriteToImage3FStage>(image);  // Stage keeps `image`.
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {  // Public entry point.
+  return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);  // Arguments forwarded unchanged.
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
new file mode 100644
index 0000000000..c5f844ebe8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+
+#include <functional>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+    ImageBundle* image_bundle, ColorEncoding color_encoding);
+
+// Gets a stage to write color channels to an Image3F.
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image);
+
+// Gets a stage to write to a pixel callback or image buffer.
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
new file mode 100644
index 0000000000..15cfc75b18
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
@@ -0,0 +1,176 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_xyb.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class XYBStage : public RenderPipelineStage {
+ public:  // In-place stage: XYB -> RGB, or rescaled XYB for XYB output.
+  explicit XYBStage(const OutputEncodingInfo& output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        opsin_params_(output_encoding_info.opsin_params),
+        output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() ==
+                       ColorSpace::kXYB) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("UndoXYB");
+
+    const HWY_FULL(float) d;
+    JXL_ASSERT(xextra == 0);  // No horizontal border is expected here.
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));  // Whole vectors.
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporarily unpoison last
+    // vector tail.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    // TODO(eustas): when using frame origin, addresses might be unaligned;
+    //               making them aligned will avoid performance penalty.
+    if (output_is_xyb_) {  // Stay in XYB: add offset, then scale.
+      const auto scale_x = Set(d, kScaledXYBScale[0]);
+      const auto scale_y = Set(d, kScaledXYBScale[1]);
+      const auto scale_bmy = Set(d, kScaledXYBScale[2]);
+      const auto offset_x = Set(d, kScaledXYBOffset[0]);
+      const auto offset_y = Set(d, kScaledXYBOffset[1]);
+      const auto offset_bmy = Set(d, kScaledXYBOffset[2]);
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_x = LoadU(d, row0 + x);
+        const auto in_y = LoadU(d, row1 + x);
+        const auto in_b = LoadU(d, row2 + x);
+        auto out_x = Mul(Add(in_x, offset_x), scale_x);
+        auto out_y = Mul(Add(in_y, offset_y), scale_y);
+        auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy);  // B-Y.
+        StoreU(out_x, d, row0 + x);
+        StoreU(out_y, d, row1 + x);
+        StoreU(out_b, d, row2 + x);
+      }
+    } else {  // Convert to RGB with XybToRgb() and opsin_params_.
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_opsin_x = LoadU(d, row0 + x);
+        const auto in_opsin_y = LoadU(d, row1 + x);
+        const auto in_opsin_b = LoadU(d, row2 + x);
+        auto r = Undefined(d);  // Filled in by XybToRgb() below.
+        auto g = Undefined(d);
+        auto b = Undefined(d);
+        XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g,
+                 &b);
+        StoreU(r, d, row0 + x);
+        StoreU(g, d, row1 + x);
+        StoreU(b, d, row2 + x);
+      }
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;  // Extras untouched.
+  }
+
+  const char* GetName() const override { return "XYB"; }
+
+ private:
+  const OpsinParams opsin_params_;  // Copied from the output encoding info.
+  const bool output_is_xyb_;  // True when the output color space is kXYB.
+};
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info) {  // Per-target factory.
+  return jxl::make_unique<XYBStage>(output_encoding_info);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetXYBStage);
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info) {  // Public entry point.
+  return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info);
+}
+
+namespace {
+class FastXYBStage : public RenderPipelineStage {
+ public:  // Passes XYB(+alpha) rows to FastXYBTosRGB8(), writing into rgb_.
+  FastXYBStage(uint8_t* rgb, size_t stride, size_t width, size_t height,
+               bool rgba, bool has_alpha, size_t alpha_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        rgb_(rgb),
+        stride_(stride),
+        width_(width),
+        height_(height),
+        rgba_(rgba),
+        has_alpha_(has_alpha),
+        alpha_c_(alpha_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    if (ypos >= height_) return;  // Row lies outside the output.
+    JXL_ASSERT(xextra == 0);  // No horizontal border is expected here.
+    const float* xyba[4] = {
+        GetInputRow(input_rows, 0, 0), GetInputRow(input_rows, 1, 0),
+        GetInputRow(input_rows, 2, 0),
+        has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr};
+    uint8_t* out_buf = rgb_ + stride_ * ypos + (rgba_ ? 4 : 3) * xpos;
+    FastXYBTosRGB8(xyba, out_buf, rgba_,
+                   xsize + xpos <= width_ ? xsize : width_ - xpos);  // Clipped.
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 || (has_alpha_ && c == alpha_c_)
+               ? RenderPipelineChannelMode::kInput
+               : RenderPipelineChannelMode::kIgnored;  // Other extras skipped.
+  }
+
+  const char* GetName() const override { return "FastXYB"; }
+
+ private:
+  uint8_t* rgb_;  // Output buffer; stored as a raw pointer.
+  size_t stride_;  // Multiplied by ypos to locate the output row.
+  size_t width_;  // Pixel count limit per row.
+  size_t height_;  // Rows at or beyond this are skipped.
+  bool rgba_;  // 4 bytes per output pixel when true, otherwise 3.
+  bool has_alpha_;
+  size_t alpha_c_;  // Channel index read as alpha when has_alpha_ is set.
+  std::vector<float> opaque_alpha_;  // NOTE(review): unused in this class.
+};
+
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+    uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+    bool has_alpha, size_t alpha_c) {
+  JXL_ASSERT(HasFastXYBTosRGB8());  // Callers must check availability first.
+  return make_unique<FastXYBStage>(rgb, stride, width, height, rgba, has_alpha,
+                                   alpha_c);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
new file mode 100644
index 0000000000..7b06345c36
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#include <stdint.h>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from XYB to linear with appropriate primaries.
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+// Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and
+// write to a uint8 buffer.
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+    uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+    bool has_alpha, size_t alpha_c);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
new file mode 100644
index 0000000000..5cba4a7d41
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
@@ -0,0 +1,85 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_ycbcr.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_ycbcr.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class kYCbCrStage : public RenderPipelineStage {
+ public:  // In-place YCbCr -> RGB conversion of the three color channels.
+  kYCbCrStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("UndoYCbCr");
+
+    const HWY_FULL(float) df;
+
+    // Full-range BT.601 as defined by JFIF Clause 7:
+    // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+    const auto c128 = Set(df, 128.0f / 255);  // Added to Y below.
+    const auto crcr = Set(df, 1.402f);  // Cr weight for R.
+    const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);  // Cb weight for G.
+    const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);  // Cr weight for G.
+    const auto cbcb = Set(df, 1.772f);  // Cb weight for B.
+
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);  // Cb in, R out.
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);  // Y in, G out.
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);  // Cr in, B out.
+    // TODO(eustas): when using frame origin, addresses might be unaligned;
+    //               making them aligned will avoid performance penalty.
+    for (size_t x = 0; x < xsize; x += Lanes(df)) {  // Full vectors each step.
+      const auto y_vec = Add(LoadU(df, row1 + x), c128);
+      const auto cb_vec = LoadU(df, row0 + x);
+      const auto cr_vec = LoadU(df, row2 + x);
+      const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
+      const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
+      const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
+      StoreU(r_vec, df, row0 + x);  // All loads are done before any store.
+      StoreU(g_vec, df, row1 + x);
+      StoreU(b_vec, df, row2 + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;  // Extras untouched.
+  }
+
+  const char* GetName() const override { return "YCbCr"; }
+};
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+  return jxl::make_unique<kYCbCrStage>();  // Per-target factory.
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetYCbCrStage);
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+  return HWY_DYNAMIC_DISPATCH(GetYCbCrStage)();  // Public entry point.
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
new file mode 100644
index 0000000000..9320c9723f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from YCbCr to RGB.
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage();
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
new file mode 100644
index 0000000000..789a52f8b2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+class UpsampleXSlowStage : public RenderPipelineStage {
+ public:  // Test stage: writes two output samples per input sample in x.
+  UpsampleXSlowStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(1, 1)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      const float* row = GetInputRow(input_rows, c, 0);
+      float* row_out = GetOutputRow(output_rows, c, 0);
+      for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) {
+        float xp = *(row + x - 1);  // Left neighbor.
+        float xc = *(row + x);
+        float xn = *(row + x + 1);  // Right neighbor.
+        float xout0 = xp * 0.25f + xc * 0.75f;
+        float xout1 = xc * 0.75f + xn * 0.25f;
+        *(row_out + 2 * x + 0) = xout0;  // Even output sample.
+        *(row_out + 2 * x + 1) = xout1;  // Odd output sample.
+      }
+    }
+  }
+
+  const char* GetName() const override { return "TEST::UpsampleXSlowStage"; }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInOut;  // Same for every channel.
+  }
+};
+
+class UpsampleYSlowStage : public RenderPipelineStage {
+ public:  // Test stage: writes two output rows per input row.
+  UpsampleYSlowStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(1, 1)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      const float* rowp = GetInputRow(input_rows, c, -1);  // Row offset -1.
+      const float* rowc = GetInputRow(input_rows, c, 0);
+      const float* rown = GetInputRow(input_rows, c, 1);  // Row offset +1.
+      float* row_out0 = GetOutputRow(output_rows, c, 0);
+      float* row_out1 = GetOutputRow(output_rows, c, 1);
+      for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) {
+        float xp = *(rowp + x);
+        float xc = *(rowc + x);
+        float xn = *(rown + x);
+        float yout0 = xp * 0.25f + xc * 0.75f;
+        float yout1 = xc * 0.75f + xn * 0.25f;
+        *(row_out0 + x) = yout0;  // First output row.
+        *(row_out1 + x) = yout1;  // Second output row.
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInOut;  // Same for every channel.
+  }
+
+  const char* GetName() const override { return "TEST::UpsampleYSlowStage"; }
+};
+
+class Check0FinalStage : public RenderPipelineStage {
+ public:  // Test stage: checks that every input sample is (nearly) zero.
+  Check0FinalStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      for (size_t x = 0; x < xsize; x++) {  // Only [0, xsize) is examined.
+        JXL_CHECK(fabsf(GetInputRow(input_rows, c, 0)[x]) < 1e-8);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInput;  // Same for every channel.
+  }
+  const char* GetName() const override { return "TEST::Check0FinalStage"; }
+};
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc b/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc
new file mode 100644
index 0000000000..f1529b500c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc
@@ -0,0 +1,839 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/codestream_header.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/types.h>
+
+#include <cmath>  // std::abs
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_comparator.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace {
+
+// Converts a test image (raw interleaved sample bytes) to a CodecInOut.
+// icc_profile can be empty to automatically deduce profile from the pixel
+// format, or filled in to force this ICC profile.
+jxl::CodecInOut ConvertTestImage(const std::vector<uint8_t>& buf,
+                                 const size_t xsize, const size_t ysize,
+                                 const JxlPixelFormat& pixel_format,
+                                 const jxl::PaddedBytes& icc_profile) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+
+  bool is_gray = pixel_format.num_channels < 3;  // Gray or gray + alpha.
+  bool has_alpha =  // True for 2 or 4 channels.
+      pixel_format.num_channels == 2 || pixel_format.num_channels == 4;
+
+  io.metadata.m.color_encoding.SetColorSpace(is_gray ? jxl::ColorSpace::kGray
+                                                     : jxl::ColorSpace::kRGB);
+  if (has_alpha) {
+    // Note: alpha > 16 not yet supported by the C++ codec
+    switch (pixel_format.data_type) {
+      case JXL_TYPE_UINT8:
+        io.metadata.m.SetAlphaBits(8);
+        break;
+      case JXL_TYPE_UINT16:
+      case JXL_TYPE_FLOAT:
+      case JXL_TYPE_FLOAT16:
+        io.metadata.m.SetAlphaBits(16);  // All non-uint8 types get 16 bits.
+        break;
+      default:
+        ADD_FAILURE() << "Roundtrip tests for data type "
+                      << pixel_format.data_type << " not yet implemented.";
+    }
+  }
+  size_t bitdepth = 0;  // Stays 0 if the data type is not handled below.
+  switch (pixel_format.data_type) {
+    case JXL_TYPE_FLOAT:
+      bitdepth = 32;
+      io.metadata.m.SetFloat32Samples();
+      break;
+    case JXL_TYPE_FLOAT16:
+      bitdepth = 16;
+      io.metadata.m.SetFloat16Samples();
+      break;
+    case JXL_TYPE_UINT8:
+      bitdepth = 8;
+      io.metadata.m.SetUintSamples(8);
+      break;
+    case JXL_TYPE_UINT16:
+      bitdepth = 16;
+      io.metadata.m.SetUintSamples(16);
+      break;
+    default:
+      ADD_FAILURE() << "Roundtrip tests for data type "
+                    << pixel_format.data_type << " not yet implemented.";
+  }
+  jxl::ColorEncoding color_encoding;
+  if (!icc_profile.empty()) {  // Caller-supplied ICC profile wins.
+    jxl::PaddedBytes icc_profile_copy(icc_profile);
+    EXPECT_TRUE(color_encoding.SetICC(std::move(icc_profile_copy)));
+  } else if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+    color_encoding = jxl::ColorEncoding::LinearSRGB(is_gray);
+  } else {  // Every other data type is treated as sRGB.
+    color_encoding = jxl::ColorEncoding::SRGB(is_gray);
+  }
+  EXPECT_TRUE(
+      ConvertFromExternal(jxl::Span<const uint8_t>(buf.data(), buf.size()),
+                          xsize, ysize, color_encoding,
+                          /*bits_per_sample=*/bitdepth, pixel_format,
+                          /*pool=*/nullptr, &io.Main()));
+  return io;
+}
+
+template <typename T>
+T ConvertTestPixel(float val);  // Only the specializations below are defined.
+
+template <>
+float ConvertTestPixel<float>(const float val) {
+  return val;  // Floats pass through unchanged.
+}
+
+template <>
+uint16_t ConvertTestPixel<uint16_t>(const float val) {
+  return (uint16_t)(val * UINT16_MAX);  // Scales, then truncates.
+}
+
+template <>
+uint8_t ConvertTestPixel<uint8_t>(const float val) {
+  return (uint8_t)(val * UINT8_MAX);  // Scales, then truncates.
+}
+
+// Returns a synthetic test image as the raw bytes of interleaved T samples.
+template <typename T>
+std::vector<uint8_t> GetTestImage(const size_t xsize, const size_t ysize,
+                                  const JxlPixelFormat& pixel_format) {
+  std::vector<T> pixels(xsize * ysize * pixel_format.num_channels);
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      for (size_t chan = 0; chan < pixel_format.num_channels; chan++) {
+        float val;  // Always assigned: chan % 4 is in 0..3.
+        switch (chan % 4) {
+          case 0:  // Ramp along y.
+            val = static_cast<float>(y) / static_cast<float>(ysize);
+            break;
+          case 1:  // Ramp along x.
+            val = static_cast<float>(x) / static_cast<float>(xsize);
+            break;
+          case 2:  // Ramp along x + y.
+            val = static_cast<float>(x + y) / static_cast<float>(xsize + ysize);
+            break;
+          case 3:  // Product of x and y.
+            val = static_cast<float>(x * y) / static_cast<float>(xsize * ysize);
+            break;
+        }
+        pixels[(y * xsize + x) * pixel_format.num_channels + chan] =
+            ConvertTestPixel<T>(val);
+      }
+    }
+  }
+  std::vector<uint8_t> bytes(pixels.size() * sizeof(T));  // Raw byte copy.
+  memcpy(bytes.data(), pixels.data(), sizeof(T) * pixels.size());
+  return bytes;
+}
+
+void EncodeWithEncoder(JxlEncoder* enc, std::vector<uint8_t>* compressed) {
+  compressed->resize(64);  // Starting size; doubled each time it fills up.
+  uint8_t* next_out = compressed->data();
+  size_t avail_out = compressed->size() - (next_out - compressed->data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed->data();  // Bytes written so far.
+      compressed->resize(compressed->size() * 2);
+      next_out = compressed->data() + offset;  // resize() may reallocate.
+      avail_out = compressed->size() - offset;
+    }
+  }
+  compressed->resize(next_out - compressed->data());  // Trim to bytes written.
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+}
+
+// Encodes a generated test image (plus optional extra channels) through the
+// C API, decodes it again and checks the result: identical color pixels when
+// lossless && !already_downsampled, else Butteraugli distance <= 2.0.
+// TODO(firsching): change this to be a parameterized test, like in
+// decode_test.cc
+template <typename T>
+void VerifyRoundtripCompression(
+    const size_t xsize, const size_t ysize,
+    const JxlPixelFormat& input_pixel_format,
+    const JxlPixelFormat& output_pixel_format, const bool lossless,
+    const bool use_container, const uint32_t resampling = 1,
+    const bool already_downsampled = false,
+    const std::vector<std::pair<JxlExtraChannelType, std::string>>&
+        extra_channels = {}) {
+  size_t orig_xsize = xsize;
+  size_t orig_ysize = ysize;
+  if (already_downsampled) {
+    orig_xsize = jxl::DivCeil(xsize, resampling);
+    orig_ysize = jxl::DivCeil(ysize, resampling);
+  }
+
+  JxlPixelFormat extra_channel_pixel_format = input_pixel_format;
+  extra_channel_pixel_format.num_channels = 1;
+  const std::vector<uint8_t> extra_channel_bytes =
+      GetTestImage<T>(xsize, ysize, extra_channel_pixel_format);
+  const std::vector<uint8_t> original_bytes =
+      GetTestImage<T>(orig_xsize, orig_ysize, input_pixel_format);
+  jxl::CodecInOut original_io = ConvertTestImage(
+      original_bytes, orig_xsize, orig_ysize, input_pixel_format, {});
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, use_container));
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &input_pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = lossless;
+  uint32_t num_channels = input_pixel_format.num_channels;
+  size_t has_interleaved_alpha = num_channels == 2 || num_channels == 4;
+  JxlPixelFormat output_pixel_format_with_extra_channel_alpha =
+      output_pixel_format;
+
+  // In the case where we have an alpha channel, but it is provided as an extra
+  // channel and not interleaved, we do two things here:
+  // 1. modify the original_io to have the correct alpha channel
+  // 2. change the output_format_with_extra_alpha to have an alpha channel
+  bool alpha_in_extra_channels_vector = false;
+  for (const auto& extra_channel : extra_channels) {
+    if (extra_channel.first == JXL_CHANNEL_ALPHA) {
+      alpha_in_extra_channels_vector = true;
+    }
+  }
+  if (alpha_in_extra_channels_vector && !has_interleaved_alpha) {
+    jxl::ImageF alpha_channel(xsize, ysize);
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(extra_channel_bytes.data(),
+                                 extra_channel_bytes.size()),
+        xsize, ysize, basic_info.bits_per_sample, extra_channel_pixel_format, 0,
+        /*pool=*/nullptr, &alpha_channel));
+
+    original_io.metadata.m.SetAlphaBits(basic_info.bits_per_sample);
+    original_io.Main().SetAlpha(std::move(alpha_channel));
+    output_pixel_format_with_extra_channel_alpha.num_channels++;
+  }
+  // Those are the num_extra_channels including a potential alpha channel.
+  basic_info.num_extra_channels = extra_channels.size() + has_interleaved_alpha;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  EXPECT_EQ(enc->metadata.m.num_extra_channels,
+            extra_channels.size() + has_interleaved_alpha);
+  JxlColorEncoding color_encoding;
+  if (input_pixel_format.data_type == JXL_TYPE_FLOAT) {
+    JxlColorEncodingSetToLinearSRGB(
+        &color_encoding,
+        /*is_gray=*/input_pixel_format.num_channels < 3);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/input_pixel_format.num_channels < 3);
+  }
+
+  std::vector<JxlExtraChannelInfo> channel_infos;
+  for (const auto& extra_channel : extra_channels) {
+    auto channel_type = extra_channel.first;
+    JxlExtraChannelInfo channel_info;
+    JxlEncoderInitExtraChannelInfo(channel_type, &channel_info);
+    channel_info.bits_per_sample = (lossless ? basic_info.bits_per_sample : 8);
+    channel_info.exponent_bits_per_sample =
+        (lossless ? basic_info.exponent_bits_per_sample : 0);
+    channel_infos.push_back(channel_info);
+  }
+  for (size_t index = 0; index < channel_infos.size(); index++) {
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetExtraChannelInfo(enc, index + has_interleaved_alpha,
+                                            &channel_infos[index]));
+    std::string name = extra_channels[index].second;
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetExtraChannelName(enc, index + has_interleaved_alpha,
+                                            name.c_str(), name.length()));
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, nullptr);
+  JxlEncoderSetFrameLossless(frame_settings, lossless);
+  if (resampling > 1) {
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_RESAMPLING, resampling));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED,
+                  already_downsampled));
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &input_pixel_format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  EXPECT_EQ(frame_settings->enc->input_queue.back()
+                .frame->frame.extra_channels()
+                .size(),
+            has_interleaved_alpha + extra_channels.size());
+  EXPECT_EQ(frame_settings->enc->input_queue.empty(), false);
+  for (size_t index = 0; index < channel_infos.size(); index++) {
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetExtraChannelBuffer(
+                  frame_settings, &extra_channel_pixel_format,
+                  (void*)extra_channel_bytes.data(), extra_channel_bytes.size(),
+                  index + has_interleaved_alpha));
+  }
+  JxlEncoderCloseInput(enc);
+  EXPECT_EQ(frame_settings->enc->input_queue.back()
+                .frame->frame.extra_channels()
+                .size(),
+            has_interleaved_alpha + extra_channels.size());
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderImageOutBufferSize(
+          dec, &output_pixel_format_with_extra_channel_alpha, &buffer_size));
+  // Comparable only for a shared in/out format with no alpha channel appended.
+  if (&input_pixel_format == &output_pixel_format && !already_downsampled &&
+      !(alpha_in_extra_channels_vector && !has_interleaved_alpha)) {
+    EXPECT_EQ(buffer_size, original_bytes.size());
+  }
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  EXPECT_EQ(extra_channels.size() + has_interleaved_alpha,
+            info.num_extra_channels);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t icc_profile_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(
+                dec, &output_pixel_format_with_extra_channel_alpha,
+                JXL_COLOR_PROFILE_TARGET_DATA, &icc_profile_size));
+  jxl::PaddedBytes icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(
+                dec, &output_pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
+                icc_profile.data(), icc_profile.size()));
+
+  std::vector<uint8_t> decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(
+                dec, &output_pixel_format_with_extra_channel_alpha,
+                decoded_bytes.data(), decoded_bytes.size()));
+  std::vector<std::vector<uint8_t>> extra_channel_decoded_bytes(
+      info.num_extra_channels - has_interleaved_alpha);
+
+  for (size_t index = has_interleaved_alpha; index < info.num_extra_channels;
+       index++) {
+    JxlExtraChannelInfo channel_info;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetExtraChannelInfo(dec, index, &channel_info));
+    EXPECT_EQ(channel_info.type,
+              extra_channels[index - has_interleaved_alpha].first);
+    std::string input_name =
+        extra_channels[index - has_interleaved_alpha].second;
+    const size_t name_length = channel_info.name_length;
+    EXPECT_EQ(input_name.size(), name_length);
+    std::vector<char> output_name(name_length + 1);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetExtraChannelName(dec, index, output_name.data(),
+                                            output_name.size()));
+    EXPECT_EQ(0,
+              memcmp(input_name.data(), output_name.data(), input_name.size()));
+    size_t extra_buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderExtraChannelBufferSize(dec, &output_pixel_format,
+                                               &extra_buffer_size, index));
+    std::vector<uint8_t> extra_decoded_bytes(extra_buffer_size);
+    extra_channel_decoded_bytes[index - has_interleaved_alpha] =
+        std::move(extra_decoded_bytes);
+    EXPECT_EQ(
+        JXL_DEC_SUCCESS,
+        JxlDecoderSetExtraChannelBuffer(
+            dec, &output_pixel_format,
+            extra_channel_decoded_bytes[index - has_interleaved_alpha].data(),
+            extra_channel_decoded_bytes[index - has_interleaved_alpha].size(),
+            index));
+  }
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  // Check if there are no further errors after getting the full image, e.g.
+  // check that the final codestream box is actually marked as last.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+
+  jxl::CodecInOut decoded_io = ConvertTestImage(
+      decoded_bytes, xsize, ysize, output_pixel_format_with_extra_channel_alpha,
+      icc_profile);
+
+  if (already_downsampled) {
+    jxl::Image3F* color = decoded_io.Main().color();
+    jxl::DownsampleImage(color, resampling);
+    if (decoded_io.Main().HasAlpha()) {
+      jxl::ImageF* alpha = decoded_io.Main().alpha();
+      jxl::DownsampleImage(alpha, resampling);
+    }
+    decoded_io.SetSize(color->xsize(), color->ysize());
+  }
+
+  if (lossless && !already_downsampled) {
+    JXL_EXPECT_OK(jxl::SamePixels(*original_io.Main().color(),
+                                  *decoded_io.Main().color(), _));
+  } else {
+    jxl::ButteraugliParams ba;
+    float butteraugli_score = ButteraugliDistance(
+        original_io.frames, decoded_io.frames, ba, jxl::GetJxlCms(),
+        /*distmap=*/nullptr, nullptr);
+    EXPECT_LE(butteraugli_score, 2.0f);
+  }
+  JxlPixelFormat extra_channel_output_pixel_format = output_pixel_format;
+  extra_channel_output_pixel_format.num_channels = 1;
+  for (auto& extra_channel : extra_channel_decoded_bytes) {
+    EXPECT_EQ(extra_channel.size(), extra_channel_bytes.size());
+    if (lossless) {
+      EXPECT_EQ(jxl::test::ComparePixels(extra_channel.data(),
+                                         extra_channel_bytes.data(), xsize,
+                                         ysize, extra_channel_pixel_format,
+                                         extra_channel_output_pixel_format),
+                0u);
+      EXPECT_EQ(extra_channel, extra_channel_bytes);
+    }
+  }
+}
+
+}  // namespace
+
+TEST(RoundtripTest, FloatFrameRoundtripTest) {
+  using Extras = std::vector<std::pair<JxlExtraChannelType, std::string>>;
+  const std::vector<Extras> cases = {
+      {},
+      {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}},
+      {{JXL_CHANNEL_CFA, "my cfa channel"}},
+      {{JXL_CHANNEL_DEPTH, "depth"},
+       {JXL_CHANNEL_SELECTION_MASK, "mask"},
+       {JXL_CHANNEL_BLACK, "black"},
+       {JXL_CHANNEL_CFA, "my cfa channel"},
+       {JXL_CHANNEL_OPTIONAL, "optional channel"}},
+      {{JXL_CHANNEL_DEPTH, "very deep"}}};
+  for (const bool use_container : {false, true}) {
+    for (const bool lossless : {false, true}) {
+      for (uint32_t num_channels = 1; num_channels <= 4; ++num_channels) {
+        for (const Extras& extra_channels : cases) {
+          // Even channel counts (2 and 4) count as having an alpha channel.
+          const bool has_alpha = (num_channels % 2 == 0);
+          // There's no support (yet) for lossless extra float channels, so
+          // lossless only runs when there is no extra channel of any kind.
+          if (lossless && (has_alpha || !extra_channels.empty())) continue;
+          const JxlPixelFormat pixel_format = {num_channels, JXL_TYPE_FLOAT,
+                                               JXL_NATIVE_ENDIAN, 0};
+          VerifyRoundtripCompression<float>(
+              63, 129, pixel_format, pixel_format, lossless, use_container, 1,
+              false, extra_channels);
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, Uint16FrameRoundtripTest) {
+  using Extras = std::vector<std::pair<JxlExtraChannelType, std::string>>;
+  const std::vector<Extras> cases = {
+      {},
+      {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}},
+      {{JXL_CHANNEL_CFA, "my cfa channel"}},
+      {{JXL_CHANNEL_CFA, "my cfa channel"}, {JXL_CHANNEL_BLACK, "k_channel"}},
+      {{JXL_CHANNEL_DEPTH, "very deep"}}};
+  for (const bool use_container : {false, true}) {
+    for (const bool lossless : {false, true}) {
+      for (uint32_t num_channels = 1; num_channels <= 4; ++num_channels) {
+        for (const Extras& extra_channels : cases) {
+          const JxlPixelFormat pixel_format = {num_channels, JXL_TYPE_UINT16,
+                                               JXL_NATIVE_ENDIAN, 0};
+          VerifyRoundtripCompression<uint16_t>(
+              63, 129, pixel_format, pixel_format, lossless, use_container, 1,
+              false, extra_channels);
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, Uint8FrameRoundtripTest) {
+  using Extras = std::vector<std::pair<JxlExtraChannelType, std::string>>;
+  const std::vector<Extras> cases = {
+      {},
+      {{JXL_CHANNEL_THERMAL, "temperature"}},
+      {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}},
+      {{JXL_CHANNEL_CFA, "my cfa channel"}},
+      {{JXL_CHANNEL_CFA, "my cfa channel"}, {JXL_CHANNEL_BLACK, "k_channel"}},
+      {{JXL_CHANNEL_DEPTH, "very deep"}}};
+  for (const bool use_container : {false, true}) {
+    for (const bool lossless : {false, true}) {
+      for (uint32_t num_channels = 1; num_channels <= 4; ++num_channels) {
+        for (const Extras& extra_channels : cases) {
+          const JxlPixelFormat pixel_format = {num_channels, JXL_TYPE_UINT8,
+                                               JXL_NATIVE_ENDIAN, 0};
+          VerifyRoundtripCompression<uint8_t>(
+              63, 129, pixel_format, pixel_format, lossless, use_container, 1,
+              false, extra_channels);
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, TestNonlinearSrgbAsXybEncoded) {  // uint8 in, float out
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+      JxlPixelFormat pixel_format_in =
+          JxlPixelFormat{num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+      JxlPixelFormat pixel_format_out =
+          JxlPixelFormat{num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+      VerifyRoundtripCompression<uint8_t>(
+          63, 129, pixel_format_in, pixel_format_out,
+          /*lossless=*/false, (bool)use_container);  // default resampling (1)
+    }
+  }
+}
+
+TEST(RoundtripTest, Resampling) {
+  const JxlPixelFormat rgb8 = {3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+  const uint32_t kResampling = 2;
+  // Lossy, odd dimensions, frame not marked as already downsampled.
+  VerifyRoundtripCompression<uint8_t>(
+      63, 129, rgb8, rgb8, /*lossless=*/false, /*use_container=*/false,
+      kResampling, /*already_downsampled=*/false);
+
+  // TODO(lode): also make this work for odd sizes. This requires a fix in
+  // enc_frame.cc to not set custom_size_or_origin to true due to even/odd
+  // mismatch.
+  // Lossless, even dimensions, frame marked as already downsampled.
+  VerifyRoundtripCompression<uint8_t>(
+      64, 128, rgb8, rgb8, /*lossless=*/true, /*use_container=*/false,
+      kResampling, /*already_downsampled=*/true);
+}
+
+TEST(RoundtripTest, ExtraBoxesTest) {  // decode with a "crud" box appended
+  JxlPixelFormat pixel_format =
+      JxlPixelFormat{4, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+  const size_t xsize = 61;
+  const size_t ysize = 71;
+
+  const std::vector<uint8_t> original_bytes =
+      GetTestImage<float>(xsize, ysize, pixel_format);
+  jxl::CodecInOut original_io =
+      ConvertTestImage(original_bytes, xsize, ysize, pixel_format, {});
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, true));
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlColorEncoding color_encoding;
+  if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+    JxlColorEncodingSetToLinearSRGB(&color_encoding,
+                                    /*is_gray=*/pixel_format.num_channels < 3);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/pixel_format.num_channels < 3);
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, nullptr);
+  JxlEncoderSetFrameLossless(frame_settings, false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  std::vector<uint8_t> extra_data(1023);  // zero-filled payload of the box
+  jxl::AppendBoxHeader(jxl::MakeBoxType("crud"), extra_data.size(), false,
+                       &compressed);  // header first, payload on next line
+  compressed.insert(compressed.end(), extra_data.begin(), extra_data.end());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &pixel_format, &buffer_size));
+  EXPECT_EQ(buffer_size, original_bytes.size());
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t icc_profile_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, &pixel_format,
+                                        JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &icc_profile_size));
+  jxl::PaddedBytes icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(
+                dec, &pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
+                icc_profile.data(), icc_profile.size()));
+
+  std::vector<uint8_t> decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(dec, &pixel_format,
+                                                         decoded_bytes.data(),
+                                                         decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+
+  jxl::CodecInOut decoded_io =
+      ConvertTestImage(decoded_bytes, xsize, ysize, pixel_format, icc_profile);
+
+  jxl::ButteraugliParams ba;
+  float butteraugli_score = ButteraugliDistance(
+      original_io.frames, decoded_io.frames, ba, jxl::GetJxlCms(),
+      /*distmap=*/nullptr, nullptr);
+  EXPECT_LE(butteraugli_score, 2.0f);  // lossy encode, so compare by distance
+}
+
+static const unsigned char kEncodedTestProfile[] = {  // input for ReadICC()
+    0x1f, 0x8b, 0x1,  0x13, 0x10, 0x0,  0x0,  0x0,  0x20, 0x4c, 0xcc, 0x3,
+    0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26,
+    0x57, 0x54, 0xef, 0x0,  0xe8, 0x97, 0x2,  0xce, 0xa1, 0xd7, 0x85, 0x16,
+    0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c,
+    0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4,  0x75, 0x12, 0xc9, 0xcc,
+    0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae,
+    0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd,  0x40,
+    0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7,  0xa6, 0xb9,
+    0x27, 0x92, 0x38, 0x0,  0x3,  0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf,
+    0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66,
+    0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4,
+    0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd,
+    0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f,
+    0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93,
+    0x92, 0x3,  0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7,
+    0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7,  0x38, 0x38, 0xd4, 0xa,  0x66,
+    0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb,  0x7a, 0x24,
+    0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3,
+    0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8,  0xa8, 0xc4, 0x2a, 0x86,
+    0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6,
+    0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9,
+    0x8,  0x98, 0xe1, 0x21, 0x4a, 0x9,  0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0,
+    0x69, 0x1a, 0xeb, 0x52, 0x1,  0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29,
+    0x70, 0xee, 0x4,  0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff,
+    0xfe, 0xdb, 0xaf, 0x8,  0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8,
+    0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8,  0xe9, 0x7,  0xee, 0x4b, 0x80, 0xda,
+    0x4a, 0x4,  0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60,
+    0xb,  0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6,  0x20, 0xb8, 0x64, 0x18,
+    0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b,
+    0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9,  0xf7,
+    0x72, 0xf0, 0x7a, 0xe,  0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc,
+    0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28,
+    0xc1, 0xa7, 0x59, 0xe3, 0x0,
+};
+
+TEST(RoundtripTest, TestICCProfile) {
+  // JxlEncoderSetICCProfile parses the ICC profile, so a valid profile is
+  // needed. The profile should be passed correctly through the roundtrip.
+  jxl::BitReader reader(jxl::Span<const uint8_t>(kEncodedTestProfile,
+                                                 sizeof(kEncodedTestProfile)));
+  jxl::PaddedBytes icc;  // profile bytes produced by ReadICC below
+  ASSERT_TRUE(ReadICC(&reader, &icc));
+  ASSERT_TRUE(reader.Close());
+
+  JxlPixelFormat format =
+      JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+
+  size_t xsize = 25;
+  size_t ysize = 37;
+  const std::vector<uint8_t> original_bytes =
+      GetTestImage<uint8_t>(xsize, ysize, format);
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetICCProfile(enc, icc.data(), icc.size()));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, nullptr);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(buffer_size, original_bytes.size());
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t dec_icc_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_icc_size));
+  EXPECT_EQ(icc.size(), dec_icc_size);
+  jxl::PaddedBytes dec_icc(dec_icc_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(dec, &format,
+                                           JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                           dec_icc.data(), dec_icc.size()));
+
+  std::vector<uint8_t> decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(dec, &format, decoded_bytes.data(),
+                                        decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(icc, dec_icc);  // decoded profile must equal the one passed in
+
+  JxlDecoderDestroy(dec);
+}
+
+#if JPEGXL_ENABLE_JPEG  // Loading .jpg files requires libjpeg support.
+TEST(RoundtripTest, JXL_TRANSCODE_JPEG_TEST(TestJPEGReconstruction)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+  jxl::CodecInOut orig_io;
+  ASSERT_TRUE(
+      SetFromBytes(jxl::Span<const uint8_t>(orig), &orig_io, /*pool=*/nullptr));
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc.get(), &compressed);
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get()));
+  std::vector<uint8_t> reconstructed_buffer(128);  // doubled in the loop below
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(),
+                                    reconstructed_buffer.size()));
+  size_t used = 0;  // buffer size minus what ReleaseJPEGBuffer reports back
+  JxlDecoderStatus dec_process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+  while (dec_process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+    used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+    reconstructed_buffer.resize(reconstructed_buffer.size() * 2);
+    EXPECT_EQ(
+        JXL_DEC_SUCCESS,
+        JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used,
+                                reconstructed_buffer.size() - used));
+    dec_process_result = JxlDecoderProcessInput(dec.get());
+  }
+  ASSERT_EQ(JXL_DEC_FULL_IMAGE, dec_process_result);
+  used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+  ASSERT_EQ(used, orig.size());  // reconstruction must be byte-exact
+  EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), orig.data(), used));
+}
+#endif  // JPEGXL_ENABLE_JPEG
diff --git a/third_party/jpeg-xl/lib/jxl/sanitizers.h b/third_party/jpeg-xl/lib/jxl/sanitizers.h
new file mode 100644
index 0000000000..ce0bd8dc63
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/sanitizers.h
@@ -0,0 +1,242 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SANITIZERS_H_
+#define LIB_JXL_SANITIZERS_H_
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/sanitizer_definitions.h"
+#include "lib/jxl/image.h"
+
+#if JXL_MEMORY_SANITIZER
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "sanitizer/msan_interface.h"
+#endif
+
+namespace jxl {
+namespace msan {
+
+#if JXL_MEMORY_SANITIZER
+
+// Chosen so that kSanitizerSentinel is four copies of kSanitizerSentinelByte.
+constexpr uint8_t kSanitizerSentinelByte = 0x48;
+constexpr float kSanitizerSentinel = 205089.125f;
+
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const volatile void* m,
+                                                     size_t size) {
+  __msan_poison(m, size);  // MSan: treat |size| bytes at |m| as uninitialized
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const volatile void* m,
+                                                       size_t size) {
+  __msan_unpoison(m, size);  // MSan: treat |size| bytes at |m| as initialized
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonCStr(const char* c) {
+  do {
+    UnpoisonMemory(c, 1);  // unpoison the byte before the loop test reads it
+  } while (*c++);  // the terminating NUL is unpoisoned too, then we stop
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void MemoryIsInitialized(
+    const volatile void* m, size_t size) {
+  __msan_check_mem_is_initialized(m, size);  // MSan reports if any is poisoned
+}
+
+// Poisons every byte of |im|: bytes_per_row() * ysize(), padding included.
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) {
+  PoisonMemory(im.bytes(), im.bytes_per_row() * im.ysize());
+}
+
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Image3<T>& im) {
+  // Poison each of the three planes in turn; the single-plane overload
+  // covers the padding bytes of every row as well.
+  for (size_t c = 0; c < 3; c++) PoisonImage(im.Plane(c));
+}
+
+// Print the uninitialized regions of an image.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PrintImageUninitialized(
+    const Plane<T>& im) {
+  fprintf(stderr,
+          "Uninitialized regions for image of size %" PRIu64 "x%" PRIu64 ":\n",
+          static_cast<uint64_t>(im.xsize()), static_cast<uint64_t>(im.ysize()));
+
+  // A segment of uninitialized pixels in a row, in the format [first, second).
+  typedef std::pair<size_t, size_t> PixelSegment;
+
+  // Helper class to merge and print a list of rows of PixelSegment that may be
+  // the same over big ranges of rows. This compacts the output to ranges of
+  // rows like "[y0, y1): [x0, x1) [x2, x3)".
+  class RowsMerger {
+   public:
+    // Add a new row to the list of rows. If the row is the same as the
+    // previous one it will be merged showing a range of rows [y0, y1), but if
+    // the new row is different the current range of rows (if any) will be
+    // printed and a new one will be started.
+    void AddRow(size_t y, std::vector<PixelSegment>&& new_row) {
+      if (start_y_ != -1 && new_row != segments_) {
+        PrintRow(y);
+      }
+      if (new_row.empty()) {
+        // Skip ranges with no uninitialized pixels.
+        start_y_ = -1;
+        segments_.clear();
+        return;
+      }
+      if (start_y_ == -1) {
+        start_y_ = y;
+        segments_ = std::move(new_row);
+      }
+    }
+
+    // Print the contents of the range of rows [start_y_, end_y) if any.
+    void PrintRow(size_t end_y) {
+      if (start_y_ == -1) return;
+      if (segments_.empty()) {
+        start_y_ = -1;
+        return;
+      }
+      if (end_y - start_y_ > 1) {
+        fprintf(stderr, " y=[%" PRId64 ", %" PRIu64 "):",
+                static_cast<int64_t>(start_y_), static_cast<uint64_t>(end_y));
+      } else {
+        fprintf(stderr, " y=[%" PRId64 "]:", static_cast<int64_t>(start_y_));
+      }
+      for (const auto& seg : segments_) {
+        if (seg.first + 1 == seg.second) {
+          fprintf(stderr, " [%" PRId64 "]", static_cast<int64_t>(seg.first));
+        } else {
+          fprintf(stderr, " [%" PRId64 ", %" PRIu64 ")",
+                  static_cast<int64_t>(seg.first),
+                  static_cast<uint64_t>(seg.second));
+        }
+      }
+      fprintf(stderr, "\n");
+      start_y_ = -1;  // the range has been printed; no range is open now
+    }
+
+   private:
+    std::vector<PixelSegment> segments_;
+    // Row number of the first row in the range of rows that have |segments_|
+    // as the undefined segments; -1 when no range is open.
+    ssize_t start_y_ = -1;
+  } rows_merger;
+
+  class SegmentsMerger {  // merges consecutive x values into PixelSegments
+   public:
+    void AddValue(size_t x) {
+      if (row.empty() || row.back().second != x) {
+        row.emplace_back(x, x + 1);
+      } else {
+        row.back().second = x + 1;  // x extends the last segment by one pixel
+      }
+    }
+
+    std::vector<PixelSegment> row;
+  };
+
+  for (size_t y = 0; y < im.ysize(); y++) {
+    auto* row = im.Row(y);
+    SegmentsMerger seg_merger;
+    size_t x = 0;
+    while (x < im.xsize()) {
+      intptr_t ret =
+          __msan_test_shadow(row + x, (im.xsize() - x) * sizeof(row[0]));
+      if (ret < 0) break;  // no poisoned byte in the remainder of this row
+      size_t next_x = x + ret / sizeof(row[0]);
+      seg_merger.AddValue(next_x);
+      x = next_x + 1;  // resume the scan right after the poisoned pixel
+    }
+    rows_merger.AddRow(y, std::move(seg_merger.row));
+  }
+  rows_merger.PrintRow(im.ysize());  // flush the last open range, if any
+}
+
+// Check that all the pixels in the provided rect of the image are initialized
+// (not poisoned). If any of the values is poisoned it will abort.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized(
+    const Plane<T>& im, const Rect& r, size_t c, const char* message) {
+  JXL_ASSERT(r.x0() <= im.xsize());
+  JXL_ASSERT(r.x0() + r.xsize() <= im.xsize());
+  JXL_ASSERT(r.y0() <= im.ysize());
+  JXL_ASSERT(r.y0() + r.ysize() <= im.ysize());
+  for (size_t y = r.y0(); y < r.y0() + r.ysize(); y++) {
+    const auto* row = im.Row(y);
+    intptr_t ret = __msan_test_shadow(row + r.x0(), sizeof(*row) * r.xsize());
+    if (ret != -1) {  // -1 means no poisoned byte was found in this row
+      JXL_DEBUG(
+          1,
+          "Checking an image of %" PRIu64 " x %" PRIu64 ", rect x0=%" PRIu64
+          ", y0=%" PRIu64
+          ", "
+          "xsize=%" PRIu64 ", ysize=%" PRIu64,
+          static_cast<uint64_t>(im.xsize()), static_cast<uint64_t>(im.ysize()),
+          static_cast<uint64_t>(r.x0()), static_cast<uint64_t>(r.y0()),
+          static_cast<uint64_t>(r.xsize()), static_cast<uint64_t>(r.ysize()));
+      size_t x = ret / sizeof(*row);  // byte offset -> pixel offset
+      JXL_DEBUG(1,
+                "CheckImageInitialized failed at x=%" PRIu64 ", y=%" PRIu64
+                ", c=%" PRIu64 ": %s",
+                static_cast<uint64_t>(r.x0() + x), static_cast<uint64_t>(y),
+                static_cast<uint64_t>(c), message ? message : "");
+      PrintImageUninitialized(im);  // dump every poisoned region of the plane
+    }
+    // This will report an error if memory is not initialized.
+    __msan_check_mem_is_initialized(row + r.x0(), sizeof(*row) * r.xsize());
+  }
+}
+
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized(
+    const Image3<T>& im, const Rect& r, const char* message) {
+  const std::string prefix(message ? message : "");  // tolerate null message
+  for (size_t c = 0; c < 3; c++) {
+    const std::string str_message = prefix + " c=" + std::to_string(c);
+    CheckImageInitialized(im.Plane(c), r, c, str_message.c_str());
+  }
+}
+
+#define JXL_CHECK_IMAGE_INITIALIZED(im, r) \
+  ::jxl::msan::CheckImageInitialized(im, r, "im=" #im ", r=" #r);
+
+#define JXL_CHECK_PLANE_INITIALIZED(im, r, c) \
+  ::jxl::msan::CheckImageInitialized(im, r, c, "im=" #im ", r=" #r ", c=" #c);
+
+#else  // JXL_MEMORY_SANITIZER
+
+// In non-msan mode these functions don't use volatile since it is not needed
+// for the empty functions.
+
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const void*, size_t) {}
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const void*, size_t) {}
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonCStr(const char*) {}
+static JXL_INLINE JXL_MAYBE_UNUSED void MemoryIsInitialized(const void*,
+                                                            size_t) {}
+// No-op PoisonImage overloads mirroring the MSan build (PlaneBase, Image3<T>).
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase&) {}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Image3<T>&) {}
+
+#define JXL_CHECK_IMAGE_INITIALIZED(im, r)
+#define JXL_CHECK_PLANE_INITIALIZED(im, r, c)
+
+#endif
+
+}  // namespace msan
+}  // namespace jxl
+
+#endif  // LIB_JXL_SANITIZERS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/simd_util-inl.h b/third_party/jpeg-xl/lib/jxl/simd_util-inl.h
new file mode 100644
index 0000000000..77b207ffe8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/simd_util-inl.h
@@ -0,0 +1,349 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Misc utilities for SIMD operations
+
+#if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_SIMD_UTIL_INL_H_
+#undef LIB_JXL_SIMD_UTIL_INL_H_
+#else
+#define LIB_JXL_SIMD_UTIL_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+#if HWY_CAP_GE512
+using hwy::HWY_NAMESPACE::Half;
+using hwy::HWY_NAMESPACE::Vec;
+template <size_t i, class DF, class V>
+HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
+  const Half<DF> dh;        // descriptor for half of a DF vector
+  const Half<Half<DF>> dq;  // descriptor for a quarter of a DF vector
+  const auto half_v = i >= 2 ? UpperHalf(dh, v) : LowerHalf(dh, v);
+  return i & 1 ? UpperHalf(dq, half_v) : LowerHalf(dq, half_v);
+}
+
+template <class DF, class V>
+HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
+  const Half<DF> dh;  // v0 ends up in the lowest quarter, v3 in the highest
+  return Combine(df, Combine(dh, v3, v2), Combine(dh, v1, v0));
+}
+
+#endif
+
+// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
+// aligned and have room for 2 * Lanes(df) values.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+#elif !HWY_CAP_GE256
+  Store(InterleaveLower(df, v0, v1), df, mem);
+  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
+#else  // HWY_CAP_GE256
+  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(ConcatLowerLower(df, t1, t0), df, mem);
+    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
+  } else {
+#if HWY_CAP_GE512
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
+                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
+          df, mem);
+    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
+                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
+          df, mem + Lanes(df));
+#endif  // HWY_CAP_GE512
+  }
+#endif  // HWY_TARGET == HWY_SCALAR
+}
+
+// Stores v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must
+// be aligned and have room for 4 * Lanes(df) values.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+  Store(v2, df, mem + 2);
+  Store(v3, df, mem + 3);
+#elif !HWY_CAP_GE256
+  auto t0 = InterleaveLower(df, v0, v2);
+  auto t1 = InterleaveLower(df, v1, v3);
+  auto t2 = InterleaveUpper(df, v0, v2);
+  auto t3 = InterleaveUpper(df, v1, v3);
+  Store(InterleaveLower(df, t0, t1), df, mem);
+  Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df));
+  Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df));
+  Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df));
+#elif !HWY_CAP_GE512
+  auto t0 = InterleaveLower(df, v0, v2);
+  auto t1 = InterleaveLower(df, v1, v3);
+  auto t2 = InterleaveUpper(df, v0, v2);
+  auto t3 = InterleaveUpper(df, v1, v3);
+
+  auto m0 = InterleaveLower(df, t0, t1);
+  auto m1 = InterleaveUpper(df, t0, t1);
+  auto m2 = InterleaveLower(df, t2, t3);
+  auto m3 = InterleaveUpper(df, t2, t3);
+
+  Store(ConcatLowerLower(df, m1, m0), df, mem);
+  Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df));
+  Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df));
+  Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df));
+#else  // HWY_CAP_GE512
+  auto t0 = InterleaveLower(df, v0, v2);
+  auto t1 = InterleaveLower(df, v1, v3);
+  auto t2 = InterleaveUpper(df, v0, v2);
+  auto t3 = InterleaveUpper(df, v1, v3);
+
+  auto m0 = InterleaveLower(df, t0, t1);
+  auto m1 = InterleaveUpper(df, t0, t1);
+  auto m2 = InterleaveLower(df, t2, t3);
+  auto m3 = InterleaveUpper(df, t2, t3);
+
+  Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2),
+                Quarter<0>(df, m3)),
+        df, mem);
+  Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2),
+                Quarter<1>(df, m3)),
+        df, mem + Lanes(df));
+  Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2),
+                Quarter<2>(df, m3)),
+        df, mem + 2 * Lanes(df));
+  Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2),
+                Quarter<3>(df, m3)),
+        df, mem + 3 * Lanes(df));
+#endif  // HWY_TARGET == HWY_SCALAR
+}
+
+// Stores v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to
+// mem, in this order. Mem must be aligned and hold 8 * Lanes(df) values.
+template <class DF, class V>
+void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6,
+                      V v7, float* mem) {
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+  Store(v2, df, mem + 2);
+  Store(v3, df, mem + 3);
+  Store(v4, df, mem + 4);
+  Store(v5, df, mem + 5);
+  Store(v6, df, mem + 6);
+  Store(v7, df, mem + 7);
+#elif !HWY_CAP_GE256
+  auto t0 = InterleaveLower(df, v0, v4);
+  auto t1 = InterleaveLower(df, v1, v5);
+  auto t2 = InterleaveLower(df, v2, v6);
+  auto t3 = InterleaveLower(df, v3, v7);
+  auto t4 = InterleaveUpper(df, v0, v4);
+  auto t5 = InterleaveUpper(df, v1, v5);
+  auto t6 = InterleaveUpper(df, v2, v6);
+  auto t7 = InterleaveUpper(df, v3, v7);
+
+  auto w0 = InterleaveLower(df, t0, t2);
+  auto w1 = InterleaveLower(df, t1, t3);
+  auto w2 = InterleaveUpper(df, t0, t2);
+  auto w3 = InterleaveUpper(df, t1, t3);
+  auto w4 = InterleaveLower(df, t4, t6);
+  auto w5 = InterleaveLower(df, t5, t7);
+  auto w6 = InterleaveUpper(df, t4, t6);
+  auto w7 = InterleaveUpper(df, t5, t7);
+
+  Store(InterleaveLower(df, w0, w1), df, mem);
+  Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df));
+  Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df));
+  Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df));
+  Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df));
+  Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df));
+  Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df));
+  Store(InterleaveUpper(df, w6, w7), df, mem + 7 * Lanes(df));
+#elif !HWY_CAP_GE512
+  auto t0 = InterleaveLower(df, v0, v4);
+  auto t1 = InterleaveLower(df, v1, v5);
+  auto t2 = InterleaveLower(df, v2, v6);
+  auto t3 = InterleaveLower(df, v3, v7);
+  auto t4 = InterleaveUpper(df, v0, v4);
+  auto t5 = InterleaveUpper(df, v1, v5);
+  auto t6 = InterleaveUpper(df, v2, v6);
+  auto t7 = InterleaveUpper(df, v3, v7);
+
+  auto w0 = InterleaveLower(df, t0, t2);
+  auto w1 = InterleaveLower(df, t1, t3);
+  auto w2 = InterleaveUpper(df, t0, t2);
+  auto w3 = InterleaveUpper(df, t1, t3);
+  auto w4 = InterleaveLower(df, t4, t6);
+  auto w5 = InterleaveLower(df, t5, t7);
+  auto w6 = InterleaveUpper(df, t4, t6);
+  auto w7 = InterleaveUpper(df, t5, t7);
+
+  auto m0 = InterleaveLower(df, w0, w1);
+  auto m1 = InterleaveUpper(df, w0, w1);
+  auto m2 = InterleaveLower(df, w2, w3);
+  auto m3 = InterleaveUpper(df, w2, w3);
+  auto m4 = InterleaveLower(df, w4, w5);
+  auto m5 = InterleaveUpper(df, w4, w5);
+  auto m6 = InterleaveLower(df, w6, w7);
+  auto m7 = InterleaveUpper(df, w6, w7);
+
+  Store(ConcatLowerLower(df, m1, m0), df, mem);
+  Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df));
+  Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df));
+  Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df));
+  Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df));
+  Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df));
+  Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df));
+  Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df));
+#else  // HWY_CAP_GE512
+  auto t0 = InterleaveLower(df, v0, v4);
+  auto t1 = InterleaveLower(df, v1, v5);
+  auto t2 = InterleaveLower(df, v2, v6);
+  auto t3 = InterleaveLower(df, v3, v7);
+  auto t4 = InterleaveUpper(df, v0, v4);
+  auto t5 = InterleaveUpper(df, v1, v5);
+  auto t6 = InterleaveUpper(df, v2, v6);
+  auto t7 = InterleaveUpper(df, v3, v7);
+
+  auto w0 = InterleaveLower(df, t0, t2);
+  auto w1 = InterleaveLower(df, t1, t3);
+  auto w2 = InterleaveUpper(df, t0, t2);
+  auto w3 = InterleaveUpper(df, t1, t3);
+  auto w4 = InterleaveLower(df, t4, t6);
+  auto w5 = InterleaveLower(df, t5, t7);
+  auto w6 = InterleaveUpper(df, t4, t6);
+  auto w7 = InterleaveUpper(df, t5, t7);
+
+  auto m0 = InterleaveLower(df, w0, w1);
+  auto m1 = InterleaveUpper(df, w0, w1);
+  auto m2 = InterleaveLower(df, w2, w3);
+  auto m3 = InterleaveUpper(df, w2, w3);
+  auto m4 = InterleaveLower(df, w4, w5);
+  auto m5 = InterleaveUpper(df, w4, w5);
+  auto m6 = InterleaveLower(df, w6, w7);
+  auto m7 = InterleaveUpper(df, w6, w7);
+
+  Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2),
+                Quarter<0>(df, m3)),
+        df, mem);
+  Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6),
+                Quarter<0>(df, m7)),
+        df, mem + Lanes(df));
+  Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2),
+                Quarter<1>(df, m3)),
+        df, mem + 2 * Lanes(df));
+  Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6),
+                Quarter<1>(df, m7)),
+        df, mem + 3 * Lanes(df));
+  Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2),
+                Quarter<2>(df, m3)),
+        df, mem + 4 * Lanes(df));
+  Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6),
+                Quarter<2>(df, m7)),
+        df, mem + 5 * Lanes(df));
+  Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2),
+                Quarter<3>(df, m3)),
+        df, mem + 6 * Lanes(df));
+  Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6),
+                Quarter<3>(df, m7)),
+        df, mem + 7 * Lanes(df));
+#endif  // HWY_TARGET == HWY_SCALAR
+}
+
+#if HWY_CAP_GE256
+JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
+                                  int32_t* JXL_RESTRICT to, size_t fromstride) {
+  const HWY_CAPPED(int32_t, 8) d;  // at most 8 lanes: one 8-element row
+  auto i0 = Load(d, from);  // input rows are |fromstride| elements apart
+  auto i1 = Load(d, from + 1 * fromstride);
+  auto i2 = Load(d, from + 2 * fromstride);
+  auto i3 = Load(d, from + 3 * fromstride);
+  auto i4 = Load(d, from + 4 * fromstride);
+  auto i5 = Load(d, from + 5 * fromstride);
+  auto i6 = Load(d, from + 6 * fromstride);
+  auto i7 = Load(d, from + 7 * fromstride);
+
+  const auto q0 = InterleaveLower(d, i0, i2);
+  const auto q1 = InterleaveLower(d, i1, i3);
+  const auto q2 = InterleaveUpper(d, i0, i2);
+  const auto q3 = InterleaveUpper(d, i1, i3);
+  const auto q4 = InterleaveLower(d, i4, i6);
+  const auto q5 = InterleaveLower(d, i5, i7);
+  const auto q6 = InterleaveUpper(d, i4, i6);
+  const auto q7 = InterleaveUpper(d, i5, i7);
+
+  const auto r0 = InterleaveLower(d, q0, q1);
+  const auto r1 = InterleaveUpper(d, q0, q1);
+  const auto r2 = InterleaveLower(d, q2, q3);
+  const auto r3 = InterleaveUpper(d, q2, q3);
+  const auto r4 = InterleaveLower(d, q4, q5);
+  const auto r5 = InterleaveUpper(d, q4, q5);
+  const auto r6 = InterleaveLower(d, q6, q7);
+  const auto r7 = InterleaveUpper(d, q6, q7);
+
+  i0 = ConcatLowerLower(d, r4, r0);  // i0..i7 are reused for the output rows
+  i1 = ConcatLowerLower(d, r5, r1);
+  i2 = ConcatLowerLower(d, r6, r2);
+  i3 = ConcatLowerLower(d, r7, r3);
+  i4 = ConcatUpperUpper(d, r4, r0);
+  i5 = ConcatUpperUpper(d, r5, r1);
+  i6 = ConcatUpperUpper(d, r6, r2);
+  i7 = ConcatUpperUpper(d, r7, r3);
+
+  Store(i0, d, to);  // output rows are written with a fixed stride of 8
+  Store(i1, d, to + 1 * 8);
+  Store(i2, d, to + 2 * 8);
+  Store(i3, d, to + 3 * 8);
+  Store(i4, d, to + 4 * 8);
+  Store(i5, d, to + 5 * 8);
+  Store(i6, d, to + 6 * 8);
+  Store(i7, d, to + 7 * 8);
+}
+#elif HWY_TARGET != HWY_SCALAR
+JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
+                                  int32_t* JXL_RESTRICT to, size_t fromstride) {
+  const HWY_CAPPED(int32_t, 4) d;  // at most 4 lanes: work on 4x4 tiles
+  for (size_t n = 0; n < 8; n += 4) {    // n: first source row of the tile
+    for (size_t m = 0; m < 8; m += 4) {  // m: first source column of the tile
+      auto p0 = Load(d, from + n * fromstride + m);
+      auto p1 = Load(d, from + (n + 1) * fromstride + m);
+      auto p2 = Load(d, from + (n + 2) * fromstride + m);
+      auto p3 = Load(d, from + (n + 3) * fromstride + m);
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      Store(r0, d, to + m * 8 + n);  // tile lands at row m, column n of |to|
+      Store(r1, d, to + (1 + m) * 8 + n);
+      Store(r2, d, to + (2 + m) * 8 + n);
+      Store(r3, d, to + (3 + m) * 8 + n);
+    }
+  }
+}
+
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_SIMD_UTIL_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/simd_util_test.cc b/third_party/jpeg-xl/lib/jxl/simd_util_test.cc
new file mode 100644
index 0000000000..b81f5d1279
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/simd_util_test.cc
@@ -0,0 +1,84 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/simd_util_test.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+HWY_NOINLINE void TestInterleave2() {
+  HWY_FULL(float) d;
+  const auto in0 = Iota(d, 0 * 128.0);
+  const auto in1 = Iota(d, 1 * 128.0);
+  HWY_ALIGN float mem[MaxLanes(d) * 2];  // in[j][i] lands at mem[2 * i + j]
+  StoreInterleaved(d, in0, in1, mem);
+  for (size_t k = 0; k < 2 * Lanes(d); k++) {
+    const size_t i = k / 2;
+    const size_t j = k % 2;
+    EXPECT_EQ(mem[2 * i + j], j * 128 + i) << "i: " << i << " j: " << j;
+  }
+}
+HWY_NOINLINE void TestInterleave4() {
+  HWY_FULL(float) d;
+  const auto in0 = Iota(d, 0 * 128.0);
+  const auto in1 = Iota(d, 1 * 128.0);
+  const auto in2 = Iota(d, 2 * 128.0);
+  const auto in3 = Iota(d, 3 * 128.0);
+  HWY_ALIGN float mem[MaxLanes(d) * 4];  // in[j][i] lands at mem[4 * i + j]
+  StoreInterleaved(d, in0, in1, in2, in3, mem);
+  for (size_t k = 0; k < 4 * Lanes(d); k++) {
+    const size_t i = k / 4;
+    const size_t j = k % 4;
+    EXPECT_EQ(mem[4 * i + j], j * 128 + i) << "i: " << i << " j: " << j;
+  }
+}
+HWY_NOINLINE void TestInterleave8() {
+  HWY_FULL(float) d;
+  const auto in0 = Iota(d, 0 * 128.0);
+  const auto in1 = Iota(d, 1 * 128.0);
+  const auto in2 = Iota(d, 2 * 128.0);
+  const auto in3 = Iota(d, 3 * 128.0);
+  const auto in4 = Iota(d, 4 * 128.0);
+  const auto in5 = Iota(d, 5 * 128.0);
+  const auto in6 = Iota(d, 6 * 128.0);
+  const auto in7 = Iota(d, 7 * 128.0);
+  HWY_ALIGN float mem[MaxLanes(d) * 8];  // in[j][i] lands at mem[8 * i + j]
+  StoreInterleaved(d, in0, in1, in2, in3, in4, in5, in6, in7, mem);
+  for (size_t k = 0; k < 8 * Lanes(d); k++) {
+    const size_t i = k / 8;
+    const size_t j = k % 8;
+    EXPECT_EQ(mem[8 * i + j], j * 128 + i) << "i: " << i << " j: " << j;
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class SimdUtilTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(SimdUtilTargetTest);
+
+HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave2);
+HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave4);
+HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave8);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc b/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc
new file mode 100644
index 0000000000..b3f30c3e4c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc
@@ -0,0 +1,108 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+// One parameter combination for SpeedTierTest.
+struct SpeedTierTestParams {
+  explicit SpeedTierTestParams(const SpeedTier tier, const bool shrink = false)
+      : speed_tier(tier), shrink8(shrink) {}
+  SpeedTier speed_tier;  // encoder speed tier to test
+  bool shrink8;          // whether the input is shrunk to 1/8 per dimension
+};
+
+std::ostream& operator<<(std::ostream& os, SpeedTierTestParams params) {
+  const auto saved_flags = os.flags();  // restored before returning
+  os << std::boolalpha << "SpeedTierTestParams{"
+     << static_cast<size_t>(params.speed_tier) << ", /*shrink8=*/"
+     << params.shrink8 << "}";
+  os.flags(saved_flags);
+  return os;
+}
+
+class SpeedTierTest : public testing::TestWithParam<SpeedTierTestParams> {};
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    SpeedTierTestInstantiation, SpeedTierTest,
+    testing::Values(SpeedTierTestParams{SpeedTier::kCheetah,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kCheetah,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kThunder,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kThunder,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kLightning,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kLightning,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kFalcon,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kFalcon,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kHare,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kHare,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kWombat,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kWombat,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kSquirrel,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kSquirrel,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kKitten,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kKitten,
+                                        /*shrink8=*/false},
+                    // Only downscaled image for Tortoise mode.
+                    SpeedTierTestParams{SpeedTier::kTortoise,
+                                        /*shrink8=*/true}));
+
+TEST_P(SpeedTierTest, Roundtrip) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  test::ThreadPoolForTests pool(8);
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  const SpeedTierTestParams& params = GetParam();
+
+  if (params.shrink8) {
+    io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);  // 1/8 of each dimension
+  }
+
+  CompressParams cparams;
+  cparams.speed_tier = params.speed_tier;
+
+  CodecInOut io2;  // receives the decoded result of the roundtrip
+  JXL_EXPECT_OK(test::Roundtrip(&io, cparams, {}, &io2, _));
+
+  // Can be 2.2 in non-hare mode.
+  EXPECT_LE(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr, /*pool=*/nullptr),
+      2.8);  // largest accepted distance between input and roundtrip output
+}
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/splines.cc b/third_party/jpeg-xl/lib/jxl/splines.cc
new file mode 100644
index 0000000000..04d1df8e49
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines.cc
@@ -0,0 +1,694 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/splines.h"
+
+#include <algorithm>
+#include <cmath>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/opsin_params.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/splines.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/fast_math-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::MulSub;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Given a set of DCT coefficients, this returns the result of performing cosine
+// interpolation on the original samples.
+float ContinuousIDCT(const float dct[32], const float t) {
+  // We compute here the DCT-3 of the `dct` vector, rescaled by a factor of
+  // sqrt(32). This is such that an input vector {x, 0, ..., 0} produces
+  // a constant result of x. dct[0] was scaled in Dequantize() to allow uniform
+  // treatment of all the coefficients.
+  constexpr float kMultipliers[32] = {
+      kPi / 32 * 0,  kPi / 32 * 1,  kPi / 32 * 2,  kPi / 32 * 3,  kPi / 32 * 4,
+      kPi / 32 * 5,  kPi / 32 * 6,  kPi / 32 * 7,  kPi / 32 * 8,  kPi / 32 * 9,
+      kPi / 32 * 10, kPi / 32 * 11, kPi / 32 * 12, kPi / 32 * 13, kPi / 32 * 14,
+      kPi / 32 * 15, kPi / 32 * 16, kPi / 32 * 17, kPi / 32 * 18, kPi / 32 * 19,
+      kPi / 32 * 20, kPi / 32 * 21, kPi / 32 * 22, kPi / 32 * 23, kPi / 32 * 24,
+      kPi / 32 * 25, kPi / 32 * 26, kPi / 32 * 27, kPi / 32 * 28, kPi / 32 * 29,
+      kPi / 32 * 30, kPi / 32 * 31,
+  };
+  HWY_CAPPED(float, 32) df;
+  auto result = Zero(df);
+  const auto tandhalf = Set(df, t + 0.5f);  // t + 1/2 in every lane.
+  for (int i = 0; i < 32; i += Lanes(df)) {  // Lanes(df) coefficients per step.
+    auto cos_arg = Mul(LoadU(df, kMultipliers + i), tandhalf);
+    auto cos = FastCosf(df, cos_arg);  // Argument: kPi / 32 * k * (t + 1/2).
+    auto local_res = Mul(LoadU(df, dct + i), cos);
+    result = MulAdd(Set(df, kSqrt2), local_res, result);  // += kSqrt2 * term.
+  }
+  return GetLane(SumOfLanes(df, result));  // Add up the per-lane partial sums.
+}
+
+template <typename DF>  // Applies `segment` to Lanes(df) pixels from (x, y).
+void DrawSegment(DF df, const SplineSegment& segment, const bool add,
+                 const size_t y, const size_t x, float* JXL_RESTRICT rows[3]) {
+  Rebind<int32_t, DF> di;
+  const auto inv_sigma = Set(df, segment.inv_sigma);
+  const auto half = Set(df, 0.5f);
+  const auto one_over_2s2 = Set(df, 0.353553391f);  // 1 / (2 * sqrt(2)).
+  const auto sigma_over_4_times_intensity =
+      Set(df, segment.sigma_over_4_times_intensity);
+  const auto dx = Sub(ConvertTo(df, Iota(di, x)), Set(df, segment.center_x));
+  const auto dy = Set(df, y - segment.center_y);  // Same row for all lanes.
+  const auto sqd = MulAdd(dx, dx, Mul(dy, dy));  // dx^2 + dy^2.
+  const auto distance = Sqrt(sqd);  // Pixel distance to the segment center.
+  const auto one_dimensional_factor =
+      Sub(FastErff(df, Mul(MulAdd(distance, half, one_over_2s2), inv_sigma)),
+          FastErff(df, Mul(MulSub(distance, half, one_over_2s2), inv_sigma)));
+  auto local_intensity =
+      Mul(sigma_over_4_times_intensity,
+          Mul(one_dimensional_factor, one_dimensional_factor));
+  for (size_t c = 0; c < 3; ++c) {  // Update all three channel rows in place.
+    const auto cm = Set(df, add ? segment.color[c] : -segment.color[c]);
+    const auto in = LoadU(df, rows[c] + x);
+    StoreU(MulAdd(cm, local_intensity, in), df, rows[c] + x);  // in +/- color.
+  }
+}
+
+void DrawSegment(const SplineSegment& segment, const bool add, const size_t y,
+                 const ssize_t x0, ssize_t x1, float* JXL_RESTRICT rows[3]) {
+  ssize_t x =  // First column: [x0, x1) clipped to the segment's reach.
+      std::max<ssize_t>(x0, segment.center_x - segment.maximum_distance + 0.5f);
+  // one-past-the-end
+  x1 =
+      std::min<ssize_t>(x1, segment.center_x + segment.maximum_distance + 1.5f);
+  HWY_FULL(float) df;
+  for (; x + static_cast<ssize_t>(Lanes(df)) <= x1; x += Lanes(df)) {
+    DrawSegment(df, segment, add, y, x, rows);  // Whole vectors while they fit.
+  }
+  for (; x < x1; ++x) {  // Leftover columns, one pixel per call.
+    DrawSegment(HWY_CAPPED(float, 1)(), segment, add, y, x, rows);
+  }
+}
+
+void ComputeSegments(const Spline::Point& center, const float intensity,
+                     const float color[3], const float sigma,
+                     std::vector<SplineSegment>& segments,
+                     std::vector<std::pair<size_t, size_t>>& segments_by_y) {
+  // Sanity check sigma, inverse sigma and intensity; skip the point if invalid.
+  if (!(std::isfinite(sigma) && sigma != 0.0f && std::isfinite(1.0f / sigma) &&
+        std::isfinite(intensity))) {
+    return;
+  }
+#if JXL_HIGH_PRECISION
+  constexpr float kDistanceExp = 5;
+#else
+  // About 30% faster.
+  constexpr float kDistanceExp = 3;
+#endif
+  // max_color is the largest |color[c] * intensity|, but never below 0.01.
+  float max_color = 0.01f;
+  for (size_t c = 0; c < 3; c++) {
+    max_color = std::max(max_color, std::abs(color[c] * intensity));
+  }
+  // Distance beyond which max_color*intensity*exp(-d^2 / (2 * sigma^2)) drops
+  // below 10^-kDistanceExp.
+  const float maximum_distance =
+      std::sqrt(-2 * sigma * sigma *
+                (std::log(0.1) * kDistanceExp - std::log(max_color)));
+  SplineSegment segment;
+  segment.center_y = center.y;
+  segment.center_x = center.x;
+  memcpy(segment.color, color, sizeof(segment.color));
+  segment.inv_sigma = 1.0f / sigma;
+  segment.sigma_over_4_times_intensity = .25f * sigma * intensity;
+  segment.maximum_distance = maximum_distance;
+  ssize_t y0 = center.y - maximum_distance + .5f;
+  ssize_t y1 = center.y + maximum_distance + 1.5f;  // one-past-the-end
+  for (ssize_t y = std::max<ssize_t>(y0, 0); y < y1; y++) {
+    segments_by_y.emplace_back(y, segments.size());  // (row, segment index).
+  }
+  segments.push_back(segment);  // Lands at the index recorded above.
+}
+
+void DrawSegments(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                  float* JXL_RESTRICT row_b, const Rect& image_rect,
+                  const bool add, const SplineSegment* segments,
+                  const size_t* segment_indices,
+                  const size_t* segment_y_start) {
+  JXL_ASSERT(image_rect.ysize() == 1);  // Exactly one image row per call.
+  float* JXL_RESTRICT rows[3] = {row_x - image_rect.x0(),  // Rebased so that
+                                 row_y - image_rect.x0(),  // rows[c] + x uses
+                                 row_b - image_rect.x0()};  // image-space x.
+  size_t y = image_rect.y0();
+  for (size_t i = segment_y_start[y]; i < segment_y_start[y + 1]; i++) {
+    DrawSegment(segments[segment_indices[i]], add, y, image_rect.x0(),
+                image_rect.x0() + image_rect.xsize(), rows);
+  }
+}
+
+void SegmentsFromPoints(  // Appends one segment per entry of points_to_draw.
+    const Spline& spline,
+    const std::vector<std::pair<Spline::Point, float>>& points_to_draw,
+    const float arc_length, std::vector<SplineSegment>& segments,
+    std::vector<std::pair<size_t, size_t>>& segments_by_y) {
+  const float inv_arc_length = 1.0f / arc_length;
+  int k = 0;  // Index of the current point along the spline.
+  for (const auto& point_to_draw : points_to_draw) {
+    const Spline::Point& point = point_to_draw.first;
+    const float multiplier = point_to_draw.second;
+    const float progress_along_arc =  // Fraction of the arc covered, <= 1.
+        std::min(1.f, (k * kDesiredRenderingDistance) * inv_arc_length);
+    ++k;
+    float color[3];
+    for (size_t c = 0; c < 3; ++c) {
+      color[c] =  // Channel c evaluated at position 31 * progress.
+          ContinuousIDCT(spline.color_dct[c], (32 - 1) * progress_along_arc);
+    }
+    const float sigma =
+        ContinuousIDCT(spline.sigma_dct, (32 - 1) * progress_along_arc);
+    ComputeSegments(point, multiplier, color, sigma, segments, segments_by_y);
+  }
+}
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(SegmentsFromPoints);
+HWY_EXPORT(DrawSegments);
+
+namespace {
+
+// This is not in the spec, but it is a reasonable limit to avoid overflows.
+template <typename T>
+Status ValidateSplinePointPos(const T& x, const T& y) {
+  constexpr T kSplinePosLimit = 1u << 23;  // Both must be strictly inside +/-.
+  if ((x >= kSplinePosLimit) || (x <= -kSplinePosLimit) ||
+      (y >= kSplinePosLimit) || (y <= -kSplinePosLimit)) {
+    return JXL_FAILURE("Spline coordinates out of bounds");
+  }
+  return true;
+}
+
+// Maximum number of spline control points per frame is
+//   std::min(kMaxNumControlPoints, xsize * ysize / 2)
+constexpr size_t kMaxNumControlPoints = 1u << 20u;  // Absolute cap.
+constexpr size_t kMaxNumControlPointsPerPixelRatio = 2;  // Pixels per point.
+
+float AdjustedQuant(const int32_t adjustment) {  // Quantization multiplier.
+  return (adjustment >= 0) ? (1.f + .125f * adjustment)  // 1 + a / 8
+                           : 1.f / (1.f - .125f * adjustment);  // 1 / (1 - a / 8)
+}
+
+float InvAdjustedQuant(const int32_t adjustment) {  // 1 / AdjustedQuant(a).
+  return (adjustment >= 0) ? 1.f / (1.f + .125f * adjustment)  // 1 / (1 + a / 8)
+                           : (1.f - .125f * adjustment);  // 1 - a / 8
+}
+
+// Per-channel (de)quantization weights, indexed as X, Y, B, sigma.
+static constexpr float kChannelWeight[] = {0.0042f, 0.075f, 0.07f, .3333f};
+
+Status DecodeAllStartingPoints(std::vector<Spline::Point>* const points,
+                               BitReader* const br, ANSSymbolReader* reader,
+                               const std::vector<uint8_t>& context_map,
+                               const size_t num_splines) {
+  points->clear();  // Output is rebuilt from scratch.
+  points->reserve(num_splines);
+  int64_t last_x = 0;
+  int64_t last_y = 0;
+  for (size_t i = 0; i < num_splines; i++) {
+    int64_t x =
+        reader->ReadHybridUint(kStartingPositionContext, br, context_map);
+    int64_t y =
+        reader->ReadHybridUint(kStartingPositionContext, br, context_map);
+    if (i != 0) {  // Only the first point is absolute; the rest are deltas.
+      x = UnpackSigned(x) + last_x;
+      y = UnpackSigned(y) + last_y;
+    }
+    JXL_RETURN_IF_ERROR(ValidateSplinePointPos(x, y));
+    points->emplace_back(static_cast<float>(x), static_cast<float>(y));
+    last_x = x;
+    last_y = y;
+  }
+  return true;
+}
+
+struct Vector {  // 2D displacement between two Spline::Point values.
+  float x, y;
+  Vector operator-() const { return {-x, -y}; }
+  Vector operator+(const Vector& other) const {
+    return {x + other.x, y + other.y};
+  }
+  float SquaredNorm() const { return x * x + y * y; }  // Squared length.
+};
+Vector operator*(const float k, const Vector& vec) {  // Scale by k.
+  return {k * vec.x, k * vec.y};
+}
+
+Spline::Point operator+(const Spline::Point& p, const Vector& vec) {
+  return {p.x + vec.x, p.y + vec.y};  // Point translated by vec.
+}
+Vector operator-(const Spline::Point& a, const Spline::Point& b) {
+  return {a.x - b.x, a.y - b.y};  // Displacement from b to a.
+}
+
+// TODO(eustas): avoid making a copy of "points".
+void DrawCentripetalCatmullRomSpline(std::vector<Spline::Point> points,
+                                     std::vector<Spline::Point>& result) {
+  if (points.empty()) return;
+  if (points.size() == 1) {  // A single control point is emitted as is.
+    result.push_back(points[0]);
+    return;
+  }
+  // Number of points to compute between each control point.
+  static constexpr int kNumPoints = 16;
+  result.reserve((points.size() - 1) * kNumPoints + 1);
+  points.insert(points.begin(), points[0] + (points[0] - points[1]));
+  points.push_back(points[points.size() - 1] +  // Mirrored point at each end.
+                   (points[points.size() - 1] - points[points.size() - 2]));
+  // points has at least 4 elements at this point.
+  for (size_t start = 0; start < points.size() - 3; ++start) {
+    // 4 of them are used, and we draw from p[1] to p[2].
+    const Spline::Point* const p = &points[start];
+    result.push_back(p[1]);
+    float d[3];  // d[k]: square root of the distance from p[k] to p[k + 1].
+    float t[4];  // t[k]: running sum of d[0..k-1].
+    t[0] = 0;
+    for (int k = 0; k < 3; ++k) {
+      // TODO(eustas): for each segment delta is calculated 3 times...
+      // TODO(eustas): restrict d[k] with reasonable limit and spec it.
+      d[k] = std::sqrt(hypotf(p[k + 1].x - p[k].x, p[k + 1].y - p[k].y));
+      t[k + 1] = t[k] + d[k];
+    }
+    for (int i = 1; i < kNumPoints; ++i) {
+      const float tt = d[0] + (static_cast<float>(i) / kNumPoints) * d[1];
+      Spline::Point a[3];  // First level: interpolate along p[k] -> p[k + 1].
+      for (int k = 0; k < 3; ++k) {
+        // TODO(eustas): reciprocal multiplication would be faster.
+        a[k] = p[k] + ((tt - t[k]) / d[k]) * (p[k + 1] - p[k]);
+      }
+      Spline::Point b[2];  // Second level: interpolate along a[k] -> a[k + 1].
+      for (int k = 0; k < 2; ++k) {
+        b[k] = a[k] + ((tt - t[k]) / (d[k] + d[k + 1])) * (a[k + 1] - a[k]);
+      }
+      result.push_back(b[0] + ((tt - t[1]) / d[1]) * (b[1] - b[0]));
+    }
+  }
+  result.push_back(points[points.size() - 2]);  // The last real control point.
+}
+
+// Move along the line segments defined by `points`, `kDesiredRenderingDistance`
+// pixels at a time, and call `functor` with each point and the actual distance
+// to the previous point (which will always be kDesiredRenderingDistance except
+// possibly for the very last point).
+// TODO(eustas): this method always adds the last point, but never the first
+//               (unless those are one); I believe both ends matter.
+template <typename Points, typename Functor>
+void ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
+  JXL_ASSERT(!points.empty());
+  Spline::Point current = points.front();
+  functor(current, kDesiredRenderingDistance);
+  auto next = points.begin();
+  while (next != points.end()) {
+    const Spline::Point* previous = &current;  // Restart from the last output.
+    float arclength_from_previous = 0.f;  // Length walked since that output.
+    for (;;) {
+      if (next == points.end()) {  // Input exhausted: emit what was reached.
+        functor(*previous, arclength_from_previous);
+        return;
+      }
+      const float arclength_to_next =
+          std::sqrt((*next - *previous).SquaredNorm());
+      if (arclength_from_previous + arclength_to_next >=
+          kDesiredRenderingDistance) {
+        current =  // `previous` may alias `current`; the RHS is computed first.
+            *previous + ((kDesiredRenderingDistance - arclength_from_previous) /
+                         arclength_to_next) *
+                            (*next - *previous);
+        functor(current, kDesiredRenderingDistance);
+        break;
+      }
+      arclength_from_previous += arclength_to_next;
+      previous = &*next;  // Not far enough yet: advance to the next input.
+      ++next;
+    }
+  }
+}
+
+}  // namespace
+
+QuantizedSpline::QuantizedSpline(const Spline& original,
+                                 const int32_t quantization_adjustment,
+                                 const float y_to_x, const float y_to_b) {
+  JXL_ASSERT(!original.control_points.empty());
+  control_points_.reserve(original.control_points.size() - 1);
+  const Spline::Point& starting_point = original.control_points.front();
+  int previous_x = static_cast<int>(roundf(starting_point.x)),
+      previous_y = static_cast<int>(roundf(starting_point.y));
+  int previous_delta_x = 0, previous_delta_y = 0;
+  for (auto it = original.control_points.begin() + 1;
+       it != original.control_points.end(); ++it) {
+    const int new_x = static_cast<int>(roundf(it->x));
+    const int new_y = static_cast<int>(roundf(it->y));
+    const int new_delta_x = new_x - previous_x;
+    const int new_delta_y = new_y - previous_y;
+    control_points_.emplace_back(new_delta_x - previous_delta_x,  // Stores the
+                                 new_delta_y - previous_delta_y);  // delta-delta.
+    previous_delta_x = new_delta_x;
+    previous_delta_y = new_delta_y;
+    previous_x = new_x;
+    previous_y = new_y;
+  }
+  // The starting point itself is not stored in control_points_.
+  const auto to_int = [](float v) -> int {
+    return static_cast<int>(roundf(v));
+  };
+  // Y (c == 1) goes first: X and B subtract a multiple of the restored Y.
+  const auto quant = AdjustedQuant(quantization_adjustment);
+  const auto inv_quant = InvAdjustedQuant(quantization_adjustment);
+  for (int c : {1, 0, 2}) {
+    float factor = (c == 0) ? y_to_x : (c == 1) ? 0 : y_to_b;
+    for (int i = 0; i < 32; ++i) {
+      const float dct_factor = (i == 0) ? kSqrt2 : 1.0f;
+      const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
+      auto restored_y =  // Y as dequantization computes it from color_dct_[1].
+          color_dct_[1][i] * inv_dct_factor * kChannelWeight[1] * inv_quant;
+      auto decorellated = original.color_dct[c][i] - factor * restored_y;
+      color_dct_[c][i] =
+          to_int(decorellated * dct_factor * quant / kChannelWeight[c]);
+    }
+  }
+  for (int i = 0; i < 32; ++i) {  // Sigma uses the weight at index 3.
+    const float dct_factor = (i == 0) ? kSqrt2 : 1.0f;
+    sigma_dct_[i] =
+        to_int(original.sigma_dct[i] * dct_factor * quant / kChannelWeight[3]);
+  }
+}
+
+Status QuantizedSpline::Dequantize(const Spline::Point& starting_point,
+                                   const int32_t quantization_adjustment,
+                                   const float y_to_x, const float y_to_b,
+                                   const uint64_t image_size,
+                                   uint64_t* total_estimated_area_reached,
+                                   Spline& result) const {
+  result.control_points.clear();  // `result` is rebuilt from scratch.
+  result.control_points.reserve(control_points_.size() + 1);
+  float px = roundf(starting_point.x);
+  float py = roundf(starting_point.y);
+  JXL_RETURN_IF_ERROR(ValidateSplinePointPos(px, py));
+  int current_x = static_cast<int>(px);
+  int current_y = static_cast<int>(py);
+  result.control_points.push_back(Spline::Point{static_cast<float>(current_x),
+                                                static_cast<float>(current_y)});
+  int current_delta_x = 0, current_delta_y = 0;
+  size_t manhattan_distance = 0;  // Sum of |dx| + |dy| over all steps.
+  for (const auto& point : control_points_) {  // Undo the double delta coding.
+    current_delta_x += point.first;
+    current_delta_y += point.second;
+    manhattan_distance += abs(current_delta_x) + abs(current_delta_y);
+    JXL_RETURN_IF_ERROR(
+        ValidateSplinePointPos(current_delta_x, current_delta_y));
+    current_x += current_delta_x;
+    current_y += current_delta_y;
+    JXL_RETURN_IF_ERROR(ValidateSplinePointPos(current_x, current_y));
+    result.control_points.push_back(Spline::Point{
+        static_cast<float>(current_x), static_cast<float>(current_y)});
+  }
+  // Dequantize the color DCTs, then add back the Y-correlated part of X and B.
+  const auto inv_quant = InvAdjustedQuant(quantization_adjustment);
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < 32; ++i) {
+      const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
+      result.color_dct[c][i] =
+          color_dct_[c][i] * inv_dct_factor * kChannelWeight[c] * inv_quant;
+    }
+  }
+  for (int i = 0; i < 32; ++i) {
+    result.color_dct[0][i] += y_to_x * result.color_dct[1][i];
+    result.color_dct[2][i] += y_to_b * result.color_dct[1][i];
+  }
+  uint64_t width_estimate = 0;
+  // Integer upper bounds on the per-channel color magnitude.
+  uint64_t color[3] = {};
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < 32; ++i) {
+      color[c] +=
+          static_cast<uint64_t>(ceil(inv_quant * std::abs(color_dct_[c][i])));
+    }
+  }
+  color[0] += static_cast<uint64_t>(ceil(std::abs(y_to_x))) * color[1];
+  color[2] += static_cast<uint64_t>(ceil(std::abs(y_to_b))) * color[1];
+  // std::abs (not abs) above, so the float overload is used for the ratios.
+  // kChannelWeight is ignored here; up to constant factors this still
+  // indicates how much the color values enlarge the area to be rendered.
+  uint64_t logcolor = std::max(
+      uint64_t(1),
+      static_cast<uint64_t>(CeilLog2Nonzero(
+          uint64_t(1) + std::max(color[1], std::max(color[0], color[2])))));
+
+  for (int i = 0; i < 32; ++i) {
+    const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
+    result.sigma_dct[i] =
+        sigma_dct_[i] * inv_dct_factor * kChannelWeight[3] * inv_quant;
+    // If we include the factor kChannelWeight[3]=.3333f here, we get a
+    // realistic area estimate. We leave it out to simplify the calculations,
+    // and understand that this way we underestimate the area by a factor of
+    // 1/(0.3333*0.3333). This is taken into account in the limits below.
+    uint64_t weight = std::max(
+        uint64_t(1),
+        static_cast<uint64_t>(ceil(inv_quant * std::abs(sigma_dct_[i]))));
+    width_estimate += weight * weight * logcolor;
+  }
+  *total_estimated_area_reached += (width_estimate * manhattan_distance);
+  if (*total_estimated_area_reached >  // Running total across all splines.
+      std::min((1024 * image_size + (uint64_t(1) << 32)),
+               (uint64_t(1) << 42))) {
+    return JXL_FAILURE("Too large total_estimated_area_reached: %" PRIu64,
+                       *total_estimated_area_reached);
+  }
+
+  return true;
+}
+
+Status QuantizedSpline::Decode(const std::vector<uint8_t>& context_map,
+                               ANSSymbolReader* const decoder,
+                               BitReader* const br,
+                               const size_t max_control_points,
+                               size_t* total_num_control_points) {
+  const size_t num_control_points =
+      decoder->ReadHybridUint(kNumControlPointsContext, br, context_map);
+  *total_num_control_points += num_control_points;  // Running total, checked
+  if (*total_num_control_points > max_control_points) {  // before allocating.
+    return JXL_FAILURE("Too many control points: %" PRIuS,
+                       *total_num_control_points);
+  }
+  control_points_.resize(num_control_points);
+  // Maximal image dimension.
+  constexpr int64_t kDeltaLimit = 1u << 30;
+  for (std::pair<int64_t, int64_t>& control_point : control_points_) {
+    control_point.first = UnpackSigned(
+        decoder->ReadHybridUint(kControlPointsContext, br, context_map));
+    control_point.second = UnpackSigned(
+        decoder->ReadHybridUint(kControlPointsContext, br, context_map));
+    // Check delta-deltas are not outrageous; it is not in spec, but there is
+    // no reason to allow larger values.
+    if ((control_point.first >= kDeltaLimit) ||
+        (control_point.first <= -kDeltaLimit) ||
+        (control_point.second >= kDeltaLimit) ||
+        (control_point.second <= -kDeltaLimit)) {
+      return JXL_FAILURE("Spline delta-delta is out of bounds");
+    }
+  }
+  // Reads 32 signed coefficients into `dct`.
+  const auto decode_dct = [decoder, br, &context_map](int dct[32]) -> Status {
+    for (int i = 0; i < 32; ++i) {
+      dct[i] =
+          UnpackSigned(decoder->ReadHybridUint(kDCTContext, br, context_map));
+    }
+    return true;
+  };
+  for (int c = 0; c < 3; ++c) {  // The three color channels, then sigma.
+    JXL_RETURN_IF_ERROR(decode_dct(color_dct_[c]));
+  }
+  JXL_RETURN_IF_ERROR(decode_dct(sigma_dct_));
+  return true;
+}
+
+void Splines::Clear() {  // Drops the decoded splines and the draw cache.
+  quantization_adjustment_ = 0;
+  splines_.clear();
+  starting_points_.clear();
+  segments_.clear();  // The remaining three are filled by InitializeDrawCache.
+  segment_indices_.clear();
+  segment_y_start_.clear();
+}
+
+Status Splines::Decode(jxl::BitReader* br, const size_t num_pixels) {
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kNumSplineContexts, &code, &context_map));
+  ANSSymbolReader decoder(&code, br);
+  const size_t num_splines =  // Stored minus one, so at least one spline.
+      1 + decoder.ReadHybridUint(kNumSplinesContext, br, context_map);
+  size_t max_control_points = std::min(
+      kMaxNumControlPoints, num_pixels / kMaxNumControlPointsPerPixelRatio);
+  if (num_splines > max_control_points) {
+    return JXL_FAILURE("Too many splines: %" PRIuS, num_splines);
+  }
+  JXL_RETURN_IF_ERROR(DecodeAllStartingPoints(&starting_points_, br, &decoder,
+                                              context_map, num_splines));
+
+  quantization_adjustment_ = UnpackSigned(
+      decoder.ReadHybridUint(kQuantizationAdjustmentContext, br, context_map));
+
+  splines_.clear();
+  splines_.reserve(num_splines);
+  size_t num_control_points = num_splines;  // Starting points count too.
+  for (size_t i = 0; i < num_splines; ++i) {
+    QuantizedSpline spline;
+    JXL_RETURN_IF_ERROR(spline.Decode(context_map, &decoder, br,
+                                      max_control_points, &num_control_points));
+    splines_.push_back(std::move(spline));
+  }
+
+  JXL_RETURN_IF_ERROR(decoder.CheckANSFinalState());
+
+  if (!HasAny()) {
+    return JXL_FAILURE("Decoded splines but got none");
+  }
+
+  return true;
+}
+
+void Splines::AddTo(Image3F* const opsin, const Rect& opsin_rect,
+                    const Rect& image_rect) const {  // Apply() with add = true.
+  return Apply</*add=*/true>(opsin, opsin_rect, image_rect);
+}
+void Splines::AddToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                       float* JXL_RESTRICT row_b, const Rect& image_row) const {
+  return ApplyToRow</*add=*/true>(row_x, row_y, row_b, image_row);  // One row.
+}
+
+void Splines::SubtractFrom(Image3F* const opsin) const {  // Whole image.
+  return Apply</*add=*/false>(opsin, Rect(*opsin), Rect(*opsin));
+}
+
+Status Splines::InitializeDrawCache(const size_t image_xsize,
+                                    const size_t image_ysize,
+                                    const ColorCorrelationMap& cmap) {
+  // TODO(veluca): avoid storing segments that are entirely outside image
+  // boundaries.
+  segments_.clear();
+  segment_indices_.clear();
+  segment_y_start_.clear();
+  std::vector<std::pair<size_t, size_t>> segments_by_y;  // (row, segment).
+  std::vector<Spline::Point> intermediate_points;  // Reused for every spline.
+  uint64_t total_estimated_area_reached = 0;
+  std::vector<Spline> splines;
+  for (size_t i = 0; i < splines_.size(); ++i) {
+    Spline spline;
+    JXL_RETURN_IF_ERROR(splines_[i].Dequantize(
+        starting_points_[i], quantization_adjustment_, cmap.YtoXRatio(0),
+        cmap.YtoBRatio(0), image_xsize * image_ysize,
+        &total_estimated_area_reached, spline));
+    if (std::adjacent_find(spline.control_points.begin(),
+                           spline.control_points.end()) !=
+        spline.control_points.end()) {
+      // Otherwise division by zero might occur. Once control points coincide,
+      // the direction of curve is undefined...
+      return JXL_FAILURE(
+          "identical successive control points in spline %" PRIuS, i);
+    }
+    splines.push_back(spline);
+  }
+  // TODO(firsching) Change this into a JXL_FAILURE for level 5 codestreams.
+  if (total_estimated_area_reached >
+      std::min((8 * image_xsize * image_ysize + (uint64_t(1) << 25)),
+               (uint64_t(1) << 30))) {
+    JXL_WARNING(
+        "Large total_estimated_area_reached, expect slower decoding: %" PRIu64,
+        total_estimated_area_reached);
+  }
+  // Turn every spline into segments: curve points, then equal spacing.
+  for (Spline& spline : splines) {
+    std::vector<std::pair<Spline::Point, float>> points_to_draw;
+    auto add_point = [&](const Spline::Point& point, const float multiplier) {
+      points_to_draw.emplace_back(point, multiplier);
+    };
+    intermediate_points.clear();
+    DrawCentripetalCatmullRomSpline(spline.control_points, intermediate_points);
+    ForEachEquallySpacedPoint(intermediate_points, add_point);
+    const float arc_length =  // Full steps plus the length of the final step.
+        (points_to_draw.size() - 2) * kDesiredRenderingDistance +
+        points_to_draw.back().second;
+    if (arc_length <= 0.f) {
+      // This spline wouldn't have any effect.
+      continue;
+    }
+    HWY_DYNAMIC_DISPATCH(SegmentsFromPoints)
+    (spline, points_to_draw, arc_length, segments_, segments_by_y);
+  }
+  // Group segment indices by row; segment_y_start_ becomes a prefix sum.
+  // TODO(eustas): consider linear sorting here.
+  std::sort(segments_by_y.begin(), segments_by_y.end());
+  segment_indices_.resize(segments_by_y.size());
+  segment_y_start_.resize(image_ysize + 1);
+  for (size_t i = 0; i < segments_by_y.size(); i++) {
+    segment_indices_[i] = segments_by_y[i].second;
+    size_t y = segments_by_y[i].first;
+    if (y < image_ysize) {  // Rows past the image are not counted.
+      segment_y_start_[y + 1]++;
+    }
+  }
+  for (size_t y = 0; y < image_ysize; y++) {  // Counts -> start offsets.
+    segment_y_start_[y + 1] += segment_y_start_[y];
+  }
+  return true;
+}
+
+template <bool add>  // add: true adds the splines, false subtracts them.
+void Splines::ApplyToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                         float* JXL_RESTRICT row_b,
+                         const Rect& image_row) const {
+  if (segments_.empty()) return;  // Nothing cached to draw.
+  JXL_ASSERT(image_row.ysize() == 1);
+  for (size_t iy = 0; iy < image_row.ysize(); iy++) {  // One pass (see assert).
+    HWY_DYNAMIC_DISPATCH(DrawSegments)
+    (row_x, row_y, row_b, image_row.Line(iy), add, segments_.data(),
+     segment_indices_.data(), segment_y_start_.data());
+  }
+}
+
+template <bool add>  // add: true adds the splines, false subtracts them.
+void Splines::Apply(Image3F* const opsin, const Rect& opsin_rect,
+                    const Rect& image_rect) const {
+  if (segments_.empty()) return;  // Nothing cached to draw.
+  for (size_t iy = 0; iy < image_rect.ysize(); iy++) {
+    const size_t y0 = opsin_rect.Line(iy).y0();  // Row inside `opsin`.
+    const size_t x0 = opsin_rect.x0();  // First column inside `opsin`.
+    ApplyToRow<add>(opsin->PlaneRow(0, y0) + x0, opsin->PlaneRow(1, y0) + x0,
+                    opsin->PlaneRow(2, y0) + x0, image_rect.Line(iy));
+  }
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/splines.h b/third_party/jpeg-xl/lib/jxl/splines.h
new file mode 100644
index 0000000000..c8dad3417c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines.h
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SPLINES_H_
+#define LIB_JXL_SPLINES_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+static constexpr float kDesiredRenderingDistance = 1.f;  // Step along the arc.
+
+enum SplineEntropyContexts : size_t {  // Contexts passed to ReadHybridUint.
+  kQuantizationAdjustmentContext = 0,
+  kStartingPositionContext,
+  kNumSplinesContext,
+  kNumControlPointsContext,
+  kControlPointsContext,
+  kDCTContext,
+  kNumSplineContexts  // Number of contexts; not a context itself.
+};
+
+struct Spline {
+  struct Point {
+    Point() : x(0.0f), y(0.0f) {}
+    Point(float x, float y) : x(x), y(y) {}
+    float x, y;
+    bool operator==(const Point& other) const {  // Equal within 1e-3 per axis.
+      return std::fabs(x - other.x) < 1e-3f && std::fabs(y - other.y) < 1e-3f;
+    }
+  };
+  std::vector<Point> control_points;
+  // X, Y, B.
+  float color_dct[3][32];
+  // Splines are drawn by normalized Gaussian splatting. This controls the
+  // Gaussian's parameter along the spline.
+  float sigma_dct[32];
+};
+
+class QuantizedSplineEncoder;  // Befriended below; defined elsewhere.
+
+class QuantizedSpline {
+ public:
+  QuantizedSpline() = default;
+  explicit QuantizedSpline(const Spline& original,  // Quantizes `original`.
+                           int32_t quantization_adjustment, float y_to_x,
+                           float y_to_b);
+
+  Status Dequantize(const Spline::Point& starting_point,  // Fills `result`.
+                    int32_t quantization_adjustment, float y_to_x, float y_to_b,
+                    uint64_t image_size, uint64_t* total_estimated_area_reached,
+                    Spline& result) const;
+
+  Status Decode(const std::vector<uint8_t>& context_map,  // Reads from `br`.
+                ANSSymbolReader* decoder, BitReader* br,
+                size_t max_control_points, size_t* total_num_control_points);
+
+ private:
+  friend class QuantizedSplineEncoder;
+
+  std::vector<std::pair<int64_t, int64_t>>
+      control_points_;  // Double delta-encoded.
+  int color_dct_[3][32] = {};  // Quantized color coefficients (X, Y, B).
+  int sigma_dct_[32] = {};  // Quantized sigma coefficients.
+};
+
+// A single "drawable unit" of a spline, i.e. a line of the region in which we
+// render each Gaussian. The structure doesn't actually depend on the exact
+// row, which allows reuse for different y values (which are tracked
+// separately).
+struct SplineSegment {
+  float center_x, center_y;
+  float maximum_distance;  // Drawing is clipped to this distance from center.
+  float inv_sigma;  // 1 / sigma.
+  float sigma_over_4_times_intensity;  // 0.25 * sigma * intensity.
+  float color[3];  // Per-channel color multiplier.
+};
+
+class Splines {
+ public:
+  Splines() = default;
+  explicit Splines(const int32_t quantization_adjustment,
+                   std::vector<QuantizedSpline> splines,
+                   std::vector<Spline::Point> starting_points)
+      : quantization_adjustment_(quantization_adjustment),
+        splines_(std::move(splines)),
+        starting_points_(std::move(starting_points)) {}
+
+  bool HasAny() const { return !splines_.empty(); }
+
+  void Clear();
+
+  Status Decode(BitReader* br, size_t num_pixels);
+
+  void AddTo(Image3F* opsin, const Rect& opsin_rect,
+             const Rect& image_rect) const;
+  void AddToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                float* JXL_RESTRICT row_b, const Rect& image_row) const;
+  void SubtractFrom(Image3F* opsin) const;
+
+  const std::vector<QuantizedSpline>& QuantizedSplines() const {
+    return splines_;
+  }
+  const std::vector<Spline::Point>& StartingPoints() const {
+    return starting_points_;
+  }
+
+  int32_t GetQuantizationAdjustment() const { return quantization_adjustment_; }
+  // Builds segments_, segment_indices_ and segment_y_start_ for drawing.
+  Status InitializeDrawCache(size_t image_xsize, size_t image_ysize,
+                             const ColorCorrelationMap& cmap);
+
+ private:
+  template <bool>
+  void ApplyToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                  float* JXL_RESTRICT row_b, const Rect& image_row) const;
+  template <bool>
+  void Apply(Image3F* opsin, const Rect& opsin_rect,
+             const Rect& image_rect) const;
+
+  // If positive, quantization weights are multiplied by 1 + this/8, which
+  // increases precision. If negative, they are divided by 1 - this/8. If 0,
+  // they are unchanged.
+  int32_t quantization_adjustment_ = 0;
+  std::vector<QuantizedSpline> splines_;
+  std::vector<Spline::Point> starting_points_;  // One per entry of splines_.
+  std::vector<SplineSegment> segments_;  // Draw cache: all segments.
+  std::vector<size_t> segment_indices_;  // Indices into segments_, by row.
+  std::vector<size_t> segment_y_start_;  // Row y: [start[y], start[y + 1]).
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/splines_gbench.cc b/third_party/jpeg-xl/lib/jxl/splines_gbench.cc
new file mode 100644
index 0000000000..78ff6d41c0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines_gbench.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+namespace {
+
+constexpr int kQuantizationAdjustment = 0;
+const ColorCorrelationMap* const cmap = new ColorCorrelationMap;
+const float kYToX = cmap->YtoXRatio(0);
+const float kYToB = cmap->YtoBRatio(0);
+
+void BM_Splines(benchmark::State& state) {
+  const size_t n = state.range();
+  // One six-point spline, quantized once before the timed loop starts.
+  std::vector<Spline> spline_data = {
+      {/*control_points=*/{
+           {9, 54}, {118, 159}, {97, 3}, {10, 40}, {150, 25}, {120, 300}},
+       /*color_dct=*/
+       {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}},
+       /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  // The splines are added into the same 320x320 image on every pass.
+  Image3F drawing_area(320, 320);
+  ZeroFillImage(&drawing_area);
+  for (auto _ : state) {
+    for (size_t i = 0; i < n; ++i) {
+      JXL_CHECK(splines.InitializeDrawCache(drawing_area.xsize(),
+                                            drawing_area.ysize(), *cmap));
+      splines.AddTo(&drawing_area, Rect(drawing_area), Rect(drawing_area));
+    }
+  }
+  // Each benchmark iteration performed n cache-init + draw rounds.
+  state.SetItemsProcessed(n * state.iterations());
+}
+
+BENCHMARK(BM_Splines)->Range(1, 1 << 10);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/splines_test.cc b/third_party/jpeg-xl/lib/jxl/splines_test.cc
new file mode 100644
index 0000000000..8d6bc7ed1c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines_test.cc
@@ -0,0 +1,348 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/splines.h"
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+std::ostream& operator<<(std::ostream& os, const Spline::Point& p) {
+  return os << "(" << p.x << ", " << p.y << ")";
+}
+
+std::ostream& operator<<(std::ostream& os, const Spline& spline) {
+  return os << "(spline with " << spline.control_points.size()
+            << " control points)";
+}
+
+namespace {
+
+using ::testing::AllOf;
+using ::testing::Field;
+using ::testing::FloatNear;
+using ::testing::Pointwise;
+
+constexpr int kQuantizationAdjustment = 0;
+const ColorCorrelationMap* const cmap = new ColorCorrelationMap;
+const float kYToX = cmap->YtoXRatio(0);
+const float kYToB = cmap->YtoBRatio(0);
+
+constexpr float kTolerance = 0.003125;
+
+std::vector<Spline> DequantizeSplines(const Splines& splines) {
+  const std::vector<QuantizedSpline>& quantized = splines.QuantizedSplines();
+  const std::vector<Spline::Point>& starts = splines.StartingPoints();
+  JXL_CHECK(quantized.size() == starts.size());
+  // Grow the result by one default spline, then dequantize straight into it.
+  uint64_t total = 0;
+  std::vector<Spline> result;
+  for (size_t idx = 0; idx < quantized.size(); ++idx) {
+    result.emplace_back();
+    JXL_CHECK(quantized[idx].Dequantize(
+        starts[idx], kQuantizationAdjustment, kYToX, kYToB, 2u << 30u, &total,
+        result.back()));
+  }
+  return result;
+}
+
+MATCHER(ControlPointIs, "") {
+  const Spline::Point& actual = std::get<0>(arg);
+  const Spline::Point& expected = std::get<1>(arg);
+  return testing::ExplainMatchResult(
+      AllOf(Field(&Spline::Point::x, FloatNear(expected.x, kTolerance)),
+            Field(&Spline::Point::y, FloatNear(expected.y, kTolerance))),
+      actual, result_listener);
+}
+
+MATCHER(ControlPointsMatch, "") {
+  const Spline& actual = std::get<0>(arg);
+  const Spline& expected = std::get<1>(arg);
+  return testing::ExplainMatchResult(
+      Field(&Spline::control_points,
+            Pointwise(ControlPointIs(), expected.control_points)),
+      actual, result_listener);
+}
+
+MATCHER(SplinesMatch, "") {
+  const Spline& actual = std::get<0>(arg);
+  const Spline& expected = std::get<1>(arg);
+  if (!testing::ExplainMatchResult(ControlPointsMatch(), arg,
+                                   result_listener)) {
+    return false;
+  }
+  for (int i = 0; i < 3; ++i) {
+    size_t color_dct_size =
+        sizeof(expected.color_dct[i]) / sizeof(expected.color_dct[i][0]);
+    for (size_t j = 0; j < color_dct_size; j++) {
+      testing::StringMatchResultListener color_dct_listener;
+      if (!testing::ExplainMatchResult(
+              FloatNear(expected.color_dct[i][j], kTolerance),
+              actual.color_dct[i][j], &color_dct_listener)) {
+        *result_listener << ", where color_dct[" << i << "][" << j
+                         << "] don't match, " << color_dct_listener.str();
+        return false;
+      }
+    }
+  }
+  size_t sigma_dct_size =
+      sizeof(expected.sigma_dct) / sizeof(expected.sigma_dct[0]);
+  for (size_t i = 0; i < sigma_dct_size; i++) {
+    testing::StringMatchResultListener sigma_listener;
+    if (!testing::ExplainMatchResult(
+            FloatNear(expected.sigma_dct[i], kTolerance), actual.sigma_dct[i],
+            &sigma_listener)) {
+      *result_listener << ", where sigma_dct[" << i << "] don't match, "
+                       << sigma_listener.str();
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+TEST(SplinesTest, Serialization) {
+  std::vector<Spline> spline_data = {
+      {/*control_points=*/{
+           {109, 54}, {218, 159}, {80, 3}, {110, 274}, {94, 185}, {17, 277}},
+       /*color_dct=*/
+       {{36.3, 39.7, 23.2, 67.5, 4.4,  71.5, 62.3, 32.3, 92.2, 10.1, 10.8,
+         9.2,  6.1,  10.5, 79.1, 7,    24.6, 90.8, 5.5,  84,   43.8, 49,
+         33.5, 78.9, 54.5, 77.9, 62.1, 51.4, 36.4, 14.3, 83.7, 35.4},
+        {9.4,  53.4, 9.5,  74.9, 72.7, 26.7, 7.9,  0.9, 84.9, 23.2, 26.5,
+         31.1, 91,   11.7, 74.1, 39.3, 23.7, 82.5, 4.8, 2.7,  61.2, 96.4,
+         13.7, 66.7, 62.9, 82.4, 5.9,  98.7, 21.5, 7.9, 51.7, 63.1},
+        {48,   39.3, 6.9,  26.3, 33.3, 6.2,  1.7,  98.9, 59.9, 59.6, 95,
+         61.3, 82.7, 53,   6.1,  30.4, 34.7, 96.9, 93.4, 17,   38.8, 80.8,
+         63,   18.6, 43.6, 32.3, 61,   20.2, 24.3, 28.3, 69.1, 62.4}},
+       /*sigma_dct=*/{32.7, 21.5, 44.4, 1.8,  45.8, 90.6, 29.3, 59.2,
+                      23.7, 85.2, 84.8, 27.2, 42.1, 84.1, 50.6, 17.6,
+                      93.7, 4.9,  2.6,  69.8, 94.9, 52,   24.3, 18.8,
+                      12.1, 95.7, 28.5, 81.4, 89.9, 31.4, 74.8, 52}},
+      {/*control_points=*/{{172, 309},
+                           {196, 277},
+                           {42, 238},
+                           {114, 350},
+                           {307, 290},
+                           {316, 269},
+                           {124, 66},
+                           {233, 267}},
+       /*color_dct=*/
+       {{15,   28.9, 22, 6.6,  41.8, 83,   8.6,  56.8, 68.9, 9.7,  5.4,
+         19.8, 70.8, 90, 52.5, 65.2, 7.8,  23.5, 26.4, 72.2, 64.7, 87.1,
+         1.3,  67.5, 46, 68.4, 65.4, 35.5, 29.1, 13,   41.6, 23.9},
+        {47.7, 79.4, 62.7, 29.1, 96.8, 18.5, 17.6, 15.2, 80.5, 56,  96.2,
+         59.9, 26.7, 96.1, 92.3, 42.1, 35.8, 54,   23.2, 55,   76,  35.8,
+         58.4, 88.7, 2.4,  78.1, 95.6, 27.5, 6.6,  78.5, 24.1, 69.8},
+        {43.8, 96.5, 0.9,  95.1, 49.1, 71.2, 25.1, 33.6, 75.2, 95,  82.1,
+         19.7, 10.5, 44.9, 50,   93.3, 83.5, 99.5, 64.6, 54,   3.5, 99.7,
+         45.3, 82.1, 22.4, 37.9, 60,   32.2, 12.6, 4.6,  65.5, 96.4}},
+       /*sigma_dct=*/{72.5, 2.6,  41.7, 2.2,  39.7, 79.1, 69.6, 19.9,
+                      92.3, 71.5, 41.9, 62.1, 30,   49.4, 70.3, 45.3,
+                      62.5, 47.2, 46.7, 41.2, 90.8, 46.8, 91.2, 55,
+                      8.1,  69.6, 25.4, 84.7, 61.7, 27.6, 3.7,  46.9}},
+      {/*control_points=*/{{100, 186},
+                           {257, 97},
+                           {170, 49},
+                           {25, 169},
+                           {309, 104},
+                           {232, 237},
+                           {385, 101},
+                           {122, 168},
+                           {26, 300},
+                           {390, 88}},
+       /*color_dct=*/
+       {{16.9, 64.8, 4.2,  10.6, 23.5, 17,   79.3, 5.7,  60.4, 16.6, 94.9,
+         63.7, 87.6, 10.5, 3.8,  61.1, 22.9, 81.9, 80.4, 40.5, 45.9, 25.4,
+         39.8, 30,   50.2, 90.4, 27.9, 93.7, 65.1, 48.2, 22.3, 43.9},
+        {24.9, 66,   3.5,  90.2, 97.1, 15.8, 35.6, 0.6,  68,   39.6, 24.4,
+         85.9, 57.7, 77.6, 47.5, 67.9, 4.3,  5.4,  91.2, 58.5, 0.1,  52.2,
+         3.5,  47.8, 63.2, 43.5, 85.8, 35.8, 50.2, 35.9, 19.2, 48.2},
+        {82.8, 44.9, 76.4, 39.5, 94.1, 14.3, 89.8, 10,   10.5, 74.5, 56.3,
+         65.8, 7.8,  23.3, 52.8, 99.3, 56.8, 46,   76.7, 13.5, 67,   22.4,
+         29.9, 43.3, 70.3, 26,   74.3, 53.9, 62,   19.1, 49.3, 46.7}},
+       /*sigma_dct=*/{83.5, 1.7,  25.1, 18.7, 46.5, 75.3, 28,   62.3,
+                      50.3, 23.3, 85.6, 96,   45.8, 33.1, 33.4, 52.9,
+                      26.3, 58.5, 19.6, 70,   92.6, 22.5, 57,   21.6,
+                      76.8, 87.5, 22.9, 66.3, 35.7, 35.6, 56.8, 67.2}},
+  };
+
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  const std::vector<Spline> quantized_spline_data = DequantizeSplines(splines);
+  EXPECT_THAT(quantized_spline_data,
+              Pointwise(ControlPointsMatch(), spline_data));
+
+  BitWriter writer;
+  EncodeSplines(splines, &writer, kLayerSplines, HistogramParams(), nullptr);
+  writer.ZeroPadToByte();
+  const size_t bits_written = writer.BitsWritten();
+
+  printf("Wrote %" PRIuS " bits of splines.\n", bits_written);
+
+  BitReader reader(writer.GetSpan());
+  Splines decoded_splines;
+  ASSERT_TRUE(decoded_splines.Decode(&reader, /*num_pixels=*/1000));
+  ASSERT_TRUE(reader.JumpToByteBoundary());
+  EXPECT_EQ(reader.TotalBitsConsumed(), bits_written);
+  ASSERT_TRUE(reader.Close());
+
+  const std::vector<Spline> decoded_spline_data =
+      DequantizeSplines(decoded_splines);
+  EXPECT_THAT(decoded_spline_data,
+              Pointwise(SplinesMatch(), quantized_spline_data));
+}
+
+#ifdef JXL_CRASH_ON_ERROR
+TEST(SplinesTest, DISABLED_TooManySplinesTest) {
+#else
+TEST(SplinesTest, TooManySplinesTest) {
+#endif
+  // This is more than the limit for 1000 pixels.
+  const size_t kNumSplines = 300;
+
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (size_t i = 0; i < kNumSplines; i++) {
+    Spline spline = {
+        /*control_points=*/{{1.f + i, 2}, {10.f + i, 25}, {30.f + i, 300}},
+        /*color_dct=*/
+        {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}},
+        /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}};
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  BitWriter writer;
+  EncodeSplines(splines, &writer, kLayerSplines,
+                HistogramParams(SpeedTier::kFalcon, 1), nullptr);
+  writer.ZeroPadToByte();
+  // Re-read splines.
+  BitReader reader(writer.GetSpan());
+  Splines decoded_splines;
+  EXPECT_FALSE(decoded_splines.Decode(&reader, /*num_pixels=*/1000));
+  EXPECT_TRUE(reader.Close());
+}
+
+#ifdef JXL_CRASH_ON_ERROR
+TEST(SplinesTest, DISABLED_DuplicatePoints) {
+#else
+TEST(SplinesTest, DuplicatePoints) {
+#endif
+  std::vector<Spline::Point> control_points{
+      {9, 54}, {118, 159}, {97, 3},  // Repeated.
+      {97, 3}, {10, 40},   {150, 25}, {120, 300}};
+  Spline spline{control_points,
+                /*color_dct=*/
+                {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}},
+                /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}};
+  std::vector<Spline> spline_data{spline};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  // With a repeated control point, building the draw cache must fail.
+  Image3F image(320, 320);
+  ZeroFillImage(&image);
+  EXPECT_FALSE(
+      splines.InitializeDrawCache(image.xsize(), image.ysize(), *cmap));
+}
+
+TEST(SplinesTest, Drawing) {
+  CodecInOut io_expected;
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/splines.pfm");
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_expected,
+                           /*pool=*/nullptr));
+
+  std::vector<Spline::Point> control_points{{9, 54},  {118, 159}, {97, 3},
+                                            {10, 40}, {150, 25},  {120, 300}};
+  // Use values that survive the quantization/decorrelation round trip.
+  const Spline spline{
+      control_points,
+      /*color_dct=*/
+      {{0.4989345073699951171875000f, 0.4997999966144561767578125f},
+       {0.4772970676422119140625000f, 0.f, 0.5250000357627868652343750f},
+       {-0.0176776945590972900390625f, 0.4900000095367431640625000f,
+        0.5250000357627868652343750f}},
+      /*sigma_dct=*/
+      {0.9427147507667541503906250f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+       0.6665999889373779296875000f}};
+  std::vector<Spline> spline_data = {spline};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  // Render the spline onto a zero-filled 320x320 canvas.
+  Image3F image(320, 320);
+  ZeroFillImage(&image);
+  ASSERT_TRUE(splines.InitializeDrawCache(image.xsize(), image.ysize(), *cmap));
+  splines.AddTo(&image, Rect(image), Rect(image));
+  // Bring the rendering into the reference image's color encoding.
+  CodecInOut io_actual;
+  io_actual.SetFromImage(CopyImage(image), ColorEncoding::SRGB());
+  ASSERT_TRUE(io_actual.frames[0].TransformTo(io_expected.Main().c_current(),
+                                              GetJxlCms()));
+  // The rendering has to agree with the stored reference within tolerance.
+  JXL_ASSERT_OK(VerifyRelativeError(
+      *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _));
+}
+
+TEST(SplinesTest, ClearedEveryFrame) {
+  CodecInOut io_expected;
+  const PaddedBytes bytes_expected =
+      jxl::test::ReadTestData("jxl/spline_on_first_frame.png");
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(bytes_expected), &io_expected,
+                           /*pool=*/nullptr));
+  CodecInOut io_actual;
+  const PaddedBytes bytes_actual =
+      jxl::test::ReadTestData("jxl/spline_on_first_frame.jxl");
+  ASSERT_TRUE(
+      test::DecodeFile({}, Span<const uint8_t>(bytes_actual), &io_actual));
+  // Convert to sRGB and clamp every sample to [0, 1] before comparing.
+  ASSERT_TRUE(
+      io_actual.frames[0].TransformTo(ColorEncoding::SRGB(), GetJxlCms()));
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < io_actual.ysize(); ++y) {
+      float* const JXL_RESTRICT row = io_actual.Main().color()->PlaneRow(c, y);
+      for (size_t x = 0; x < io_actual.xsize(); ++x) {
+        row[x] = Clamp1(row[x], 0.f, 1.f);
+      }
+    }
+  }
+  JXL_ASSERT_OK(VerifyRelativeError(
+      *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _));
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/test_image.cc b/third_party/jpeg-xl/lib/jxl/test_image.cc
new file mode 100644
index 0000000000..af1d1293ef
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_image.cc
@@ -0,0 +1,453 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/test_image.h"
+
+#include <jxl/encode.h>
+
+#include <algorithm>
+#include <cstring>
+#include <utility>
+
+#include "lib/extras/dec/color_description.h"
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+
+namespace jxl {
+namespace test {
+
+namespace {
+
+void StoreValue(float val, size_t bits_per_sample, JxlPixelFormat format,
+                uint8_t** out) {
+  // Scale factor: the largest integer code representable in this bit depth.
+  const float mul = (1u << bits_per_sample) - 1;
+  switch (format.data_type) {
+    case JXL_TYPE_UINT8:
+      **out = val * mul;
+      break;
+    case JXL_TYPE_UINT16: {
+      uint16_t uval = val * mul;
+      if (SwapEndianness(format.endianness)) uval = JXL_BSWAP16(uval);
+      memcpy(*out, &uval, 2);
+    } break;
+    case JXL_TYPE_FLOAT:
+      // TODO(szabadka) Add support for custom bits / exponent bits floats.
+      if (SwapEndianness(format.endianness)) val = BSwapFloat(val);
+      memcpy(*out, &val, 4);
+      break;
+    default: break;  // TODO(szabadka) Add support for FLOAT16.
+  }
+  *out += extras::PackedImage::BitsPerChannel(format.data_type) / 8;
+}
+
+void FillPackedImage(size_t bits_per_sample, uint16_t seed,
+                     extras::PackedImage* image) {
+  const size_t xsize = image->xsize;
+  const size_t ysize = image->ysize;
+  const JxlPixelFormat format = image->format;
+  // Fills `image` with a seeded gradient, a patterned disc and a noise rect.
+  // Cause more significant image difference for successive seeds.
+  Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rngu = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Returns random float in interval [0.0, max_value)
+  auto rngf = [&generator](float max_value) {
+    return generator.UniformF(0.0f, max_value);
+  };
+
+  // Dark background gradient color
+  float r0 = rngf(0.5f);
+  float g0 = rngf(0.5f);
+  float b0 = rngf(0.5f);
+  float a0 = rngf(0.5f);
+  float r1 = rngf(0.5f);
+  float g1 = rngf(0.5f);
+  float b1 = rngf(0.5f);
+  float a1 = rngf(0.5f);
+
+  // Circle with different color
+  size_t circle_x = rngu(xsize);
+  size_t circle_y = rngu(ysize);
+  size_t circle_r = rngu(std::min(xsize, ysize));
+
+  // Rectangle with random noise; order the corners so x0 <= x1 and y0 <= y1.
+  size_t rect_x0 = rngu(xsize);
+  size_t rect_y0 = rngu(ysize);
+  size_t rect_x1 = rngu(xsize);
+  size_t rect_y1 = rngu(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_x1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  uint8_t* out = reinterpret_cast<uint8_t*>(image->pixels());
+  const float imul16 = 1.0f / 65536.0f;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      float r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      float g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      float b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      float a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = std::min(1.0f, ((65535 - x * y) ^ seed) * imul16);
+        g = std::min(1.0f, ((x << 8) + y + seed) * imul16);
+        b = std::min(1.0f, ((y << 8) + x * seed) * imul16);
+        a = std::min(1.0f, (32768 + x * 256 - y) * imul16);
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rngf(1.0f);
+        g = rngf(1.0f);
+        b = rngf(1.0f);
+        a = rngf(1.0f);
+      }
+      if (format.num_channels == 1) {
+        StoreValue(g, bits_per_sample, format, &out);
+      } else if (format.num_channels == 2) {
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(a, bits_per_sample, format, &out);
+      } else if (format.num_channels == 3) {
+        StoreValue(r, bits_per_sample, format, &out);
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(b, bits_per_sample, format, &out);
+      } else if (format.num_channels == 4) {
+        StoreValue(r, bits_per_sample, format, &out);
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(b, bits_per_sample, format, &out);
+        StoreValue(a, bits_per_sample, format, &out);
+      }
+    }
+  }
+}
+
+}  // namespace
+
+std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                      size_t num_channels, uint16_t seed) {
+  // Cause more significant image difference for successive seeds.
+  Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rng = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Dark background gradient color
+  uint16_t r0 = rng(32768);
+  uint16_t g0 = rng(32768);
+  uint16_t b0 = rng(32768);
+  uint16_t a0 = rng(32768);
+  uint16_t r1 = rng(32768);
+  uint16_t g1 = rng(32768);
+  uint16_t b1 = rng(32768);
+  uint16_t a1 = rng(32768);
+
+  // Circle with different color
+  size_t circle_x = rng(xsize);
+  size_t circle_y = rng(ysize);
+  size_t circle_r = rng(std::min(xsize, ysize));
+
+  // Rectangle with random noise; order the corners so x0 <= x1 and y0 <= y1.
+  size_t rect_x0 = rng(xsize);
+  size_t rect_y0 = rng(ysize);
+  size_t rect_x1 = rng(xsize);
+  size_t rect_y1 = rng(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_x1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  size_t num_pixels = xsize * ysize;
+  // 16 bits per channel, big endian, num_channels interleaved channels
+  std::vector<uint8_t> pixels(num_pixels * num_channels * 2);
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      uint16_t a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = (65535 - x * y) ^ seed;
+        g = (x << 8) + y + seed;
+        b = (y << 8) + x * seed;
+        a = 32768 + x * 256 - y;
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rng(65536);
+        g = rng(65536);
+        b = rng(65536);
+        a = rng(65536);
+      }
+      size_t i = (y * xsize + x) * 2 * num_channels;
+      pixels[i + 0] = (r >> 8);
+      pixels[i + 1] = (r & 255);
+      if (num_channels >= 2) {
+        // This may store what is called 'g' in the alpha channel of a 2-channel
+        // image, but that's ok since the content is arbitrary
+        pixels[i + 2] = (g >> 8);
+        pixels[i + 3] = (g & 255);
+      }
+      if (num_channels >= 3) {
+        pixels[i + 4] = (b >> 8);
+        pixels[i + 5] = (b & 255);
+      }
+      if (num_channels >= 4) {
+        pixels[i + 6] = (a >> 8);
+        pixels[i + 7] = (a & 255);
+      }
+    }
+  }
+  return pixels;
+}
+
+TestImage::TestImage() {
+  SetChannels(3);
+  SetAllBitDepths(8);
+  SetColorEncoding("RGB_D65_SRG_Rel_SRG");
+}
+
+TestImage& TestImage::DecodeFromBytes(const PaddedBytes& bytes) {
+  ColorEncoding c_enc;
+  JXL_CHECK(
+      ConvertExternalToInternalColorEncoding(ppf_.color_encoding, &c_enc));
+  extras::ColorHints color_hints;
+  color_hints.Add("color_space", Description(c_enc));
+  JXL_CHECK(
+      extras::DecodeBytes(Span<const uint8_t>(bytes), color_hints, &ppf_));
+  return *this;
+}
+
+TestImage& TestImage::ClearMetadata() {
+  ppf_.metadata = extras::PackedMetadata();
+  return *this;
+}
+
+TestImage& TestImage::SetDimensions(size_t xsize, size_t ysize) {
+  // Shrinking crops every existing frame; otherwise the old size must be 0x0.
+  const bool shrinks = xsize <= ppf_.info.xsize && ysize <= ppf_.info.ysize;
+  if (!shrinks) {
+    JXL_CHECK(ppf_.info.xsize == 0 && ppf_.info.ysize == 0);
+  }
+  for (size_t f = 0; shrinks && f < ppf_.frames.size(); ++f) {
+    extras::PackedFrame& cur = ppf_.frames[f];
+    CropLayerInfo(xsize, ysize, &cur.frame_info.layer_info);
+    CropImage(xsize, ysize, &cur.color);
+    for (auto& extra : cur.extra_channels) CropImage(xsize, ysize, &extra);
+  }
+  ppf_.info.xsize = xsize;
+  ppf_.info.ysize = ysize;
+  return *this;
+}
+
+TestImage& TestImage::SetChannels(size_t num_channels) {
+  JXL_CHECK(ppf_.frames.empty());  // Only valid before any frame exists.
+  JXL_CHECK(!ppf_.preview_frame);  // ... and before a preview exists.
+  ppf_.info.num_color_channels = num_channels < 3 ? 1 : 3;  // Gray or RGB.
+  ppf_.info.num_extra_channels = num_channels - ppf_.info.num_color_channels;
+  if (ppf_.info.num_extra_channels > 0 && ppf_.info.alpha_bits == 0) {
+    ppf_.info.alpha_bits = ppf_.info.bits_per_sample;
+    ppf_.info.alpha_exponent_bits = ppf_.info.exponent_bits_per_sample;
+  }
+  ppf_.extra_channels_info.clear();  // Rebuilt by the loop below.
+  for (size_t i = 1; i < ppf_.info.num_extra_channels; ++i) {
+    extras::PackedExtraChannel ec;
+    ec.index = i;  // NOTE(review): index 0 is never added by this loop.
+    JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &ec.ec_info);
+    if (ec.ec_info.bits_per_sample == 0) {
+      ec.ec_info.bits_per_sample = ppf_.info.bits_per_sample;
+      ec.ec_info.exponent_bits_per_sample = ppf_.info.exponent_bits_per_sample;
+    }
+    ppf_.extra_channels_info.emplace_back(std::move(ec));
+  }
+  format_.num_channels = std::min(static_cast<size_t>(4), num_channels);
+  if (ppf_.info.num_color_channels == 1 &&
+      ppf_.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
+    SetColorEncoding("Gra_D65_Rel_SRG");  // Switch to a gray encoding.
+  }
+  return *this;
+}
+
+// Applies one bit depth (and exponent width) to the color samples, to alpha if
+// extra channels exist, and to every extra channel; refreshes the data type.
+TestImage& TestImage::SetAllBitDepths(uint32_t bits_per_sample,
+                                      uint32_t exponent_bits_per_sample) {
+  auto& info = ppf_.info;
+  info.bits_per_sample = bits_per_sample;
+  info.exponent_bits_per_sample = exponent_bits_per_sample;
+  const bool has_extra = info.num_extra_channels > 0;
+  if (has_extra) info.alpha_bits = bits_per_sample;
+  if (has_extra) info.alpha_exponent_bits = exponent_bits_per_sample;
+  for (extras::PackedExtraChannel& extra : ppf_.extra_channels_info) {
+    extra.ec_info.bits_per_sample = bits_per_sample;
+    extra.ec_info.exponent_bits_per_sample = exponent_bits_per_sample;
+  }
+  format_.data_type = DefaultDataType(info);
+  return *this;
+}
+
+TestImage& TestImage::SetDataType(JxlDataType data_type) {
+  format_.data_type = data_type;
+  return *this;
+}
+
+TestImage& TestImage::SetEndianness(JxlEndianness endianness) {
+  format_.endianness = endianness;
+  return *this;
+}
+
+TestImage& TestImage::SetColorEncoding(const std::string& description) {
+  JXL_CHECK(ParseDescription(description, &ppf_.color_encoding));
+  ColorEncoding c_enc;
+  JXL_CHECK(
+      ConvertExternalToInternalColorEncoding(ppf_.color_encoding, &c_enc));
+  JXL_CHECK(c_enc.CreateICC());
+  PaddedBytes icc = c_enc.ICC();
+  ppf_.icc.assign(icc.begin(), icc.end());
+  return *this;
+}
+
+TestImage& TestImage::CoalesceGIFAnimationWithAlpha() {
+  extras::PackedFrame canvas = ppf_.frames[0].Copy();  // Composite so far.
+  JXL_CHECK(canvas.color.format.num_channels == 3);
+  JXL_CHECK(canvas.color.format.data_type == JXL_TYPE_UINT8);
+  JXL_CHECK(canvas.extra_channels.size() == 1);
+  for (size_t i = 1; i < ppf_.frames.size(); i++) {
+    const extras::PackedFrame& frame = ppf_.frames[i];
+    JXL_CHECK(frame.extra_channels.size() == 1);
+    const JxlLayerInfo& layer_info = frame.frame_info.layer_info;
+    extras::PackedFrame rendered = canvas.Copy();
+    uint8_t* pixels_rendered =
+        reinterpret_cast<uint8_t*>(rendered.color.pixels());
+    const uint8_t* pixels_frame =
+        reinterpret_cast<const uint8_t*>(frame.color.pixels());
+    uint8_t* alpha_rendered =
+        reinterpret_cast<uint8_t*>(rendered.extra_channels[0].pixels());
+    const uint8_t* alpha_frame =
+        reinterpret_cast<const uint8_t*>(frame.extra_channels[0].pixels());
+    for (size_t y = 0; y < frame.color.ysize; y++) {
+      for (size_t x = 0; x < frame.color.xsize; x++) {
+        size_t idx_frame = y * frame.color.xsize + x;
+        size_t idx_rendered = ((layer_info.crop_y0 + y) * rendered.color.xsize +
+                               (layer_info.crop_x0 + x));
+        if (alpha_frame[idx_frame] != 0) {  // Zero alpha keeps the canvas.
+          memcpy(&pixels_rendered[idx_rendered * 3],
+                 &pixels_frame[idx_frame * 3], 3);
+          alpha_rendered[idx_rendered] = alpha_frame[idx_frame];
+        }
+      }
+    }
+    if (layer_info.save_as_reference != 0) {  // Later frames build on this.
+      canvas = rendered.Copy();
+    }
+    ppf_.frames[i] = std::move(rendered);  // Store the composite as frame i.
+  }
+  return *this;
+}
+
+TestImage::Frame::Frame(TestImage* parent, bool is_preview, size_t index)
+    : parent_(parent), is_preview_(is_preview), index_(index) {}
+
+void TestImage::Frame::ZeroFill() {
+  // Zero the color buffer, then the buffer of every extra channel.
+  extras::PackedFrame& target = frame();
+  memset(target.color.pixels(), 0, target.color.pixels_size);
+  for (auto& e : target.extra_channels) memset(e.pixels(), 0, e.pixels_size);
+}
+
+void TestImage::Frame::RandomFill(uint16_t seed) {
+  // The color image uses `seed`; extra channel k uses seed + 1 + k.
+  FillPackedImage(ppf().info.bits_per_sample, seed, &frame().color);
+  for (size_t k = 0; k != ppf().extra_channels_info.size(); ++k)
+    FillPackedImage(ppf().extra_channels_info[k].ec_info.bits_per_sample,
+                    seed + 1 + k, &frame().extra_channels[k]);
+}
+
+void TestImage::Frame::SetValue(size_t y, size_t x, size_t c, float val) {
+  // Coordinates are checked against the image info; offsets use the buffer.
+  extras::PackedImage& target = frame().color;
+  const JxlPixelFormat fmt = target.format;
+  JXL_CHECK(y < ppf().info.ysize);
+  JXL_CHECK(x < ppf().info.xsize);
+  JXL_CHECK(c < fmt.num_channels);
+  const size_t bytes = extras::PackedImage::BitsPerChannel(fmt.data_type) / 8;
+  uint8_t* cursor = reinterpret_cast<uint8_t*>(target.pixels()) +
+                    ((y * target.xsize + x) * fmt.num_channels + c) * bytes;
+  StoreValue(val, ppf().info.bits_per_sample, fmt, &cursor);
+}
+
+TestImage::Frame TestImage::AddFrame() {
+  // Each extra channel gets its own one-channel plane with the frame's size.
+  const JxlPixelFormat plane = {1, format_.data_type, format_.endianness, 0};
+  extras::PackedFrame added(ppf_.info.xsize, ppf_.info.ysize, format_);
+  for (size_t n = ppf_.extra_channels_info.size(); n > 0; --n) {
+    added.extra_channels.emplace_back(
+        extras::PackedImage(ppf_.info.xsize, ppf_.info.ysize, plane));
+  }
+  ppf_.frames.emplace_back(std::move(added));
+  return Frame(this, false, ppf_.frames.size() - 1);
+}
+
+TestImage::Frame TestImage::AddPreview(size_t xsize, size_t ysize) {
+  // The preview has its own size but reuses the image's sample formats.
+  const JxlPixelFormat plane = {1, format_.data_type, format_.endianness, 0};
+  extras::PackedFrame thumb(xsize, ysize, format_);
+  for (size_t n = ppf_.extra_channels_info.size(); n > 0; --n) {
+    thumb.extra_channels.emplace_back(extras::PackedImage(xsize, ysize, plane));
+  }
+  ppf_.preview_frame = make_unique<extras::PackedFrame>(std::move(thumb));
+  return Frame(this, true, 0);
+}
+
+void TestImage::CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info) {
+  // A layer whose origin lies at or past the new edge keeps no pixels along
+  // that axis.
+  const bool x_inside = info->crop_x0 < static_cast<ssize_t>(xsize);
+  const bool y_inside = info->crop_y0 < static_cast<ssize_t>(ysize);
+  // Otherwise its extent is clipped to the room left between its origin and
+  // the new edge.
+  info->xsize =
+      x_inside ? std::min<size_t>(info->xsize, xsize - info->crop_x0) : 0;
+  info->ysize =
+      y_inside ? std::min<size_t>(info->ysize, ysize - info->crop_y0) : 0;
+}
+
+void TestImage::CropImage(size_t xsize, size_t ysize,
+                          extras::PackedImage* image) {
+  // Packs the leading bytes of each kept row in place at the narrower stride.
+  const size_t new_stride = (image->stride / image->xsize) * xsize;
+  uint8_t* const base = reinterpret_cast<uint8_t*>(image->pixels());
+  for (size_t row = 0; row != ysize; ++row)
+    memmove(base + row * new_stride, base + row * image->stride, new_stride);
+  image->xsize = xsize;
+  image->ysize = ysize;
+  image->stride = new_stride;
+  image->pixels_size = ysize * new_stride;
+}
+
+JxlDataType TestImage::DefaultDataType(const JxlBasicInfo& info) {
+  // Exactly 16 bits with 5 exponent bits selects the half-float type.
+  const bool is_half =
+      info.bits_per_sample == 16 && info.exponent_bits_per_sample == 5;
+  if (is_half) return JXL_TYPE_FLOAT16;
+  // Any other exponent bits, or more than 16 bits, selects 32-bit float.
+  const bool needs_float =
+      info.exponent_bits_per_sample > 0 || info.bits_per_sample > 16;
+  if (needs_float) return JXL_TYPE_FLOAT;
+  return info.bits_per_sample > 8 ? JXL_TYPE_UINT16 : JXL_TYPE_UINT8;
+}
+
+}  // namespace test
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/test_image.h b/third_party/jpeg-xl/lib/jxl/test_image.h
new file mode 100644
index 0000000000..0106a4b341
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_image.h
@@ -0,0 +1,94 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TEST_IMAGE_H_
+#define LIB_JXL_TEST_IMAGE_H_
+
+#include <jxl/codestream_header.h>
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/padded_bytes.h"
+
+namespace jxl {
+namespace test {
+
+// Returns a test image with some autogenerated pixel content, using 16 bits per
+// channel, big endian order, 1 to 4 channels.
+// The seed parameter allows creating images with different pixel content.
+std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                      size_t num_channels, uint16_t seed);
+
+class TestImage {
+ public:
+  TestImage();
+
+  extras::PackedPixelFile& ppf() { return ppf_; }
+
+  TestImage& DecodeFromBytes(const PaddedBytes& bytes);
+
+  TestImage& ClearMetadata();
+
+  TestImage& SetDimensions(size_t xsize, size_t ysize);
+
+  TestImage& SetChannels(size_t num_channels);
+
+  // Sets the same bit depth on color, alpha and all extra channels.
+  TestImage& SetAllBitDepths(uint32_t bits_per_sample,
+                             uint32_t exponent_bits_per_sample = 0);
+
+  TestImage& SetDataType(JxlDataType data_type);
+
+  TestImage& SetEndianness(JxlEndianness endianness);
+
+  TestImage& SetColorEncoding(const std::string& description);
+
+  TestImage& CoalesceGIFAnimationWithAlpha();
+
+  class Frame {
+   public:
+    Frame(TestImage* parent, bool is_preview, size_t index);
+
+    void ZeroFill();
+    void RandomFill(uint16_t seed = 177);
+
+    void SetValue(size_t y, size_t x, size_t c, float val);
+
+   private:
+    extras::PackedPixelFile& ppf() const { return parent_->ppf(); }
+
+    extras::PackedFrame& frame() {
+      return is_preview_ ? *ppf().preview_frame : ppf().frames[index_];
+    }
+
+    TestImage* parent_;
+    bool is_preview_;
+    size_t index_;
+  };
+
+  Frame AddFrame();
+
+  Frame AddPreview(size_t xsize, size_t ysize);
+
+ private:
+  extras::PackedPixelFile ppf_;
+  JxlPixelFormat format_ = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  static void CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info);
+
+  static void CropImage(size_t xsize, size_t ysize, extras::PackedImage* image);
+
+  static JxlDataType DefaultDataType(const JxlBasicInfo& info);
+};
+
+}  // namespace test
+}  // namespace jxl
+
+#endif  // LIB_JXL_TEST_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/test_utils.cc b/third_party/jpeg-xl/lib/jxl/test_utils.cc
new file mode 100644
index 0000000000..223641a6a5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_utils.cc
@@ -0,0 +1,673 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/test_utils.h"
+
+#include <memory>
+#include <string>
+
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_file.h"
+
+#if !defined(TEST_DATA_PATH)
+#include "tools/cpp/runfiles/runfiles.h"
+#endif
+
+namespace jxl {
+namespace test {
+
+#if defined(TEST_DATA_PATH)
+std::string GetTestDataPath(const std::string& filename) {
+  return std::string(TEST_DATA_PATH "/") + filename;
+}
+#else
+using bazel::tools::cpp::runfiles::Runfiles;
+const std::unique_ptr<Runfiles> kRunfiles(Runfiles::Create(""));
+std::string GetTestDataPath(const std::string& filename) {
+  std::string root(JPEGXL_ROOT_PACKAGE "/testdata/");
+  return kRunfiles->Rlocation(root + filename);
+}
+#endif
+
+PaddedBytes ReadTestData(const std::string& filename) {
+  std::string full_path = GetTestDataPath(filename);
+  PaddedBytes data;
+  fprintf(stderr, "ReadTestData %s\n", full_path.c_str());
+  JXL_CHECK(jxl::ReadFile(full_path, &data));
+  printf("Test data %s is %d bytes long.\n", filename.c_str(),
+         static_cast<int>(data.size()));
+  return data;
+}
+
+Status DecodeFile(extras::JXLDecompressParams dparams,
+                  const Span<const uint8_t> file, CodecInOut* JXL_RESTRICT io,
+                  ThreadPool* pool) {
+  SetThreadParallelRunner(dparams, pool);
+  extras::PackedPixelFile ppf;
+  JXL_RETURN_IF_ERROR(DecodeImageJXL(file.data(), file.size(), dparams,
+                                     /*decoded_bytes=*/nullptr, &ppf));
+  JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
+  return true;
+}
+
+void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                    const JxlPixelFormat* pixel_format) {
+  JxlEncoderInitBasicInfo(basic_info);
+  switch (pixel_format->data_type) {
+    case JXL_TYPE_FLOAT:
+      basic_info->bits_per_sample = 32;
+      basic_info->exponent_bits_per_sample = 8;
+      break;
+    case JXL_TYPE_FLOAT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 5;
+      break;
+    case JXL_TYPE_UINT8:
+      basic_info->bits_per_sample = 8;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+  if (pixel_format->num_channels < 3) {
+    basic_info->num_color_channels = 1;
+  } else {
+    basic_info->num_color_channels = 3;
+  }
+  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
+    basic_info->alpha_exponent_bits = basic_info->exponent_bits_per_sample;
+    basic_info->alpha_bits = basic_info->bits_per_sample;
+    basic_info->num_extra_channels = 1;
+  } else {
+    basic_info->alpha_exponent_bits = 0;
+    basic_info->alpha_bits = 0;
+  }
+}
+
+ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc) {
+  ColorEncoding c;
+  c.SetColorSpace(desc.color_space);
+  if (desc.color_space != ColorSpace::kXYB) {
+    c.white_point = desc.white_point;
+    c.primaries = desc.primaries;
+    c.tf.SetTransferFunction(desc.tf);
+  }
+  c.rendering_intent = desc.rendering_intent;
+  JXL_CHECK(c.CreateICC());
+  return c;
+}
+
+namespace {
+void CheckSameEncodings(const std::vector<ColorEncoding>& a,
+                        const std::vector<ColorEncoding>& b,
+                        const std::string& check_name,
+                        std::stringstream& failures) {
+  JXL_CHECK(a.size() == b.size());
+  for (size_t i = 0; i < a.size(); ++i) {
+    if ((a[i].ICC() == b[i].ICC()) ||
+        ((a[i].primaries == b[i].primaries) && a[i].tf.IsSame(b[i].tf))) {
+      continue;
+    }
+    failures << "CheckSameEncodings " << check_name << ": " << i
+             << "-th encoding mismatch\n";
+  }
+}
+}  // namespace
+
+bool Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+               extras::JXLDecompressParams dparams,
+               CodecInOut* JXL_RESTRICT io2, std::stringstream& failures,
+               size_t* compressed_size, ThreadPool* pool, AuxOut* aux_out) {
+  if (compressed_size) {
+    *compressed_size = static_cast<size_t>(-1);
+  }
+  PaddedBytes compressed;
+
+  std::vector<ColorEncoding> original_metadata_encodings;
+  std::vector<ColorEncoding> original_current_encodings;
+  std::vector<ColorEncoding> metadata_encodings_1;
+  std::vector<ColorEncoding> metadata_encodings_2;
+  std::vector<ColorEncoding> current_encodings_2;
+  original_metadata_encodings.reserve(io->frames.size());
+  original_current_encodings.reserve(io->frames.size());
+  metadata_encodings_1.reserve(io->frames.size());
+  metadata_encodings_2.reserve(io->frames.size());
+  current_encodings_2.reserve(io->frames.size());
+
+  for (const ImageBundle& ib : io->frames) {
+    // Remember original encoding, will be returned by decoder.
+    original_metadata_encodings.push_back(ib.metadata()->color_encoding);
+    // c_current should not change during encoding.
+    original_current_encodings.push_back(ib.c_current());
+  }
+
+  std::unique_ptr<PassesEncoderState> enc_state =
+      jxl::make_unique<PassesEncoderState>();
+  JXL_CHECK(EncodeFile(cparams, io, enc_state.get(), &compressed, GetJxlCms(),
+                       aux_out, pool));
+
+  for (const ImageBundle& ib1 : io->frames) {
+    metadata_encodings_1.push_back(ib1.metadata()->color_encoding);
+  }
+
+  // Should still be in the same color space after encoding.
+  CheckSameEncodings(metadata_encodings_1, original_metadata_encodings,
+                     "original vs after encoding", failures);
+
+  JXL_CHECK(DecodeFile(dparams, Span<const uint8_t>(compressed), io2, pool));
+  JXL_CHECK(io2->frames.size() == io->frames.size());
+
+  for (const ImageBundle& ib2 : io2->frames) {
+    metadata_encodings_2.push_back(ib2.metadata()->color_encoding);
+    current_encodings_2.push_back(ib2.c_current());
+  }
+
+  // We always produce the original color encoding if a color transform hook is
+  // set.
+  CheckSameEncodings(current_encodings_2, original_current_encodings,
+                     "current: original vs decoded", failures);
+
+  // Decoder returns the originals passed to the encoder.
+  CheckSameEncodings(metadata_encodings_2, original_metadata_encodings,
+                     "metadata: original vs decoded", failures);
+
+  if (compressed_size) {
+    *compressed_size = compressed.size();
+  }
+
+  return failures.str().empty();
+}
+
+size_t Roundtrip(const extras::PackedPixelFile& ppf_in,
+                 extras::JXLCompressParams cparams,
+                 extras::JXLDecompressParams dparams, ThreadPool* pool,
+                 extras::PackedPixelFile* ppf_out) {
+  SetThreadParallelRunner(cparams, pool);
+  SetThreadParallelRunner(dparams, pool);
+  std::vector<uint8_t> compressed;
+  JXL_CHECK(extras::EncodeImageJXL(cparams, ppf_in, /*jpeg_bytes=*/nullptr,
+                                   &compressed));
+  size_t decoded_bytes = 0;
+  JXL_CHECK(extras::DecodeImageJXL(compressed.data(), compressed.size(),
+                                   dparams, &decoded_bytes, ppf_out));
+  JXL_CHECK(decoded_bytes == compressed.size());
+  return compressed.size();
+}
+
+std::vector<ColorEncodingDescriptor> AllEncodings() {
+  std::vector<ColorEncodingDescriptor> all_encodings;
+  all_encodings.reserve(300);
+  ColorEncoding c;
+
+  for (ColorSpace cs : Values<ColorSpace>()) {
+    if (cs == ColorSpace::kUnknown || cs == ColorSpace::kXYB) continue;
+    c.SetColorSpace(cs);
+
+    for (WhitePoint wp : Values<WhitePoint>()) {
+      if (wp == WhitePoint::kCustom) continue;
+      if (c.ImplicitWhitePoint() && c.white_point != wp) continue;
+      c.white_point = wp;
+
+      for (Primaries primaries : Values<Primaries>()) {
+        if (primaries == Primaries::kCustom) continue;
+        if (!c.HasPrimaries()) continue;
+        c.primaries = primaries;
+
+        for (TransferFunction tf : Values<TransferFunction>()) {
+          if (tf == TransferFunction::kUnknown) continue;
+          if (c.tf.SetImplicit() &&
+              (c.tf.IsGamma() || c.tf.GetTransferFunction() != tf)) {
+            continue;
+          }
+          c.tf.SetTransferFunction(tf);
+
+          for (RenderingIntent ri : Values<RenderingIntent>()) {
+            ColorEncodingDescriptor cdesc;
+            cdesc.color_space = cs;
+            cdesc.white_point = wp;
+            cdesc.primaries = primaries;
+            cdesc.tf = tf;
+            cdesc.rendering_intent = ri;
+            all_encodings.push_back(cdesc);
+          }
+        }
+      }
+    }
+  }
+
+  return all_encodings;
+}
+
+jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
+                                          size_t num_channels, size_t xsize,
+                                          size_t ysize) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetAlphaBits(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(
+      /*is_gray=*/num_channels == 1 || num_channels == 2);
+  JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, 0};
+  JXL_CHECK(ConvertFromExternal(
+      jxl::Span<const uint8_t>(buf.data(), buf.size()), xsize, ysize,
+      jxl::ColorEncoding::SRGB(/*is_gray=*/num_channels < 3),
+      /*bits_per_sample=*/16, format,
+      /*pool=*/nullptr,
+      /*ib=*/&io.Main()));
+  return io;
+}
+
+bool Near(double expected, double value, double max_dist) {
+  double dist = expected > value ? expected - value : value - expected;
+  return dist <= max_dist;
+}
+
+float LoadFloat16(uint16_t bits16) {
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+
+  // Subnormal or zero
+  if (biased_exp == 0) {
+    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    return sign ? -subnormal : subnormal;
+  }
+
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);
+  const uint32_t mantissa32 = mantissa << (23 - 10);
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+
+  float result;
+  memcpy(&result, &bits32, 4);
+  return result;
+}
+
+float LoadLEFloat16(const uint8_t* p) {
+  uint16_t bits16 = LoadLE16(p);
+  return LoadFloat16(bits16);
+}
+
+float LoadBEFloat16(const uint8_t* p) {
+  uint16_t bits16 = LoadBE16(p);
+  return LoadFloat16(bits16);
+}
+
+size_t GetPrecision(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      // Floating point mantissa precision
+      return 24;
+    case JXL_TYPE_FLOAT16:
+      return 11;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+}
+
+size_t GetDataBits(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+}
+
+std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
+                                    size_t ysize, const JxlPixelFormat& format,
+                                    double factor) {
+  std::vector<double> result(xsize * ysize * 4);
+  size_t num_channels = format.num_channels;
+  bool gray = num_channels == 1 || num_channels == 2;
+  bool alpha = num_channels == 2 || num_channels == 4;
+  JxlEndianness endianness = format.endianness;
+  // Compute actual type:
+  if (endianness == JXL_NATIVE_ENDIAN) {
+    endianness = IsLittleEndian() ? JXL_LITTLE_ENDIAN : JXL_BIG_ENDIAN;
+  }
+
+  size_t stride =
+      xsize * jxl::DivCeil(GetDataBits(format.data_type) * num_channels,
+                           jxl::kBitsPerByte);
+  if (format.align > 1) stride = jxl::RoundUpTo(stride, format.align);
+
+  if (format.data_type == JXL_TYPE_UINT8) {
+    // Multiplier to bring to 0-1.0 range
+    double mul = factor > 0.0 ? factor : 1.0 / 255.0;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels;
+        double r = pixels[i];
+        double g = gray ? r : pixels[i + 1];
+        double b = gray ? r : pixels[i + 2];
+        double a = alpha ? pixels[i + num_channels - 1] : 255;
+        result[j + 0] = r * mul;
+        result[j + 1] = g * mul;
+        result[j + 2] = b * mul;
+        result[j + 3] = a * mul;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    // Multiplier to bring to 0-1.0 range
+    double mul = factor > 0.0 ? factor : 1.0 / 65535.0;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 2;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = (pixels[i + 0] << 8) + pixels[i + 1];
+          g = gray ? r : (pixels[i + 2] << 8) + pixels[i + 3];
+          b = gray ? r : (pixels[i + 4] << 8) + pixels[i + 5];
+          a = alpha ? (pixels[i + num_channels * 2 - 2] << 8) +
+                          pixels[i + num_channels * 2 - 1]
+                    : 65535;
+        } else {
+          r = (pixels[i + 1] << 8) + pixels[i + 0];
+          g = gray ? r : (pixels[i + 3] << 8) + pixels[i + 2];
+          b = gray ? r : (pixels[i + 5] << 8) + pixels[i + 4];
+          a = alpha ? (pixels[i + num_channels * 2 - 1] << 8) +
+                          pixels[i + num_channels * 2 - 2]
+                    : 65535;
+        }
+        result[j + 0] = r * mul;
+        result[j + 1] = g * mul;
+        result[j + 2] = b * mul;
+        result[j + 3] = a * mul;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 4;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = LoadBEFloat(pixels + i);
+          g = gray ? r : LoadBEFloat(pixels + i + 4);
+          b = gray ? r : LoadBEFloat(pixels + i + 8);
+          a = alpha ? LoadBEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
+        } else {
+          r = LoadLEFloat(pixels + i);
+          g = gray ? r : LoadLEFloat(pixels + i + 4);
+          b = gray ? r : LoadLEFloat(pixels + i + 8);
+          a = alpha ? LoadLEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
+        }
+        result[j + 0] = r;
+        result[j + 1] = g;
+        result[j + 2] = b;
+        result[j + 3] = a;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT16) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 2;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = LoadBEFloat16(pixels + i);
+          g = gray ? r : LoadBEFloat16(pixels + i + 2);
+          b = gray ? r : LoadBEFloat16(pixels + i + 4);
+          a = alpha ? LoadBEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
+        } else {
+          r = LoadLEFloat16(pixels + i);
+          g = gray ? r : LoadLEFloat16(pixels + i + 2);
+          b = gray ? r : LoadLEFloat16(pixels + i + 4);
+          a = alpha ? LoadLEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
+        }
+        result[j + 0] = r;
+        result[j + 1] = g;
+        result[j + 2] = b;
+        result[j + 3] = a;
+      }
+    }
+  } else {
+    JXL_ASSERT(false);  // Unsupported type
+  }
+  return result;
+}
+
+size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
+                     size_t ysize, const JxlPixelFormat& format_a,
+                     const JxlPixelFormat& format_b,
+                     double threshold_multiplier) {
+  // Convert both images to equal full precision for comparison.
+  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format_a);
+  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format_b);
+  bool gray_a = format_a.num_channels < 3;
+  bool gray_b = format_b.num_channels < 3;
+  bool alpha_a = !(format_a.num_channels & 1);
+  bool alpha_b = !(format_b.num_channels & 1);
+  size_t bits_a = GetPrecision(format_a.data_type);
+  size_t bits_b = GetPrecision(format_b.data_type);
+  size_t bits = std::min(bits_a, bits_b);
+  // How much distance is allowed in case of pixels with lower bit depths, given
+  // that the double precision float images use range 0-1.0.
+  // E.g. in case of 1-bit this is 0.5 since 0.499 must map to 0 and 0.501 must
+  // map to 1.
+  double precision = 0.5 * threshold_multiplier / ((1ull << bits) - 1ull);
+  if (format_a.data_type == JXL_TYPE_FLOAT16 ||
+      format_b.data_type == JXL_TYPE_FLOAT16) {
+    // Lower the precision for float16, because it currently looks like the
+    // scalar and wasm implementations of hwy have 1 less bit of precision
+    // than the x86 implementations.
+    // TODO(lode): Set the required precision back to 11 bits when possible.
+    precision = 0.5 * threshold_multiplier / ((1ull << (bits - 1)) - 1ull);
+  }
+  size_t numdiff = 0;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      size_t i = (y * xsize + x) * 4;
+      bool ok = true;
+      if (gray_a || gray_b) {
+        if (!Near(a_full[i + 0], b_full[i + 0], precision)) ok = false;
+        // If the input was grayscale and the output not, then the output must
+        // have all channels equal (R == G and G == B).
+        if (gray_a && (b_full[i + 0] != b_full[i + 1] ||
+                       b_full[i + 1] != b_full[i + 2])) {
+          ok = false;
+        }
+      } else {
+        if (!Near(a_full[i + 0], b_full[i + 0], precision) ||
+            !Near(a_full[i + 1], b_full[i + 1], precision) ||
+            !Near(a_full[i + 2], b_full[i + 2], precision)) {
+          ok = false;
+        }
+      }
+      if (alpha_a && alpha_b) {
+        if (!Near(a_full[i + 3], b_full[i + 3], precision)) ok = false;
+      } else {
+        // If the input had no alpha channel, the output should be opaque
+        // after roundtrip.
+        if (alpha_b && !Near(1.0, b_full[i + 3], precision)) ok = false;
+      }
+      if (!ok) numdiff++;
+    }
+  }
+  return numdiff;
+}
+
+double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize,
+                   size_t ysize, const JxlPixelFormat& format) {
+  // Convert both images to equal full precision for comparison.
+  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format);
+  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format);
+  double sum = 0.0;
+  for (size_t y = 0; y < ysize; y++) {
+    double row_sum = 0.0;
+    for (size_t x = 0; x < xsize; x++) {
+      size_t i = (y * xsize + x) * 4;
+      for (size_t c = 0; c < format.num_channels; ++c) {
+        double diff = a_full[i + c] - b_full[i + c];
+        row_sum += diff * diff;
+      }
+    }
+    sum += row_sum;
+  }
+  sum /= (xsize * ysize);
+  return sqrt(sum);
+}
+
+float ButteraugliDistance(const extras::PackedPixelFile& a,
+                          const extras::PackedPixelFile& b, ThreadPool* pool) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1));
+  // TODO(eustas): simplify?
+  return ButteraugliDistance(io0.frames, io1.frames, ButteraugliParams(),
+                             GetJxlCms(),
+                             /*distmap=*/nullptr, pool);
+}
+
+float Butteraugli3Norm(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b, ThreadPool* pool) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1));
+  ButteraugliParams ba;
+  ImageF distmap;
+  ButteraugliDistance(io0.frames, io1.frames, ba, GetJxlCms(), &distmap, pool);
+  return ComputeDistanceP(distmap, ba, 3);
+}
+
+float ComputeDistance2(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, nullptr, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, nullptr, &io1));
+  return ComputeDistance2(io0.Main(), io1.Main(), GetJxlCms());
+}
+
+bool SameAlpha(const extras::PackedPixelFile& a,
+               const extras::PackedPixelFile& b) {
+  JXL_CHECK(a.info.xsize == b.info.xsize);
+  JXL_CHECK(a.info.ysize == b.info.ysize);
+  JXL_CHECK(a.info.alpha_bits == b.info.alpha_bits);
+  JXL_CHECK(a.info.alpha_exponent_bits == b.info.alpha_exponent_bits);
+  JXL_CHECK(a.info.alpha_bits > 0);
+  JXL_CHECK(a.frames.size() == b.frames.size());
+  for (size_t i = 0; i < a.frames.size(); ++i) {
+    const extras::PackedImage& color_a = a.frames[i].color;
+    const extras::PackedImage& color_b = b.frames[i].color;
+    JXL_CHECK(color_a.format.num_channels == color_b.format.num_channels);
+    JXL_CHECK(color_a.format.data_type == color_b.format.data_type);
+    JXL_CHECK(color_a.format.endianness == color_b.format.endianness);
+    JXL_CHECK(color_a.pixels_size == color_b.pixels_size);
+    size_t pwidth =
+        extras::PackedImage::BitsPerChannel(color_a.format.data_type) / 8;
+    size_t num_color = color_a.format.num_channels < 3 ? 1 : 3;
+    const uint8_t* p_a = reinterpret_cast<const uint8_t*>(color_a.pixels());
+    const uint8_t* p_b = reinterpret_cast<const uint8_t*>(color_b.pixels());
+    for (size_t y = 0; y < a.info.ysize; ++y) {
+      for (size_t x = 0; x < a.info.xsize; ++x) {
+        size_t idx =
+            ((y * a.info.xsize + x) * color_a.format.num_channels + num_color) *
+            pwidth;
+        if (memcmp(&p_a[idx], &p_b[idx], pwidth) != 0) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b) {
+  JXL_CHECK(a.xsize == b.xsize);
+  JXL_CHECK(a.ysize == b.ysize);
+  JXL_CHECK(a.format.num_channels == b.format.num_channels);
+  JXL_CHECK(a.format.data_type == b.format.data_type);
+  JXL_CHECK(a.format.endianness == b.format.endianness);
+  JXL_CHECK(a.pixels_size == b.pixels_size);
+  const uint8_t* p_a = reinterpret_cast<const uint8_t*>(a.pixels());
+  const uint8_t* p_b = reinterpret_cast<const uint8_t*>(b.pixels());
+  for (size_t y = 0; y < a.ysize; ++y) {
+    for (size_t x = 0; x < a.xsize; ++x) {
+      size_t idx = (y * a.xsize + x) * a.pixel_stride();
+      if (memcmp(&p_a[idx], &p_b[idx], a.pixel_stride()) != 0) {
+        printf("Mismatch at row %" PRIuS " col %" PRIuS "\n", y, x);
+        printf("  a: ");
+        for (size_t j = 0; j < a.pixel_stride(); ++j) {
+          printf(" %3u", p_a[idx + j]);
+        }
+        printf("\n  b: ");
+        for (size_t j = 0; j < a.pixel_stride(); ++j) {
+          printf(" %3u", p_b[idx + j]);
+        }
+        printf("\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool SamePixels(const extras::PackedPixelFile& a,
+                const extras::PackedPixelFile& b) {
+  JXL_CHECK(a.info.xsize == b.info.xsize);
+  JXL_CHECK(a.info.ysize == b.info.ysize);
+  JXL_CHECK(a.info.bits_per_sample == b.info.bits_per_sample);
+  JXL_CHECK(a.info.exponent_bits_per_sample == b.info.exponent_bits_per_sample);
+  JXL_CHECK(a.frames.size() == b.frames.size());
+  for (size_t i = 0; i < a.frames.size(); ++i) {
+    const auto& frame_a = a.frames[i];
+    const auto& frame_b = b.frames[i];
+    if (!SamePixels(frame_a.color, frame_b.color)) {
+      return false;
+    }
+    JXL_CHECK(frame_a.extra_channels.size() == frame_b.extra_channels.size());
+    for (size_t j = 0; j < frame_a.extra_channels.size(); ++j) {
+      if (!SamePixels(frame_a.extra_channels[j], frame_b.extra_channels[j])) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace test
+
+bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
+  if (a.size() != b.size()) return false;
+  if (memcmp(a.data(), b.data(), a.size()) != 0) return false;
+  return true;
+}
+
+// Allow using EXPECT_EQ on jxl::PaddedBytes
+bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
+  return !(a == b);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/test_utils.h b/third_party/jpeg-xl/lib/jxl/test_utils.h
new file mode 100644
index 0000000000..8c5cd434f5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_utils.h
@@ -0,0 +1,175 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TEST_UTILS_H_
+#define LIB_JXL_TEST_UTILS_H_
+
+// TODO(eustas): reduce includes (move to .cc)
+
+// Macros and functions useful for tests.
+
+#include <jxl/codestream_header.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <ostream>
+#include <vector>
+
+#include "lib/extras/dec/jxl.h"
+#include "lib/extras/enc/jxl.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace test {
+
+std::string GetTestDataPath(const std::string& filename);
+PaddedBytes ReadTestData(const std::string& filename);
+
+void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                    const JxlPixelFormat* pixel_format);
+
+template <typename Params>
+void SetThreadParallelRunner(Params& params, ThreadPool* pool) {
+  if (pool && !params.runner_opaque) {
+    params.runner = pool->runner();
+    params.runner_opaque = pool->runner_opaque();
+  }
+}
+
+Status DecodeFile(extras::JXLDecompressParams dparams,
+                  const Span<const uint8_t> file, CodecInOut* JXL_RESTRICT io,
+                  ThreadPool* pool = nullptr);
+
+bool Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+               extras::JXLDecompressParams dparams,
+               CodecInOut* JXL_RESTRICT io2, std::stringstream& failures,
+               size_t* compressed_size = nullptr, ThreadPool* pool = nullptr,
+               AuxOut* aux_out = nullptr);
+
+// Returns compressed size [bytes].
+size_t Roundtrip(const extras::PackedPixelFile& ppf_in,
+                 extras::JXLCompressParams cparams,
+                 extras::JXLDecompressParams dparams, ThreadPool* pool,
+                 extras::PackedPixelFile* ppf_out);
+
+// A POD descriptor of a ColorEncoding. Only used in tests as the return value
+// of AllEncodings().
+struct ColorEncodingDescriptor {
+  ColorSpace color_space;
+  WhitePoint white_point;
+  Primaries primaries;
+  TransferFunction tf;
+  RenderingIntent rendering_intent;
+};
+
+ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc);
+
+// Define the operator<< for tests.
+static inline ::std::ostream& operator<<(::std::ostream& os,
+                                         const ColorEncodingDescriptor& c) {
+  return os << "ColorEncoding/" << Description(ColorEncodingFromDescriptor(c));
+}
+
+// Returns ColorEncodingDescriptors, which are only used in tests. To obtain a
+// ColorEncoding object call ColorEncodingFromDescriptor and then call
+// ColorEncoding::CreateProfile() on that object to generate a profile.
+std::vector<ColorEncodingDescriptor> AllEncodings();
+
+// Returns a CodecInOut based on the buf, xsize, ysize, and the assumption
+// that the buffer was created using `GetSomeTestImage`.
+jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
+                                          size_t num_channels, size_t xsize,
+                                          size_t ysize);
+
+bool Near(double expected, double value, double max_dist);
+
+// Based on highway scalar implementation, for testing
+float LoadFloat16(uint16_t bits16);
+
+float LoadLEFloat16(const uint8_t* p);
+
+float LoadBEFloat16(const uint8_t* p);
+
+size_t GetPrecision(JxlDataType data_type);
+
+size_t GetDataBits(JxlDataType data_type);
+
+// Procedure to convert pixels to double precision, not efficient, but
+// well-controlled for testing. It uses double, to be able to represent all
+// precisions needed for the maximum data types the API supports: uint32_t
+// integers, and, single precision float. The values are in range 0-1 for SDR.
+std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
+                                    size_t ysize, const JxlPixelFormat& format,
+                                    double factor = 0.0);
+
+// Returns the number of pixels which differ between the two pictures. Image b
+// is the image after roundtrip, image a the one before roundtrip. There are
+// stricter requirements for the alpha channel and grayscale values of the
+// output image.
+size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
+                     size_t ysize, const JxlPixelFormat& format_a,
+                     const JxlPixelFormat& format_b,
+                     double threshold_multiplier = 1.0);
+
+double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize,
+                   size_t ysize, const JxlPixelFormat& format);
+
+float ButteraugliDistance(const extras::PackedPixelFile& a,
+                          const extras::PackedPixelFile& b,
+                          ThreadPool* pool = nullptr);
+
+float Butteraugli3Norm(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b,
+                       ThreadPool* pool = nullptr);
+
+float ComputeDistance2(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b);
+
+bool SameAlpha(const extras::PackedPixelFile& a,
+               const extras::PackedPixelFile& b);
+
+bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b);
+
+bool SamePixels(const extras::PackedPixelFile& a,
+                const extras::PackedPixelFile& b);
+
+class ThreadPoolForTests {
+ public:
+  explicit ThreadPoolForTests(int num_threads) {
+    runner_ =
+        JxlThreadParallelRunnerMake(/* memory_manager */ nullptr, num_threads);
+    pool_ =
+        jxl::make_unique<ThreadPool>(JxlThreadParallelRunner, runner_.get());
+  }
+  ThreadPoolForTests(const ThreadPoolForTests&) = delete;
+  ThreadPoolForTests& operator=(const ThreadPoolForTests&) = delete;
+  ThreadPool* operator&() { return pool_.get(); }
+
+ private:
+  JxlThreadParallelRunnerPtr runner_;
+  std::unique_ptr<ThreadPool> pool_;
+};
+
+}  // namespace test
+
+bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b);
+
+// Allow using EXPECT_EQ on jxl::PaddedBytes
+bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_TEST_UTILS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/testing.h b/third_party/jpeg-xl/lib/jxl/testing.h
new file mode 100644
index 0000000000..d10b0c3c54
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/testing.h
@@ -0,0 +1,73 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TESTING_H_
+#define LIB_JXL_TESTING_H_
+
+// GTest/GMock specific macros / wrappers.
+
+// gmock unconditionally redefines those macros (to wrong values).
+// Let's include it only here and mitigate the problem.
+#pragma push_macro("PRIdS")
+#pragma push_macro("PRIuS")
+#include "gmock/gmock.h"
+#pragma pop_macro("PRIuS")
+#pragma pop_macro("PRIdS")
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+#ifdef JXL_DISABLE_SLOW_TESTS
+#define JXL_SLOW_TEST(X) DISABLED_##X
+#else
+#define JXL_SLOW_TEST(X) X
+#endif  // JXL_DISABLE_SLOW_TESTS
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#define JXL_TRANSCODE_JPEG_TEST(X) X
+#else
+#define JXL_TRANSCODE_JPEG_TEST(X) DISABLED_##X
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+#if JPEGXL_ENABLE_BOXES
+#define JXL_BOXES_TEST(X) X
+#else
+#define JXL_BOXES_TEST(X) DISABLED_##X
+#endif  // JPEGXL_ENABLE_BOXES
+
+#ifdef THREAD_SANITIZER
+#define JXL_TSAN_SLOW_TEST(X) DISABLED_##X
+#else
+#define JXL_TSAN_SLOW_TEST(X) X
+#endif  // THREAD_SANITIZER
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Matches values in [0.75 * max, max]. The lower bound ensures that we don't
+// make our test bounds too lax, effectively disabling the tests.
+MATCHER_P(IsSlightlyBelow, max, "") {
+  return max * 0.75 <= arg && arg <= max * 1.0;
+}
+
+#define JXL_EXPECT_OK(F)       \
+  {                            \
+    std::stringstream _;       \
+    EXPECT_TRUE(F) << _.str(); \
+  }
+
+#define JXL_ASSERT_OK(F)       \
+  {                            \
+    std::stringstream _;       \
+    ASSERT_TRUE(F) << _.str(); \
+  }
+
+#endif  // LIB_JXL_TESTING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/tf_gbench.cc b/third_party/jpeg-xl/lib/jxl/tf_gbench.cc
new file mode 100644
index 0000000000..9c010d460a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/tf_gbench.cc
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/image_ops.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/tf_gbench.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#define RUN_BENCHMARK(F)                                            \
+  constexpr size_t kNum = 1 << 12;                                  \
+  HWY_FULL(float) d;                                                \
+  /* Three parallel runs, as this will run on R, G and B. */        \
+  auto sum1 = Zero(d);                                              \
+  auto sum2 = Zero(d);                                              \
+  auto sum3 = Zero(d);                                              \
+  for (auto _ : state) {                                            \
+    auto x = Set(d, 1e-5);                                          \
+    auto v1 = Set(d, 1e-5);                                         \
+    auto v2 = Set(d, 1.1e-5);                                       \
+    auto v3 = Set(d, 1.2e-5);                                       \
+    for (size_t i = 0; i < kNum; i++) {                             \
+      sum1 += F(d, v1);                                             \
+      sum2 += F(d, v2);                                             \
+      sum3 += F(d, v3);                                             \
+      v1 += x;                                                      \
+      v2 += x;                                                      \
+      v3 += x;                                                      \
+    }                                                               \
+  }                                                                 \
+  /* floats per second */                                           \
+  state.SetItemsProcessed(kNum* state.iterations() * Lanes(d) * 3); \
+  benchmark::DoNotOptimize(sum1 + sum2 + sum3);
+
+#define RUN_BENCHMARK_SCALAR(F)                              \
+  constexpr size_t kNum = 1 << 12;                           \
+  /* Three parallel runs, as this will run on R, G and B. */ \
+  float sum1 = 0, sum2 = 0, sum3 = 0;                        \
+  for (auto _ : state) {                                     \
+    float x = 1e-5;                                          \
+    float v1 = 1e-5;                                         \
+    float v2 = 1.1e-5;                                       \
+    float v3 = 1.2e-5;                                       \
+    for (size_t i = 0; i < kNum; i++) {                      \
+      sum1 += F(v1);                                         \
+      sum2 += F(v2);                                         \
+      sum3 += F(v3);                                         \
+      v1 += x;                                               \
+      v2 += x;                                               \
+      v3 += x;                                               \
+    }                                                        \
+  }                                                          \
+  /* floats per second */                                    \
+  state.SetItemsProcessed(kNum* state.iterations() * 3);     \
+  benchmark::DoNotOptimize(sum1 + sum2 + sum3);
+
+HWY_NOINLINE void BM_FastSRGB(benchmark::State& state) {
+  RUN_BENCHMARK(FastLinearToSRGB);
+}
+
+HWY_NOINLINE void BM_TFSRGB(benchmark::State& state) {
+  RUN_BENCHMARK(TF_SRGB().EncodedFromDisplay);
+}
+
+HWY_NOINLINE void BM_PQDFE(benchmark::State& state) {
+  RUN_BENCHMARK(TF_PQ().DisplayFromEncoded);
+}
+
+HWY_NOINLINE void BM_PQEFD(benchmark::State& state) {
+  RUN_BENCHMARK(TF_PQ().EncodedFromDisplay);
+}
+
+HWY_NOINLINE void BM_PQSlowDFE(benchmark::State& state) {
+  RUN_BENCHMARK_SCALAR(TF_PQ().DisplayFromEncoded);
+}
+
+HWY_NOINLINE void BM_PQSlowEFD(benchmark::State& state) {
+  RUN_BENCHMARK_SCALAR(TF_PQ().EncodedFromDisplay);
+}
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(BM_FastSRGB);
+HWY_EXPORT(BM_TFSRGB);
+HWY_EXPORT(BM_PQDFE);
+HWY_EXPORT(BM_PQEFD);
+HWY_EXPORT(BM_PQSlowDFE);
+HWY_EXPORT(BM_PQSlowEFD);
+
+float SRGB_pow(float x) {  // Scalar linear -> sRGB via powf (benchmarked).
+  return x < 0.0031308f ? 12.92f * x : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
+}
+
+void BM_FastSRGB(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_FastSRGB)(state);
+}
+void BM_TFSRGB(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_TFSRGB)(state);
+}
+void BM_PQDFE(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQDFE)(state);
+}
+void BM_PQEFD(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQEFD)(state);
+}
+void BM_PQSlowDFE(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQSlowDFE)(state);
+}
+void BM_PQSlowEFD(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQSlowEFD)(state);
+}
+
+void BM_SRGB_pow(benchmark::State& state) { RUN_BENCHMARK_SCALAR(SRGB_pow); }
+
+BENCHMARK(BM_FastSRGB);
+BENCHMARK(BM_TFSRGB);
+BENCHMARK(BM_SRGB_pow);
+BENCHMARK(BM_PQDFE);
+BENCHMARK(BM_PQEFD);
+BENCHMARK(BM_PQSlowDFE);
+BENCHMARK(BM_PQSlowEFD);
+
+}  // namespace
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/toc.cc b/third_party/jpeg-xl/lib/jxl/toc.cc
new file mode 100644
index 0000000000..fd7740c144
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/toc.cc
@@ -0,0 +1,105 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/toc.h"
+
+#include <stdint.h>
+
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+size_t MaxBits(const size_t num_sizes) {
+  // Permutation flag (excluding its tokens) + padding + entries + padding.
+  const size_t per_entry_bits = U32Coder::MaxEncodedBits(kTocDist);
+  return 1 + kBitsPerByte + per_entry_bits * num_sizes + kBitsPerByte;
+}
+
+Status ReadToc(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+               std::vector<uint32_t>* JXL_RESTRICT sizes,
+               std::vector<coeff_order_t>* JXL_RESTRICT permutation) {
+  if (toc_entries > 65536) {
+    // Prevent out of memory if invalid JXL codestream causes a bogus amount
+    // of toc_entries such as 2720436919446 to be computed.
+    // TODO(lode): verify whether 65536 is a reasonable upper bound
+    return JXL_FAILURE("too many toc entries");
+  }
+
+  sizes->clear();  // Drop previous contents; refilled by the loop below.
+  sizes->resize(toc_entries);
+  if (reader->TotalBitsConsumed() >= reader->TotalBytes() * kBitsPerByte) {
+    return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC");
+  }
+  const auto check_bit_budget = [&](size_t num_entries) -> Status {
+    // U32Coder reads 2 bits to recognize variant and kTocDist cheapest variant
+    // is Bits(10), this way at least 12 bits are required per toc-entry.
+    size_t minimal_bit_cost = num_entries * (2 + 10);
+    size_t bit_budget = reader->TotalBytes() * 8;  // Size of the input in bits.
+    size_t expenses = reader->TotalBitsConsumed();  // Bits read so far.
+    if ((expenses <= bit_budget) &&
+        (minimal_bit_cost <= bit_budget - expenses)) {
+      return true;
+    }
+    return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC");
+  };
+
+  JXL_DASSERT(toc_entries > 0);
+  if (reader->ReadFixedBits<1>() == 1) {  // A permutation follows.
+    JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries));
+    permutation->resize(toc_entries);
+    JXL_RETURN_IF_ERROR(DecodePermutation(/*skip=*/0, toc_entries,
+                                          permutation->data(), reader));
+  }  // Otherwise *permutation is left untouched.
+  JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries));
+  for (size_t i = 0; i < toc_entries; ++i) {
+    (*sizes)[i] = U32Coder::Read(kTocDist, reader);
+  }
+  JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  JXL_RETURN_IF_ERROR(check_bit_budget(0));  // Fails if we read past the end.
+  return true;
+}
+
+Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+                        std::vector<uint64_t>* JXL_RESTRICT offsets,
+                        std::vector<uint32_t>* JXL_RESTRICT sizes,
+                        uint64_t* total_size) {
+  std::vector<coeff_order_t> permutation;  // Stays empty if none is coded.
+  JXL_RETURN_IF_ERROR(ReadToc(toc_entries, reader, sizes, &permutation));
+
+  offsets->clear();
+  offsets->resize(toc_entries);
+
+  // Prefix sum starting with 0 and ending with the offset of the last group
+  uint64_t offset = 0;
+  for (size_t i = 0; i < toc_entries; ++i) {
+    if (offset + (*sizes)[i] < offset) {  // Unsigned wrap-around.
+      return JXL_FAILURE("group offset overflow");
+    }
+    (*offsets)[i] = offset;
+    offset += (*sizes)[i];
+  }
+  if (total_size) {  // Optional output: sum of all sizes.
+    *total_size = offset;
+  }
+
+  if (!permutation.empty()) {  // Reorder: new[i] = old[permutation[i]].
+    std::vector<uint64_t> permuted_offsets;
+    std::vector<uint32_t> permuted_sizes;
+    permuted_offsets.reserve(toc_entries);
+    permuted_sizes.reserve(toc_entries);
+    for (coeff_order_t index : permutation) {
+      permuted_offsets.push_back((*offsets)[index]);
+      permuted_sizes.push_back((*sizes)[index]);
+    }
+    std::swap(*offsets, permuted_offsets);
+    std::swap(*sizes, permuted_sizes);
+  }
+
+  return true;
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/toc.h b/third_party/jpeg-xl/lib/jxl/toc.h
new file mode 100644
index 0000000000..a97197ad45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/toc.h
@@ -0,0 +1,55 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TOC_H_
+#define LIB_JXL_TOC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// (2+bits) = 2,3,4 bytes so encoders can patch TOC after encoding.
+// 30 is sufficient for 4K channels of uncompressed 16-bit samples.
+constexpr U32Enc kTocDist(Bits(10), BitsOffset(14, 1024), BitsOffset(22, 17408),
+                          BitsOffset(30, 4211712));
+
+size_t MaxBits(const size_t num_sizes);
+
+// TODO(veluca): move these to FrameDimensions.
+static JXL_INLINE size_t AcGroupIndex(size_t pass, size_t group,
+                                      size_t num_groups, size_t num_dc_groups,
+                                      bool has_ac_global) {
+  return 1 + num_dc_groups + static_cast<size_t>(has_ac_global) +
+         pass * num_groups + group;  // Pass-major, after the leading entries.
+}
+
+static JXL_INLINE size_t NumTocEntries(size_t num_groups, size_t num_dc_groups,
+                                       size_t num_passes, bool has_ac_global) {
+  if (num_groups == 1 && num_passes == 1) return 1;  // Single-entry TOC.
+  return AcGroupIndex(0, 0, num_groups, num_dc_groups, has_ac_global) +
+         num_groups * num_passes;  // Leading entries + one per (pass, group).
+}
+
+Status ReadToc(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+               std::vector<uint32_t>* JXL_RESTRICT sizes,
+               std::vector<coeff_order_t>* JXL_RESTRICT permutation);
+
+Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+                        std::vector<uint64_t>* JXL_RESTRICT offsets,
+                        std::vector<uint32_t>* JXL_RESTRICT sizes,
+                        uint64_t* total_size);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_TOC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/toc_test.cc b/third_party/jpeg-xl/lib/jxl/toc_test.cc
new file mode 100644
index 0000000000..a7f0f2c27b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/toc_test.cc
@@ -0,0 +1,92 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/toc.h"
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_toc.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void Roundtrip(size_t num_entries, bool permute, Rng* rng) {
+  // Start from the identity; if requested, shuffle it and record the inverse.
+  std::vector<coeff_order_t> permutation(num_entries);
+  std::vector<coeff_order_t> inv_permutation(num_entries);
+  for (size_t i = 0; i < num_entries; i++) {
+    permutation[i] = i;
+    inv_permutation[i] = i;
+  }
+  if (permute) {
+    rng->Shuffle(permutation.data(), permutation.size());
+    for (size_t i = 0; i < num_entries; i++) {
+      inv_permutation[permutation[i]] = i;
+    }
+  }
+
+  // Generate num_entries all-zero groups of random (byte-aligned) length.
+  std::vector<BitWriter> group_codes(num_entries);
+  for (BitWriter& writer : group_codes) {
+    const size_t max_bits = (*rng)() & 0xFFF;  // 0..4095 bits.
+    BitWriter::Allotment allotment(&writer, max_bits + kBitsPerByte);
+    size_t i = 0;
+    for (; i + BitWriter::kMaxBitsPerCall < max_bits;
+         i += BitWriter::kMaxBitsPerCall) {
+      writer.Write(BitWriter::kMaxBitsPerCall, 0);
+    }
+    for (; i < max_bits; i += 1) {  // Remaining bits, one at a time.
+      writer.Write(/*n_bits=*/1, 0);
+    }
+    writer.ZeroPadToByte();
+    AuxOut aux_out;
+    allotment.ReclaimAndCharge(&writer, 0, &aux_out);
+  }
+
+  BitWriter writer;  // Receives the encoded TOC.
+  AuxOut aux_out;
+  ASSERT_TRUE(WriteGroupOffsets(group_codes, permute ? &permutation : nullptr,
+                                &writer, &aux_out));
+
+  BitReader reader(writer.GetSpan());  // Parse back what was just written.
+  std::vector<uint64_t> group_offsets;
+  std::vector<uint32_t> group_sizes;
+  uint64_t total_size;
+  ASSERT_TRUE(ReadGroupOffsets(num_entries, &reader, &group_offsets,
+                               &group_sizes, &total_size));
+  ASSERT_EQ(num_entries, group_offsets.size());
+  ASSERT_EQ(num_entries, group_sizes.size());
+  EXPECT_TRUE(reader.Close());
+
+  uint64_t prefix_sum = 0;  // Bytes written by groups 0..i-1.
+  for (size_t i = 0; i < num_entries; ++i) {
+    EXPECT_EQ(prefix_sum, group_offsets[inv_permutation[i]]);
+
+    EXPECT_EQ(0u, group_codes[i].BitsWritten() % kBitsPerByte);
+    prefix_sum += group_codes[i].BitsWritten() / kBitsPerByte;
+
+    if (i + 1 < num_entries) {  // Consecutive groups must be contiguous.
+      EXPECT_EQ(
+          group_offsets[inv_permutation[i]] + group_sizes[inv_permutation[i]],
+          group_offsets[inv_permutation[i + 1]]);
+    }
+  }
+  EXPECT_EQ(prefix_sum, total_size);
+}
+
+TEST(TocTest, Test) {
+  Rng rng(0);  // One generator, seeded with 0, shared by all round trips.
+  for (size_t entries = 1; entries <= 9; ++entries) {
+    // Round-trip each TOC size without, then with, a permutation.
+    Roundtrip(entries, /*permute=*/false, &rng);
+    Roundtrip(entries, /*permute=*/true, &rng);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h b/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h
new file mode 100644
index 0000000000..9f4c10c76d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h
@@ -0,0 +1,413 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Transfer functions for color encodings.
+
+#if defined(LIB_JXL_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#undef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#else
+#define LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/rational_polynomial-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::TableLookupBytes;
+
+// Definitions for BT.2100-2 transfer functions (used inside/outside SIMD):
+// "display" is linear light (nits) normalized to [0, 1].
+// "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1].
+// "scene" is a linear function of photon counts, normalized to [0, 1].
+
+// Despite the stated ranges, we need unbounded transfer functions: see
+// http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or
+// above 1 due to chromatic adaptation. To avoid severe round-trip errors caused
+// by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see
+// https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb)
+// and extend the function domains above 1.
+
+// Hybrid Log-Gamma.
+class TF_HLG {
+ public:
+  // EOTF. e = encoded.
+  JXL_INLINE double DisplayFromEncoded(const double e) const {
+    return OOTF(InvOETF(e));
+  }
+
+  // Inverse EOTF. d = display.
+  JXL_INLINE double EncodedFromDisplay(const double d) const {
+    return OETF(InvOOTF(d));
+  }
+
+  // Maximum error 5e-7.
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+    const V below_div12 = Sqrt(Mul(Set(d, 3.0f), x));
+    const V e =
+        MulAdd(Set(d, kA * 0.693147181f),
+               FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC));
+    const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+ private:
+  // OETF (defines the HLG approach), odd-symmetric. s = scene, returns encoded.
+  JXL_INLINE double OETF(double s) const {
+    if (s == 0.0) return 0.0;
+    const double original_sign = s;  // Only the sign is used, via copysign.
+    s = std::abs(s);
+
+    if (s <= kDiv12) return std::copysign(std::sqrt(3.0 * s), original_sign);
+
+    const double e = kA * std::log(12 * s - kB) + kC;
+    JXL_ASSERT(e > 0.0);
+    return std::copysign(e, original_sign);  // double overload: no float loss.
+  }
+
+  // Inverse OETF, odd-symmetric. e = encoded, returns scene.
+  JXL_INLINE double InvOETF(double e) const {
+    if (e == 0.0) return 0.0;
+    const double original_sign = e;  // Only the sign is used, via copysign.
+    e = std::abs(e);
+
+    if (e <= 0.5) return std::copysign(e * e * (1.0 / 3), original_sign);
+
+    const double s = (std::exp((e - kC) * kRA) + kB) * kDiv12;
+    JXL_ASSERT(s >= 0);
+    return std::copysign(s, original_sign);  // double overload: no float loss.
+  }
+
+  // s = scene, returns display.
+  JXL_INLINE double OOTF(const double s) const {
+    // The actual (red channel) OOTF is RD = alpha * YS^(gamma-1) * RS, where
+    // YS = 0.2627 * RS + 0.6780 * GS + 0.0593 * BS. Let alpha = 1 so we return
+    // "display" (normalized [0, 1]) instead of nits. Our transfer function
+    // interface does not allow a dependency on YS. Fortunately, the system
+    // gamma at 334 nits is 1.0, so this reduces to RD = RS.
+    return s;
+  }
+
+  // d = display, returns scene.
+  JXL_INLINE double InvOOTF(const double d) const {
+    return d;  // see OOTF().
+  }
+
+  static constexpr double kA = 0.17883277;
+  static constexpr double kRA = 1.0 / kA;
+  static constexpr double kB = 1 - 4 * kA;
+  static constexpr double kC = 0.5599107295;
+  static constexpr double kDiv12 = 1.0 / 12;
+};
+
+class TF_709 {  // Piecewise curve: linear below a threshold, power law above.
+ public:
+  JXL_INLINE double EncodedFromDisplay(const double d) const {
+    if (d < kThresh) return kMulLow * d;  // NOTE(review): SIMD path uses <=.
+    return kMulHi * std::pow(d, kPowHi) + kSub;
+  }
+
+  // Maximum error 1e-6.
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    auto low = Mul(Set(d, kMulLow), x);  // Linear segment.
+    auto hi =
+        MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub));
+    return IfThenElse(Le(x, Set(d, kThresh)), low, hi);  // Per-lane select.
+  }
+
+  template <class D, class V>
+  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
+    auto low = Mul(Set(d, kInvMulLow), x);  // Inverse of the linear segment.
+    auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)),
+                       Set(d, kInvPowHi));
+    return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi);
+  }
+
+ private:
+  static constexpr double kThresh = 0.018;  // Segment boundary (display).
+  static constexpr double kMulLow = 4.5;
+  static constexpr double kMulHi = 1.099;
+  static constexpr double kPowHi = 0.45;
+  static constexpr double kSub = -0.099;
+
+  static constexpr double kInvThresh = 0.081;  // = kMulLow * kThresh.
+  static constexpr double kInvMulLow = 1 / 4.5;
+  static constexpr double kInvMulHi = 1 / 1.099;
+  static constexpr double kInvPowHi = 1 / 0.45;
+  static constexpr double kInvAdd = 0.099 * kInvMulHi;
+};
+
+// Perceptual Quantization
+class TF_PQ {
+ public:
+  // EOTF (defines the PQ approach), odd-symmetric. e = encoded.
+  JXL_INLINE double DisplayFromEncoded(double e) const {
+    if (e == 0.0) return 0.0;
+    const double original_sign = e;  // Only the sign is used, via copysign.
+    e = std::abs(e);
+
+    const double xp = std::pow(e, 1.0 / kM2);
+    const double num = std::max(xp - kC1, 0.0);  // Clamped to be non-negative.
+    const double den = kC2 - kC3 * xp;
+    JXL_DASSERT(den != 0.0);
+    const double d = std::pow(num / den, 1.0 / kM1);
+    JXL_DASSERT(d >= 0.0);  // Equal for e ~= 1E-9
+    return std::copysign(d, original_sign);  // double overload: no float loss.
+  }
+
+  // Maximum error 3e-6
+  template <class D, class V>
+  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+    // 4-over-4-degree rational polynomial approximation on x+x*x. This improves
+    // the maximum error by about 5x over a rational polynomial for x.
+    auto xpxx = MulAdd(x, x, x);
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f),
+        HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f),
+        HWY_REP4(5.50034862e-01f),
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f),
+        HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f),
+        HWY_REP4(2.67718770e+00f),
+    };
+    auto magnitude = EvalRationalPolynomial(d, xpxx, p, q);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+  // Inverse EOTF, odd-symmetric. d = display.
+  JXL_INLINE double EncodedFromDisplay(double d) const {
+    if (d == 0.0) return 0.0;
+    const double original_sign = d;  // Only the sign is used, via copysign.
+    d = std::abs(d);
+
+    const double xp = std::pow(d, kM1);
+    const double num = kC1 + xp * kC2;
+    const double den = 1.0 + xp * kC3;
+    const double e = std::pow(num / den, kM2);
+    JXL_DASSERT(e > 0.0);
+    return std::copysign(e, original_sign);  // double overload: no float loss.
+  }
+
+  // Maximum error 7e-7.
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+    // 4-over-4-degree rational polynomial approximation on x**0.25, with two
+    // different polynomials above and below 1e-4.
+    auto xto025 = Sqrt(Sqrt(x));
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f),
+        HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f),
+        HWY_REP4(4.838434e+01f),
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f),
+        HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f),
+        HWY_REP4(2.590418e+01f),
+    };
+
+    HWY_ALIGN constexpr float plo[(4 + 1) * 4] = {
+        HWY_REP4(9.863406e-06f),  HWY_REP4(3.881234e-01f),
+        HWY_REP4(1.352821e+02f),  HWY_REP4(6.889862e+04f),
+        HWY_REP4(-2.864824e+05f),
+    };
+    HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = {
+        HWY_REP4(3.371868e+01f),  HWY_REP4(1.477719e+03f),
+        HWY_REP4(1.608477e+04f),  HWY_REP4(-4.389884e+04f),
+        HWY_REP4(-2.072546e+05f),
+    };
+
+    auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)),
+                                EvalRationalPolynomial(d, xto025, plo, qlo),
+                                EvalRationalPolynomial(d, xto025, p, q));
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+ private:
+  static constexpr double kM1 = 2610.0 / 16384;
+  static constexpr double kM2 = (2523.0 / 4096) * 128;
+  static constexpr double kC1 = 3424.0 / 4096;
+  static constexpr double kC2 = (2413.0 / 4096) * 32;
+  static constexpr double kC3 = (2392.0 / 4096) * 32;
+};
+
+// sRGB
+class TF_SRGB {
+ public:
+  template <typename V>
+  JXL_INLINE V DisplayFromEncoded(V x) const {
+    const HWY_FULL(float) d;
+    const HWY_FULL(uint32_t) du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+
+    // TODO(janwas): range reduction
+    // Computed via af_cheb_rational (k=100); replicated 4x.
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f,
+        1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f,
+        1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f,
+        7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f,
+        8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f,
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        2.631846970e-01f,  2.631846970e-01f,  2.631846970e-01f,
+        2.631846970e-01f,  1.076976492e+00f,  1.076976492e+00f,
+        1.076976492e+00f,  1.076976492e+00f,  4.987528350e-01f,
+        4.987528350e-01f,  4.987528350e-01f,  4.987528350e-01f,
+        -5.512498495e-02f, -5.512498495e-02f, -5.512498495e-02f,
+        -5.512498495e-02f, 6.521209011e-03f,  6.521209011e-03f,
+        6.521209011e-03f,  6.521209011e-03f,
+    };
+    const V linear = Mul(x, Set(d, kLowDivInv));
+    const V poly = EvalRationalPolynomial(d, x, p, q);
+    const V magnitude =
+        IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+  // Error ~5e-07
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+
+    // Computed via af_cheb_rational (k=100); replicated 4x.
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        -5.135152395e-04f, -5.135152395e-04f, -5.135152395e-04f,
+        -5.135152395e-04f, 5.287254571e-03f,  5.287254571e-03f,
+        5.287254571e-03f,  5.287254571e-03f,  3.903842876e-01f,
+        3.903842876e-01f,  3.903842876e-01f,  3.903842876e-01f,
+        1.474205315e+00f,  1.474205315e+00f,  1.474205315e+00f,
+        1.474205315e+00f,  7.352629620e-01f,  7.352629620e-01f,
+        7.352629620e-01f,  7.352629620e-01f,
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f,
+        3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f,
+        1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f,
+        9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f,
+        2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f,
+    };
+    const V linear = Mul(x, Set(d, kLowDiv));
+    const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q);
+    const V magnitude =
+        IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+ private:
+  static constexpr float kThreshSRGBToLinear = 0.04045f;
+  static constexpr float kThreshLinearToSRGB = 0.0031308f;
+  static constexpr float kLowDiv = 12.92f;
+  static constexpr float kLowDivInv = 1.0f / kLowDiv;
+};
+
+// Linear to sRGB conversion with error of at most 1.2e-4.
+template <typename D, typename V>
+V FastLinearToSRGB(D d, V v) {
+  const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+  const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di;
+  // Convert to 0.25 - 0.5 range.
+  auto v025_05 = BitCast(
+      d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff)));
+  // third degree polynomial approximation between 0.25 and 0.5
+  // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves
+  // accuracy by about 3x.
+  auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f));
+  auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f));
+  auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f));
+  // Compute extra multiplier depending on exponent. Valid exponent range for
+  // [0.0031308f, 1.0) is 0...8 after subtracting 118.
+  // The next three constants contain a representation of the powers of
+  // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are
+  // always the same and in k2to512powers_basebits, and the two arrays contain
+  // the next groups of 8 bits. This ends up being a 22-bit representation (with
+  // a mantissa of 13 bits). The choice of polynomial to approximate is such
+  // that the multiplication factor has the highest 5 bits constant, and that
+  // the factor for the lowest possible exponent is a power of two (thus making
+  // the additional bits 0, which is used to correctly merge back together the
+  // floats).
+  constexpr uint32_t k2to512powers_basebits = 0x40000000;
+  HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = {
+      0x0,  0xa,  0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c,
+      0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6,
+  };
+  HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = {
+      0x0,  0xb7, 0x4,  0xd,  0xcb, 0xe7, 0x41, 0x68,
+      0x51, 0xd1, 0xeb, 0xf2, 0x0,  0xb7, 0x4,  0xd,
+  };
+  // Note that vld1q_s8_x2 on ARM seems to actually be slower.
+#if HWY_TARGET != HWY_SCALAR
+  using hwy::HWY_NAMESPACE::ShiftLeft;
+  using hwy::HWY_NAMESPACE::ShiftRight;
+  // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}.
+  auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118));
+  auto pow25to18bits = TableLookupBytes(
+      LoadDup128(di,
+                 reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)),
+      exp);
+  auto pow17to10bits = TableLookupBytes(
+      LoadDup128(di,
+                 reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)),
+      exp);
+  // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here
+  // we take advantage of the fact that each table has its position 0 equal to
+  // 0.
+  // We can now just reassemble the float.
+  auto mul = BitCast(
+      d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)),
+            Set(di, k2to512powers_basebits)));
+#else
+  // Fallback for scalar.
+  uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf;
+  auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) |
+                                    (k2to512powers_17to10bits[exp] << 10) |
+                                    k2to512powers_basebits));
+#endif
+  return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)),
+                    MulAdd(pow, mul, Set(d, -0.055)));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/transpose-inl.h b/third_party/jpeg-xl/lib/jxl/transpose-inl.h
new file mode 100644
index 0000000000..4674420737
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/transpose-inl.h
@@ -0,0 +1,203 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Block transpose for DCT/IDCT
+
+#if defined(LIB_JXL_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_TRANSPOSE_INL_H_
+#undef LIB_JXL_TRANSPOSE_INL_H_
+#else
+#define LIB_JXL_TRANSPOSE_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+#include <type_traits>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dct_block-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#ifndef JXL_INLINE_TRANSPOSE  // A definition made before this point is kept.
+// Workaround for issue #42 - (excessive?) inlining causes invalid codegen.
+#if defined(__arm__)
+#define JXL_INLINE_TRANSPOSE HWY_NOINLINE
+#else  // Not __arm__: request inlining of the transposes.
+#define JXL_INLINE_TRANSPOSE HWY_INLINE
+#endif
+#endif  // JXL_INLINE_TRANSPOSE
+
+// Calls f(args...) from a function that the compiler must not inline.
+template <typename Func, typename... Params>
+JXL_NOINLINE void NoInlineWrapper(const Func& f, const Params&... args) {
+  f(args...);
+}
+
+template <bool enabled>
+struct TransposeSimdTag {};  // Overload selector: true = SIMD, false = scalar.
+
+// TODO(veluca): it's not super useful to have this in the SIMD namespace.
+template <size_t ROWS_or_0, size_t COLS_or_0, class From, class To>
+JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag<false>,
+                                                const From& from, const To& to,
+                                                size_t ROWSp, size_t COLSp) {
+  // Scalar path. A template dimension of 0 defers to the runtime argument.
+  const size_t num_rows = (ROWS_or_0 != 0) ? ROWS_or_0 : ROWSp;
+  const size_t num_cols = (COLS_or_0 != 0) ? COLS_or_0 : COLSp;
+  for (size_t row = 0; row < num_rows; ++row)
+    for (size_t col = 0; col < num_cols; ++col) {
+      to.Write(from.Read(row, col), col, row);  // Indices swapped on write.
+    }
+}
+
+// TODO(veluca): AVX3?
+#if HWY_CAP_GE256
+constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) {
+  return ((ROWS % 8) | (COLS % 8)) == 0;  // Both are multiples of 8.
+}
+
+template <size_t ROWS_or_0, size_t COLS_or_0, class From, class To>
+JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag<true>,
+                                                const From& from, const To& to,
+                                                size_t ROWSp, size_t COLSp) {
+  size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0;  // 0: use runtime ROWSp.
+  size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0;  // 0: use runtime COLSp.
+  static_assert(MaxLanes(BlockDesc<8>()) == 8, "Invalid descriptor size");
+  static_assert(ROWS_or_0 % 8 == 0, "Invalid number of rows");
+  static_assert(COLS_or_0 % 8 == 0, "Invalid number of columns");
+  for (size_t n = 0; n < ROWS; n += 8) {  // Processed as 8x8 tiles.
+    for (size_t m = 0; m < COLS; m += 8) {
+      const BlockDesc<8> d;
+      auto i0 = from.LoadPart(d, n + 0, m + 0);  // i_k: source row n + k.
+      auto i1 = from.LoadPart(d, n + 1, m + 0);
+      auto i2 = from.LoadPart(d, n + 2, m + 0);
+      auto i3 = from.LoadPart(d, n + 3, m + 0);
+      auto i4 = from.LoadPart(d, n + 4, m + 0);
+      auto i5 = from.LoadPart(d, n + 5, m + 0);
+      auto i6 = from.LoadPart(d, n + 6, m + 0);
+      auto i7 = from.LoadPart(d, n + 7, m + 0);
+      // Surprisingly, this straightforward implementation (24 cycles on port5)
+      // is faster than load128+insert and LoadDup128+ConcatUpperLower+blend.
+      const auto q0 = InterleaveLower(d, i0, i2);  // Stage 1: rows 2 apart.
+      const auto q1 = InterleaveLower(d, i1, i3);
+      const auto q2 = InterleaveUpper(d, i0, i2);
+      const auto q3 = InterleaveUpper(d, i1, i3);
+      const auto q4 = InterleaveLower(d, i4, i6);
+      const auto q5 = InterleaveLower(d, i5, i7);
+      const auto q6 = InterleaveUpper(d, i4, i6);
+      const auto q7 = InterleaveUpper(d, i5, i7);
+      // Stage 2: interleave neighbouring stage-1 results.
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      const auto r4 = InterleaveLower(d, q4, q5);
+      const auto r5 = InterleaveUpper(d, q4, q5);
+      const auto r6 = InterleaveLower(d, q6, q7);
+      const auto r7 = InterleaveUpper(d, q6, q7);
+      // Stage 3: combine halves of r0..r3 (rows 0-3) with r4..r7 (rows 4-7).
+      i0 = ConcatLowerLower(d, r4, r0);
+      i1 = ConcatLowerLower(d, r5, r1);
+      i2 = ConcatLowerLower(d, r6, r2);
+      i3 = ConcatLowerLower(d, r7, r3);
+      i4 = ConcatUpperUpper(d, r4, r0);
+      i5 = ConcatUpperUpper(d, r5, r1);
+      i6 = ConcatUpperUpper(d, r6, r2);
+      i7 = ConcatUpperUpper(d, r7, r3);
+      to.StorePart(d, i0, m + 0, n + 0);  // Indices swapped vs. the loads.
+      to.StorePart(d, i1, m + 1, n + 0);
+      to.StorePart(d, i2, m + 2, n + 0);
+      to.StorePart(d, i3, m + 3, n + 0);
+      to.StorePart(d, i4, m + 4, n + 0);
+      to.StorePart(d, i5, m + 5, n + 0);
+      to.StorePart(d, i6, m + 6, n + 0);
+      to.StorePart(d, i7, m + 7, n + 0);
+    }
+  }
+}
+#elif HWY_TARGET != HWY_SCALAR
+constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) {
+  return ((ROWS % 4) | (COLS % 4)) == 0;  // Both are multiples of 4.
+}
+
+template <size_t ROWS_or_0, size_t COLS_or_0, class From, class To>
+JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag<true>,
+                                                const From& from, const To& to,
+                                                size_t ROWSp, size_t COLSp) {
+  size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0;  // 0: use runtime ROWSp.
+  size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0;  // 0: use runtime COLSp.
+  static_assert(MaxLanes(BlockDesc<4>()) == 4, "Invalid descriptor size");
+  static_assert(ROWS_or_0 % 4 == 0, "Invalid number of rows");
+  static_assert(COLS_or_0 % 4 == 0, "Invalid number of columns");
+  for (size_t n = 0; n < ROWS; n += 4) {  // Processed as 4x4 tiles.
+    for (size_t m = 0; m < COLS; m += 4) {
+      const BlockDesc<4> d;
+      const auto p0 = from.LoadPart(d, n + 0, m + 0);  // p_k: source row n + k.
+      const auto p1 = from.LoadPart(d, n + 1, m + 0);
+      const auto p2 = from.LoadPart(d, n + 2, m + 0);
+      const auto p3 = from.LoadPart(d, n + 3, m + 0);
+      // Stage 1: interleave rows that are two apart.
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+      // Stage 2: interleave neighbouring stage-1 results.
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      // Store with the row/column indices swapped relative to the loads.
+      to.StorePart(d, r0, m + 0, n + 0);
+      to.StorePart(d, r1, m + 1, n + 0);
+      to.StorePart(d, r2, m + 2, n + 0);
+      to.StorePart(d, r3, m + 3, n + 0);
+    }
+  }
+}
+#else
+constexpr bool TransposeUseSimd(size_t, size_t) { return false; }  // No SIMD.
+#endif
+
+template <size_t N, size_t M, typename = void>
+struct Transpose {
+  template <typename From, typename To>
+  static void Run(const From& from, const To& to) {
+    // Cheap guard against an in-place call. Only the first element's address
+    // is compared, so it cannot catch every overlap.
+    JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0));
+    using Tag = TransposeSimdTag<TransposeUseSimd(N, M)>;
+    GenericTransposeBlock<N, M>(Tag(), from, to, N, M);
+  }
+};
+
+// Avoid inlining and unrolling transposes for large blocks.
+template <size_t N, size_t M>
+struct Transpose<
+    N, M, typename std::enable_if<(N >= 8 && M >= 8 && N * M >= 512)>::type> {
+  template <typename From, typename To>
+  static void Run(const From& from, const To& to) {
+    // Cheap guard against an in-place call. Only the first element's address
+    // is compared, so it cannot catch every overlap.
+    JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0));
+    using Tag = TransposeSimdTag<TransposeUseSimd(N, M)>;
+    using Fn = void (*)(Tag, const From&, const To&, size_t, size_t);
+    // <0, 0>: the block size is passed at run time instead of being baked in.
+    constexpr Fn transpose = GenericTransposeBlock<0, 0, From, To>;
+    NoInlineWrapper(transpose, Tag(), from, to, N, M);
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_TRANSPOSE_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/version.h.in b/third_party/jpeg-xl/lib/jxl/version.h.in
new file mode 100644
index 0000000000..d077abec79
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/version.h.in
@@ -0,0 +1,39 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file version.h
+ * @brief libjxl version information
+ */
+
+#ifndef JXL_VERSION_H_
+#define JXL_VERSION_H_
+
+#define JPEGXL_MAJOR_VERSION @JPEGXL_MAJOR_VERSION@ ///< JPEG XL Major version
+#define JPEGXL_MINOR_VERSION @JPEGXL_MINOR_VERSION@ ///< JPEG XL Minor version
+#define JPEGXL_PATCH_VERSION @JPEGXL_PATCH_VERSION@ ///< JPEG XL Patch version
+
+/** Can be used to conditionally compile code for a specific JXL version
+ * @param major major version
+ * @param minor minor version
+ * @param patch patch version
+ *
+ * @code
+ * #if JPEGXL_NUMERIC_VERSION < JPEGXL_COMPUTE_NUMERIC_VERSION(0,8,0)
+ * // use old/deprecated api
+ * #else
+ * // use current api
+ * #endif
+ * @endcode */
+#define JPEGXL_COMPUTE_NUMERIC_VERSION(major,minor,patch) (((major)<<24) | ((minor)<<16) | ((patch)<<8) | 0)
+
+/* Numeric representation of the version */
+#define JPEGXL_NUMERIC_VERSION JPEGXL_COMPUTE_NUMERIC_VERSION(JPEGXL_MAJOR_VERSION,JPEGXL_MINOR_VERSION,JPEGXL_PATCH_VERSION)
+
+#endif /* JXL_VERSION_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h b/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h
new file mode 100644
index 0000000000..a473d591f2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h
@@ -0,0 +1,103 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast but weak random generator.
+
+#if defined(LIB_JXL_XORSHIFT128PLUS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_XORSHIFT128PLUS_INL_H_
+#undef LIB_JXL_XORSHIFT128PLUS_INL_H_
+#else
+#define LIB_JXL_XORSHIFT128PLUS_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Xor;
+
+// Adapted from https://github.com/vpxyz/xorshift/blob/master/xorshift128plus/
+// (MIT-license)
+class Xorshift128Plus {
+ public:
+  // 8 independent generators (= single iteration for AVX-512)
+  enum { N = 8 };
+  // Seeds all generators from one value; the states form a single chain.
+  explicit HWY_MAYBE_UNUSED Xorshift128Plus(const uint64_t seed) {
+    // Init state using SplitMix64 generator
+    s0_[0] = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
+    s1_[0] = SplitMix64(s0_[0]);
+    for (size_t i = 1; i < N; ++i) {
+      s0_[i] = SplitMix64(s1_[i - 1]);
+      s1_[i] = SplitMix64(s0_[i]);
+    }
+  }
+  // Seeds from four 32-bit words: (seed1, seed2) -> s0_, (seed3, seed4) -> s1_.
+  HWY_MAYBE_UNUSED Xorshift128Plus(const uint32_t seed1, const uint32_t seed2,
+                                   const uint32_t seed3, const uint32_t seed4) {
+    // Init state using SplitMix64 generator
+    s0_[0] = SplitMix64(((static_cast<uint64_t>(seed1) << 32) + seed2) +
+                        0x9E3779B97F4A7C15ull);
+    s1_[0] = SplitMix64(((static_cast<uint64_t>(seed3) << 32) + seed4) +
+                        0x9E3779B97F4A7C15ull);
+    for (size_t i = 1; i < N; ++i) {
+      s0_[i] = SplitMix64(s0_[i - 1]);
+      s1_[i] = SplitMix64(s1_[i - 1]);
+    }
+  }
+  // Writes N new 64-bit values to random_bits and advances every generator.
+  HWY_INLINE HWY_MAYBE_UNUSED void Fill(uint64_t* HWY_RESTRICT random_bits) {
+#if HWY_CAP_INTEGER64
+    const HWY_CAPPED(uint64_t, N) d;  // <= N lanes: never overruns s0_/s1_.
+    for (size_t i = 0; i < N; i += Lanes(d)) {
+      auto s1 = Load(d, s0_ + i);  // Naming as in the reference: s1 <- s0_.
+      const auto s0 = Load(d, s1_ + i);
+      const auto bits = Add(s1, s0);  // b, c
+      Store(s0, d, s0_ + i);
+      s1 = Xor(s1, ShiftLeft<23>(s1));
+      Store(bits, d, random_bits + i);
+      s1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
+      Store(s1, d, s1_ + i);
+    }
+#else
+    for (size_t i = 0; i < N; ++i) {
+      auto s1 = s0_[i];
+      const auto s0 = s1_[i];
+      const auto bits = s1 + s0;  // b, c
+      s0_[i] = s0;
+      s1 ^= s1 << 23;
+      random_bits[i] = bits;
+      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+      s1_[i] = s1;
+    }
+#endif
+  }
+
+ private:
+  static uint64_t SplitMix64(uint64_t z) {  // Bit mixer used by the ctors.
+    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+    return z ^ (z >> 31);
+  }
+  // State of generator i is the pair (s0_[i], s1_[i]).
+  HWY_ALIGN uint64_t s0_[N];
+  HWY_ALIGN uint64_t s1_[N];
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_XORSHIFT128PLUS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc b/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc
new file mode 100644
index 0000000000..2b0c78b1d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/xorshift128plus_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/xorshift128plus-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Define to nonzero in order to print the (new) golden outputs.
+#define PRINT_RESULTS 0
+
+const size_t kVectors = 64;
+
+#if PRINT_RESULTS
+
+// Prints one batch of lanes as a braced initializer row (golden-output form).
+template <int kNumLanes>
+void Print(const uint64_t (&result)[kNumLanes]) {
+  printf("{ ");
+  for (int i = 0; i < kNumLanes; ++i) {
+    if (i != 0) printf(", ");
+    // uint64_t is not `unsigned long long` on every ABI; cast to match %llX.
+    printf("0x%016llXull", static_cast<unsigned long long>(result[i]));
+  }
+  printf("},\n");
+}
+
+#else  // PRINT_RESULTS
+
+const uint64_t kExpected[kVectors][Xorshift128Plus::N] = {
+    {0x6E901576D477CBB1ull, 0xE9E53789195DA2A2ull, 0xB681F6DDA5E0AE99ull,
+     0x8EFD18CE21FD6896ull, 0xA898A80DF75CF532ull, 0x50CEB2C9E2DE7E32ull,
+     0x3CA7C2FEB25C0DD0ull, 0xA4D0866B80B4D836ull},
+    {0x8CD6A1E6233D3A26ull, 0x3D4603ADE98B112Dull, 0xDC427AF674019E36ull,
+     0xE28B4D230705AC53ull, 0x7297E9BBA88783DDull, 0x34D3D23CFCD9B41Aull,
+     0x5A223615ADBE96B8ull, 0xE5EB529027CFBD01ull},
+    {0xC1894CF00DFAC6A2ull, 0x18EDF8AE9085E404ull, 0x8E936625296B4CCDull,
+     0x31971EF3A14A899Bull, 0xBE87535FCE0BF26Aull, 0x576F7A752BC6649Full,
+     0xA44CBADCE0C6B937ull, 0x3DBA819BB17A353Aull},
+    {0x27CE38DFCC1C5EB6ull, 0x920BEB5606340256ull, 0x3986CBC40C9AFC2Cull,
+     0xE22BCB3EEB1E191Eull, 0x6E1FCDD3602A8FBAull, 0x052CB044E5415A29ull,
+     0x46266646EFB9ECD7ull, 0x8F44914618D29335ull},
+    {0xDD30AEDF72A362C5ull, 0xBC1D824E16BB98F4ull, 0x9EA6009C2AA3D2F1ull,
+     0xF65C0FBBE17AF081ull, 0x22424D06A8738991ull, 0x8A62763F2B7611D2ull,
+     0x2F3E89F722637939ull, 0x84D338BEF50AFD50ull},
+    {0x00F46494898E2B0Bull, 0x81239DC4FB8E8003ull, 0x414AD93EC5773FE7ull,
+     0x791473C450E4110Full, 0x87F127BF68C959ACull, 0x6429282D695EF67Bull,
+     0x661082E11546CBA8ull, 0x5815D53FA5436BFDull},
+    {0xB3DEADAB9BE6E0F9ull, 0xAA1B7B8F7CED0202ull, 0x4C5ED437699D279Eull,
+     0xA4471727F1CB39D3ull, 0xE439DA193F802F70ull, 0xF89401BB04FA6493ull,
+     0x3B08045A4FE898BAull, 0x32137BFE98227950ull},
+    {0xFBAE4A092897FEF3ull, 0x0639F6CE56E71C8Eull, 0xF0AD6465C07F0C1Eull,
+     0xFF8E28563361DCE5ull, 0xC2013DB7F86BC6B9ull, 0x8EFCC0503330102Full,
+     0x3F6B767EA5C4DA40ull, 0xB9864B950B2232E1ull},
+    {0x76EB58DE8E5EC22Aull, 0x9BBBF49A18B32F4Full, 0xC8405F02B2B2FAB9ull,
+     0xC3E122A5F146BC34ull, 0xC90BB046660F5765ull, 0xB933981310DBECCFull,
+     0x5A2A7BFC9126FD1Cull, 0x8BB388C94DF87901ull},
+    {0x753EB89AD63EF3C3ull, 0xF24AAF40C89D65ADull, 0x23F68931C1A6AA6Dull,
+     0xF47E79BF702C6DD0ull, 0xA3AD113244EE7EAEull, 0xD42CBEA28F793DC3ull,
+     0xD896FCF1820F497Cull, 0x042B86D2818948C1ull},
+    {0x8F2A4FC5A4265763ull, 0xEC499E6F95EAA10Cull, 0xE3786D4ECCD0DEB5ull,
+     0xC725C53D3AC4CC43ull, 0x065A4ACBBF83610Eull, 0x35C61C9FEF167129ull,
+     0x7B720AEAA7D70048ull, 0x14206B841377D039ull},
+    {0xAD27D78BF96055F6ull, 0x5F43B20FF47ADCD4ull, 0xE184C2401E2BF71Eull,
+     0x30B263D78990045Dull, 0xC22F00EBFF9BA201ull, 0xAE7F86522B53A562ull,
+     0x2853312BC039F0A4ull, 0x868D619E6549C3C8ull},
+    {0xFD5493D8AE9A8371ull, 0x773D5E224DF61B3Bull, 0x5377C54FBB1A8280ull,
+     0xCAD4DE3B8265CAFAull, 0xCDF3F19C91EBD5F6ull, 0xC8EA0F182D73BD78ull,
+     0x220502D593433FF1ull, 0xB81205E612DC31B1ull},
+    {0x8F32A39EAEDA4C70ull, 0x1D4B0914AA4DAC7Full, 0x56EF1570F3A8B405ull,
+     0x29812CB17404A592ull, 0x97A2AAF69CAE90F2ull, 0x12BF5E02778BBFE5ull,
+     0x9D4B55AD42A05FD2ull, 0x06C2BAB5E6086620ull},
+    {0x8DB4B9648302B253ull, 0xD756AD9E3AEA12C7ull, 0x68709B7F11D4B188ull,
+     0x7CC299DDCD707A4Bull, 0x97B860C370A7661Dull, 0xCECD314FC20E64F5ull,
+     0x55F412CDFB4C7EC3ull, 0x55EE97591193B525ull},
+    {0xCF70F3ACA96E6254ull, 0x022FEDECA2E09F46ull, 0x686823DB60AE1ECFull,
+     0xFD36190D3739830Eull, 0x74E1C09027F68120ull, 0xB5883A835C093842ull,
+     0x93E1EFB927E9E4E3ull, 0xB2721E249D7E5EBEull},
+    {0x69B6E21C44188CB8ull, 0x5D6CFB853655A7AAull, 0x3E001A0B425A66DCull,
+     0x8C57451103A5138Full, 0x7BF8B4BE18EAB402ull, 0x494102EB8761A365ull,
+     0xB33796A9F6A81F0Eull, 0x10005AB3BCCFD960ull},
+    {0xB2CF25740AE965DCull, 0x6F7C1DF7EF53D670ull, 0x648DD6087AC2251Eull,
+     0x040955D9851D487Dull, 0xBD550FC7E21A7F66ull, 0x57408F484DEB3AB5ull,
+     0x481E24C150B506C1ull, 0x72C0C3EAF91A40D6ull},
+    {0x1997A481858A5D39ull, 0x539718F4BEF50DC1ull, 0x2EC4DC4787E7E368ull,
+     0xFF1CE78879419845ull, 0xE219A93DD6F6DD30ull, 0x85328618D02FEC1Aull,
+     0xC86E02D969181B20ull, 0xEBEC8CD8BBA34E6Eull},
+    {0x28B55088A16CE947ull, 0xDD25AC11E6350195ull, 0xBD1F176694257B1Cull,
+     0x09459CCF9FCC9402ull, 0xF8047341E386C4E4ull, 0x7E8E9A9AD984C6C0ull,
+     0xA4661E95062AA092ull, 0x70A9947005ED1152ull},
+    {0x4C01CF75DBE98CCDull, 0x0BA076CDFC7373B9ull, 0x6C5E7A004B57FB59ull,
+     0x336B82297FD3BC56ull, 0x7990C0BE74E8D60Full, 0xF0275CC00EC5C8C8ull,
+     0x6CF29E682DFAD2E9ull, 0xFA4361524BD95D72ull},
+    {0x631D2A19FF62F018ull, 0x41C43863B985B3FAull, 0xE052B2267038EFD9ull,
+     0xE2A535FAC575F430ull, 0xE004EEA90B1FF5B8ull, 0x42DFE2CA692A1F26ull,
+     0x90FB0BFC9A189ECCull, 0x4484102BD3536BD0ull},
+    {0xD027134E9ACCA5A5ull, 0xBBAB4F966D476A9Bull, 0x713794A96E03D693ull,
+     0x9F6335E6B94CD44Aull, 0xC5090C80E7471617ull, 0x6D9C1B0C87B58E33ull,
+     0x1969CE82E31185A5ull, 0x2099B97E87754EBEull},
+    {0x60EBAF4ED934350Full, 0xC26FBF0BA5E6ECFFull, 0x9E54150F0312EC57ull,
+     0x0973B48364ED0041ull, 0x800A523241426CFCull, 0x03AB5EC055F75989ull,
+     0x8CF315935DEEB40Aull, 0x83D3FC0190BD1409ull},
+    {0x26D35394CF720A51ull, 0xCE9EAA15243CBAFEull, 0xE2B45FBAF21B29E0ull,
+     0xDB92E98EDE73F9E0ull, 0x79B16F5101C26387ull, 0x1AC15959DE88C86Full,
+     0x387633AEC6D6A580ull, 0xA6FC05807BFC5EB8ull},
+    {0x2D26C8E47C6BADA9ull, 0x820E6EC832D52D73ull, 0xB8432C3E0ED0EE5Bull,
+     0x0F84B3C4063AAA87ull, 0xF393E4366854F651ull, 0x749E1B4D2366A567ull,
+     0x805EACA43480D004ull, 0x244EBF3AA54400A5ull},
+    {0xBFDC3763AA79F75Aull, 0x9E3A74CC751F41DBull, 0xF401302A149DBC55ull,
+     0x6B25F7973D7BF7BCull, 0x13371D34FDBC3DAEull, 0xC5E1998C8F484DCDull,
+     0x7031B8AE5C364464ull, 0x3847F0C4F3DA2C25ull},
+    {0x24C6387D2C0F1225ull, 0x77CCE960255C67A4ull, 0x21A0947E497B10EBull,
+     0xBB5DB73A825A9D7Eull, 0x26294A41999E553Dull, 0x3953E0089F87D925ull,
+     0x3DAE6E5D4E5EAAFEull, 0x74B545460341A7AAull},
+    {0x710E5EB08A7DB820ull, 0x7E43C4E77CAEA025ull, 0xD4C91529C8B060C1ull,
+     0x09AE26D8A7B0CA29ull, 0xAB9F356BB360A772ull, 0xB68834A25F19F6E9ull,
+     0x79B8D9894C5734E2ull, 0xC6847E7C8FFD265Full},
+    {0x10C4BCB06A5111E6ull, 0x57CB50955B6A2516ull, 0xEF53C87798B6995Full,
+     0xAB38E15BBD8D0197ull, 0xA51C6106EFF73C93ull, 0x83D7F0E2270A7134ull,
+     0x0923FD330397FCE5ull, 0xF9DE54EDFE58FB45ull},
+    {0x07D44833ACCD1A94ull, 0xAAD3C9E945E2F9F3ull, 0xABF4C879B876AA37ull,
+     0xF29C69A21B301619ull, 0x2DDCE959111C788Bull, 0x7CEDB48F8AC1729Bull,
+     0x93F3BA9A02B659BEull, 0xF20A87FF17933CBEull},
+    {0x8E96EBE93180CFE6ull, 0x94CAA12873937079ull, 0x05F613D9380D4189ull,
+     0xBCAB40C1DC79F38Aull, 0x0AD8907B7C61D19Eull, 0x88534E189D103910ull,
+     0x2DB2FAABA160AB8Full, 0xA070E7506B06F15Cull},
+    {0x6FB1FCDAFFEF87A9ull, 0xE735CF25337A090Dull, 0x172C6EDCEFEF1825ull,
+     0x76957EA49EF0542Dull, 0x819BF4CD250F7C49ull, 0xD6FF23E4AD00C4D4ull,
+     0xE79673C1EC358FF0ull, 0xAC9C048144337938ull},
+    {0x4C5387FF258B3AF4ull, 0xEDB68FAEC2CB1AA3ull, 0x02A624E67B4E1DA4ull,
+     0x5C44797A38E08AF2ull, 0x36546A70E9411B4Bull, 0x47C17B24D2FD9675ull,
+     0x101957AAA020CA26ull, 0x47A1619D4779F122ull},
+    {0xF84B8BCDC92D9A3Cull, 0x951D7D2C74B3066Bull, 0x7AC287C06EDDD9B2ull,
+     0x4C38FC476608D38Full, 0x224D793B19CB4BCDull, 0x835A255899BF1A41ull,
+     0x4AD250E9F62DB4ABull, 0xD9B44F4B58781096ull},
+    {0xABBAF99A8EB5C6B8ull, 0xFB568E900D3A9F56ull, 0x11EDF63D23C5DF11ull,
+     0xA9C3011D3FA7C5A8ull, 0xAEDD3CF11AFFF725ull, 0xABCA472B5F1EDD6Bull,
+     0x0600B6BB5D879804ull, 0xDB4DE007F22191A0ull},
+    {0xD76CC9EFF0CE9392ull, 0xF5E0A772B59BA49Aull, 0x7D1AE1ED0C1261B5ull,
+     0x79224A33B5EA4F4Aull, 0x6DD825D80C40EA60ull, 0x47FC8E747E51C953ull,
+     0x695C05F72888BF98ull, 0x1A012428440B9015ull},
+    {0xD754DD61F9B772BFull, 0xC4A2FCF4C0F9D4EBull, 0x461167CDF67A24A2ull,
+     0x434748490EBCB9D4ull, 0x274DD9CDCA5781DEull, 0x36BAC63BA9A85209ull,
+     0x30324DAFDA36B70Full, 0x337570DB4FE6DAB3ull},
+    {0xF46CBDD57C551546ull, 0x8E02507E676DA3E3ull, 0xD826245A8C15406Dull,
+     0xDFB38A5B71113B72ull, 0x5EA38454C95B16B5ull, 0x28C054FB87ABF3E1ull,
+     0xAA2724C0BA1A8096ull, 0xECA83EC980304F2Full},
+    {0x6AA76EC294EB3303ull, 0x42D4CDB2A8032E3Bull, 0x7999EDF75DCD8735ull,
+     0xB422BFFE696CCDCCull, 0x8F721461FD7CCDFEull, 0x148E1A5814FDE253ull,
+     0x4DC941F4375EF8FFull, 0x27B2A9E0EB5B49CFull},
+    {0xCEA592EF9343EBE1ull, 0xF7D38B5FA7698903ull, 0x6CCBF352203FEAB6ull,
+     0x830F3095FCCDA9C5ull, 0xDBEEF4B81B81C8F4ull, 0x6D7EB9BCEECA5CF9ull,
+     0xC58ABB0FBE436C69ull, 0xE4B97E6DB2041A4Bull},
+    {0x7E40FC772978AF14ull, 0xCDDA4BBAE28354A1ull, 0xE4F993B832C32613ull,
+     0xD3608093C68A4B35ull, 0x9A3B60E01BEE3699ull, 0x03BEF248F3288713ull,
+     0x70B9294318F3E9B4ull, 0x8D2ABB913B8610DEull},
+    {0x37F209128E7D8B2Cull, 0x81D2AB375BD874BCull, 0xA716A1B7373F7408ull,
+     0x0CEE97BEC4706540ull, 0xA40C5FD9CDBC1512ull, 0x73CAF6C8918409E7ull,
+     0x45E11BCEDF0BBAA1ull, 0x612C612BFF6E6605ull},
+    {0xF8ECB14A12D0F649ull, 0xDA683CD7C01BA1ACull, 0xA2203F7510E124C1ull,
+     0x7F83E52E162F3C78ull, 0x77D2BB73456ACADBull, 0x37FC34FC840BBA6Full,
+     0x3076BC7D4C6EBC1Full, 0x4F514123632B5FA9ull},
+    {0x44D789DED935E884ull, 0xF8291591E09FEC9Full, 0xD9CED2CF32A2E4B7ull,
+     0x95F70E1EB604904Aull, 0xDE438FE43C14F6ABull, 0x4C8D23E4FAFCF8D8ull,
+     0xC716910A3067EB86ull, 0x3D6B7915315095D3ull},
+    {0x3170FDBADAB92095ull, 0x8F1963933FC5650Bull, 0x72F94F00ABECFEABull,
+     0x6E3AE826C6AAB4CEull, 0xA677A2BF31068258ull, 0x9660CDC4F363AF10ull,
+     0xD81A15A152379EF1ull, 0x5D7D285E1080A3F9ull},
+    {0xDAD5DDFF9A2249B3ull, 0x6F9721D926103FAEull, 0x1418CBB83FFA349Aull,
+     0xE71A30AD48C012B2ull, 0xBE76376C63751132ull, 0x3496467ACA713AE6ull,
+     0x8D7EC01369F991A3ull, 0xD8C73A88B96B154Eull},
+    {0x8B5D9C74AEB4833Aull, 0xF914FB3F867B912Full, 0xB894EA034936B1DCull,
+     0x8A16D21BE51C4F5Bull, 0x31FF048ED582D98Eull, 0xB95AB2F4DC65B820ull,
+     0x04082B9170561AF7ull, 0xA215610A5DC836FAull},
+    {0xB2ADE592C092FAACull, 0x7A1E683BCBF13294ull, 0xC7A4DBF86858C096ull,
+     0x3A49940F97BFF316ull, 0xCAE5C06B82C46703ull, 0xC7F413A0F951E2BDull,
+     0x6665E7BB10EB5916ull, 0x86F84A5A94EDE319ull},
+    {0x4EA199D8FAA79CA3ull, 0xDFA26E5BF1981704ull, 0x0F5E081D37FA4E01ull,
+     0x9CB632F89CD675CDull, 0x4A09DB89D48C0304ull, 0x88142742EA3C7672ull,
+     0xAC4F149E6D2E9BDBull, 0x6D9E1C23F8B1C6C6ull},
+    {0xD58BE47B92DEC0E9ull, 0x8E57573645E34328ull, 0x4CC094CCB5FB5126ull,
+     0x5F1D66AF6FB40E3Cull, 0x2BA15509132D3B00ull, 0x0D6545646120E567ull,
+     0x3CF680C45C223666ull, 0x96B28E32930179DAull},
+    {0x5900C45853AC7990ull, 0x61881E3E8B7FF169ull, 0x4DE5F835DF2230FFull,
+     0x4427A9E7932F73FFull, 0x9B641BAD379A8C8Dull, 0xDF271E5BF98F4E5Cull,
+     0xDFDA16DB830FF5EEull, 0x371C7E7CFB89C0E9ull},
+    {0x4410A8576247A250ull, 0x6AD2DA12B45AC0D9ull, 0x18DFC72AAC85EECCull,
+     0x06FC8BB2A0EF25C8ull, 0xEB287619C85E6118ull, 0x19553ECA67F25A2Cull,
+     0x3B9557F1DCEC5BAAull, 0x7BAD9E8B710D1079ull},
+    {0x34F365D66BD22B28ull, 0xE6E124B9F10F835Dull, 0x0573C38ABF2B24DCull,
+     0xD32E6AF10A0125AEull, 0x383590ACEA979519ull, 0x8376ED7A39E28205ull,
+     0xF0B7F184DCBDA435ull, 0x062A203390E31794ull},
+    {0xA2AFFD7E41918760ull, 0x7F90FC1BD0819C86ull, 0x5033C08E5A969533ull,
+     0x2707AF5C6D039590ull, 0x57BBD5980F17DF9Cull, 0xD3FE6E61D763268Aull,
+     0x9E0A0AE40F335A3Bull, 0x43CF4EB0A99613C5ull},
+    {0xD4D2A397CE1A7C2Eull, 0x3DF7CE7CC3212DADull, 0x0880F0D5D356C75Aull,
+     0xA8AFC44DD03B1346ull, 0x79263B46C13A29E0ull, 0x11071B3C0ED58E7Aull,
+     0xED46DC9F538406BFull, 0x2C94974F2B94843Dull},
+    {0xE246E13C39AB5D5Eull, 0xAC1018489D955B20ull, 0x8601B558771852B8ull,
+     0x110BD4C06DB40173ull, 0x738FC8A18CCA0EBBull, 0x6673E09BE0EA76E5ull,
+     0x024BC7A0C7527877ull, 0x45E6B4652E2EC34Eull},
+    {0xD1ED26A1A375CDC8ull, 0xAABC4E896A617CB8ull, 0x0A9C9E8E57D753C6ull,
+     0xA3774A75FEB4C30Eull, 0x30B816C01C93E49Eull, 0xF405BABC06D2408Cull,
+     0xCC0CE6B4CE788ABCull, 0x75E7922D0447956Cull},
+    {0xD07C1676A698BC95ull, 0x5F9AEA4840E2D860ull, 0xD5FC10D58BDF6F02ull,
+     0xF190A2AD4BC2EEA7ull, 0x0C24D11F51726931ull, 0xDB646899A16B6512ull,
+     0x7BC10670047B1DD8ull, 0x2413A5ABCD45F092ull},
+    {0x4E66892190CFD923ull, 0xF10162440365EC8Eull, 0x158ACA5A6A2280AEull,
+     0x0D60ED11C0224166ull, 0x7CD2E9A71B9D7488ull, 0x450D7289706AB2A3ull,
+     0x88FAE34EC9A0D7DCull, 0x96FF9103575A97DAull},
+    {0x77990FAC6046C446ull, 0xB174B5FB30C76676ull, 0xE352CE3EB56CF82Aull,
+     0xC6039B6873A9A082ull, 0xE3F80F3AE333148Aull, 0xB853BA24BA3539B9ull,
+     0xE8863E52ECCB0C74ull, 0x309B4CC1092CC245ull},
+    {0xBC2B70BEE8388D9Full, 0xE48D92AE22216DCEull, 0xF15F3BF3E2C15D8Full,
+     0x1DD964D4812D8B24ull, 0xD56AF02FB4665E4Cull, 0x98002200595BD9A3ull,
+     0x049246D50BB8FA12ull, 0x1B542DF485B579B9ull},
+    {0x2347409ADFA8E497ull, 0x36015C2211D62498ull, 0xE9F141F32EB82690ull,
+     0x1F839912D0449FB9ull, 0x4E4DCFFF2D02D97Cull, 0xF8A03AB4C0F625C9ull,
+     0x0605F575795DAC5Cull, 0x4746C9BEA0DDA6B1ull},
+    {0xCA5BB519ECE7481Bull, 0xFD496155E55CA945ull, 0xF753B9DBB1515F81ull,
+     0x50549E8BAC0F70E7ull, 0x8614FB0271E21C60ull, 0x60C72947EB0F0070ull,
+     0xA6511C10AEE742B6ull, 0x48FB48F2CACCB43Eull}};
+
+#endif  // PRINT_RESULTS
+
+// Golden test: a fixed seed must keep producing the recorded kExpected values.
+void TestGolden() {
+  HWY_ALIGN Xorshift128Plus generator(12345);
+  for (uint64_t batch = 0; batch < kVectors; ++batch) {
+    HWY_ALIGN uint64_t actual[Xorshift128Plus::N];
+    generator.Fill(actual);
+#if PRINT_RESULTS
+    Print(actual);  // Emit the values instead of checking them.
+#else
+    for (size_t lane = 0; lane < Xorshift128Plus::N; ++lane) {
+      ASSERT_EQ(kExpected[batch][lane], actual[lane])
+          << "Where vector=" << batch << " i=" << lane;
+    }
+#endif
+  }
+}
+
+// Distinct seeds must lead to distinct outputs: the first value of generator
+// 0 is collected for every seed and checked for duplicates.
+void TestSeedChanges() {
+  constexpr size_t kNumSeeds = 16384;  // All 14-bit seeds
+  std::vector<uint64_t> first_outputs;
+  first_outputs.reserve(kNumSeeds);
+
+  HWY_ALIGN uint64_t lanes[Xorshift128Plus::N];
+  for (size_t seed = 0; seed < kNumSeeds; ++seed) {
+    HWY_ALIGN Xorshift128Plus rng(seed);
+    rng.Fill(lanes);
+    first_outputs.push_back(lanes[0]);
+  }
+  ASSERT_EQ(kNumSeeds, first_outputs.size());
+
+  // All outputs are unique: dropping duplicates must not shrink the list.
+  std::sort(first_outputs.begin(), first_outputs.end());
+  const auto unique_end =
+      std::unique(first_outputs.begin(), first_outputs.end());
+  first_outputs.erase(unique_end, first_outputs.end());
+  EXPECT_EQ(kNumSeeds, first_outputs.size());
+}
+
+void TestFloat() {
+  test::ThreadPoolForTests pool(8);
+  // One task per seed: turn random bits into floats, check range and mean.
+#ifdef JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 256;
+#else   // JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 4096;
+#endif  // JXL_DISABLE_SLOW_TESTS
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, kMaxSeed, ThreadPool::NoInit,
+      [](const uint32_t seed, size_t /*thread*/) {
+        HWY_ALIGN Xorshift128Plus rng(seed);
+
+        const HWY_FULL(uint32_t) du;
+        const HWY_FULL(float) df;
+        HWY_ALIGN uint64_t batch[Xorshift128Plus::N];
+        HWY_ALIGN float lanes[MaxLanes(df)];
+        double sum = 0.0;
+        size_t count = 0;
+        const size_t kReps = 2000;
+        for (size_t reps = 0; reps < kReps; ++reps) {
+          rng.Fill(batch);
+          for (size_t i = 0; i < Xorshift128Plus::N * 2; i += Lanes(df)) {
+            const auto bits =  // batch is read as 2 * N 32-bit words.
+                Load(du, reinterpret_cast<const uint32_t*>(batch) + i);
+            // 1.0 + 23 random mantissa bits = [1, 2)
+            const auto rand12 =
+                BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000)));
+            const auto rand01 = Sub(rand12, Set(df, 1.0f));
+            Store(rand01, df, lanes);
+            for (float lane : lanes) {  // Visits all MaxLanes(df) entries.
+              sum += lane;
+              count += 1;
+              EXPECT_LE(lane, 1.0f);
+              EXPECT_GE(lane, 0.0f);
+            }
+          }
+        }
+
+        // Verify average (uniform distribution)
+        EXPECT_NEAR(0.5, sum / count, 0.00702);
+      },
+      "TestXorShift"));
+}
+
+// Each seed may yield at most one all-zero 64-bit output over 10000 batches.
+void TestNotZero() {
+  test::ThreadPoolForTests pool(8);
+
+#ifdef JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 500;
+#else   // JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 2000;
+#endif  // JXL_DISABLE_SLOW_TESTS
+  // The task index doubles as the seed; the thread index is unused.
+  const auto count_zeros = [](const uint32_t task, size_t /*thread*/) {
+    HWY_ALIGN Xorshift128Plus rng(task);
+    HWY_ALIGN uint64_t lanes[Xorshift128Plus::N];
+    size_t num_zero = 0;
+    for (size_t batch = 0; batch < 10000; ++batch) {
+      rng.Fill(lanes);
+      for (const uint64_t lane : lanes) {
+        if (lane == 0) ++num_zero;
+      }
+    }
+    EXPECT_LE(num_zero, 1u);
+  };
+
+  EXPECT_TRUE(RunOnPool(&pool, 0, kMaxSeed, ThreadPool::NoInit, count_zeros,
+                        "TestNotZero"));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE  // NOTE(review): presumably set for one per-target pass only.
+namespace jxl {
+
+class Xorshift128Test : public hwy::TestWithParamTarget {};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Xorshift128Test);
+// Presumably hooks each test function above into the Xorshift128Test suite.
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestNotZero);
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestGolden);
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestSeedChanges);
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestFloat);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
-- 
cgit v1.2.3