Adding upstream version 1:115.7.0.upstream/1%115.7.0 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
commit: 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree: a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/jpeg-xl/lib/jpegli
parent: Initial commit. (diff)
download: thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
66 files changed, 19077 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jpegli/README.md b/third_party/jpeg-xl/lib/jpegli/README.md
new file mode 100644
index 0000000000..1eef402eef
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/README.md
@@ -0,0 +1,28 @@
+# Improved JPEG decoder implementation
+
+This subdirectory contains a JPEG decoder implementation that is API and ABI
+compatible with libjpeg62.
+
+*NOTE*: This is still a work in progress; currently only the API functions
+called from libjxl's benchmark_xl tool are implemented.
+
+To decompress an ```input.jpg``` file with this new library:
+
+```
+(from the libjxl root directory)
+$ ./ci.sh opt
+$ LD_PRELOAD=./build/libjpeg.so.62 ./build/tools/benchmark_xl --input input.jpg --codec=jpeg --decode_only --save_decompressed --output_dir .
+```
+
+The decompressed file will be saved as ```input.jpg.jpeg.png```.
+
+To benchmark the JPEG encoding-decoding round-trip on an ```input.png``` with
+the new library, first build a statically linked ```cjpeg-static``` binary,
+make sure it can be found in ```$PATH```, and then run:
+
+```
+(from the libjxl root directory)
+$ ./ci.sh opt
+$ LD_PRELOAD=./build/libjpeg.so.62 ./build/tools/benchmark_xl --input input.png --codec=jpeg:cjpeg-static:q90
+```
+
diff --git a/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.cc b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.cc
new file mode 100644
index 0000000000..a1c0b89ad3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.cc
@@ -0,0 +1,563 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/adaptive_quantization.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/adaptive_quantization.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+static constexpr float kInputScaling = 1.0f / 255.0f;
+
+// Primary template, used for every non-float lane type T: plain n / d.
+template <typename T, class V>
+struct FastDivision {
+  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
+};
+// Float specialization: may trade an exact divide for a refined reciprocal.
+template <class V>
+struct FastDivision<float, V> {
+  // Reciprocal estimate of x refined by a single Newton-Raphson step.
+  static HWY_INLINE V ReciprocalNR(const V x) {
+    const auto estimate = ApproximateReciprocal(x);
+    const auto doubled = Add(estimate, estimate);
+    const auto product = Mul(x, estimate);
+    return NegMulAdd(product, estimate, doubled);
+  }
+
+  V operator()(const V n, const V d) const {
+#if 1  // Exact division; faster on SKX than the reciprocal path below.
+    return Div(n, d);
+#else
+    return n * ReciprocalNR(d);
+#endif
+  }
+};
+
+// Approximates smooth functions via rational polynomials (i.e. dividing two
+// polynomials of degree <= 7). Evaluates them via Horner's scheme, which is
+// faster than Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows
+// us to specify constants (replicated 4x) independently of the lane count.
+template <size_t NP, size_t NQ, class D, class V, typename T>
+HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
+                                                     const T (&p)[NP],
+                                                     const T (&q)[NQ]) {
+  constexpr size_t kDegP = NP / 4 - 1;  // p[4 * i] is the coefficient of x^i
+  constexpr size_t kDegQ = NQ / 4 - 1;  // q[4 * i] is the coefficient of x^i
+  auto yp = LoadDup128(d, &p[kDegP * 4]);
+  auto yq = LoadDup128(d, &q[kDegQ * 4]);
+  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
+  // compiler warning that the index is out of bounds since we are already
+  // checking that it is not out of bounds with (kDegP >= n) and the access
+  // will be optimized away. Similarly with q and kDegQ.
+  HWY_FENCE;
+  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
+  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
+  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
+  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
+  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
+  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
+  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
+  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
+  static_assert(kDegP < 8 && kDegQ < 8, "Polynomial degree is too high");
+  return FastDivision<T, V>()(yp, yq);
+}
+
+// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
+// L1 error ~3.9E-6
+template <class DF, class V>
+V FastLog2f(const DF df, V x) {
+  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
+  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
+                                          HWY_REP4(1.4287160470083755E+00f),
+                                          HWY_REP4(7.4245873327820566E-01f)};
+  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
+                                          HWY_REP4(1.0096718572241148E+00f),
+                                          HWY_REP4(1.7409343003366853E-01f)};
+
+  const Rebind<int32_t, DF> di;
+  const auto x_bits = BitCast(di, x);
+
+  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
+  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
+  // Shifted exponent = log2; also used to clear mantissa.
+  const auto exp_shifted = ShiftRight<23>(exp_bits);
+  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
+  const auto exp_val = ConvertTo(df, exp_shifted);
+  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
+             exp_val);
+}
+
+// max relative error ~3e-7
+template <class DF, class V>
+V FastPow2f(const DF df, V x) {
+  const Rebind<int32_t, DF> di;
+  auto floorx = Floor(x);
+  auto exp =
+      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
+  auto frac = Sub(x, floorx);
+  auto num = Add(frac, Set(df, 1.01749063e+01));
+  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
+  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
+  num = Mul(num, exp);
+  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
+  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
+  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
+  return Div(num, den);
+}
+
+inline float FastPow2f(float f) {
+  HWY_CAPPED(float, 1) d1;  // single-lane descriptor for the scalar case
+  return GetLane(FastPow2f(d1, Set(d1, f)));
+}
+
+// The following functions modulate an exponent (out_val) and return the updated
+// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
+
+template <class D, class V>
+V ComputeMask(const D d, const V out_val) {
+  const auto one = Set(d, 1.0f);
+  const auto bias = Set(d, -0.74174993f);
+  const auto in_scale = Set(d, 0.74760422233706747f);
+  const auto weight_a = Set(d, 12.906028311180409f);
+  const auto offset_a = Set(d, 305.04035728311436f);
+  const auto weight_b = Set(d, 5.0220313103171232f);
+  const auto offset_b = Set(d, 2.1925739705298404f);
+  const auto weight_c = Set(d, 3.2353257320940401f);
+  const auto offset_c = Mul(Set(d, 0.25f), offset_b);
+
+  // Clamping the scaled input to 1e-3 avoids division by zero.
+  const auto t = Max(Mul(out_val, in_scale), Set(d, 1e-3f));
+  const auto inv_a = Div(one, Add(t, offset_a));
+  const auto inv_b = Div(one, MulAdd(t, t, offset_b));
+  const auto inv_c = Div(one, MulAdd(t, t, offset_c));
+  // TODO(jyrki): A log or two here could make sense. In butteraugli we have
+  // effectively log(log(x + C)) for this kind of use, as a single log is used
+  // in saturating visual masking and here the modulation values are
+  // exponential, another log would counter that.
+  return Add(bias, MulAdd(weight_c, inv_c,
+                          MulAdd(weight_a, inv_a, Mul(weight_b, inv_b))));
+}
+
+// mul and mul2 represent a scaling difference between jxl and butteraugli.
+static const float kSGmul = 226.0480446705883f;
+static const float kSGmul2 = 1.0f / 73.377132366608819f;
+static const float kLog2 = 0.693147181f;  // ln(2)
+// Includes correction factor for std::log -> log2.
+static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
+static const float kSGVOffset = 7.14672470003f;
+
+template <bool invert, typename D, typename V>
+V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
+  // jxl's opsin space is the cubic root of photons (v * v * v is related to
+  // the number of photons), while butteraugli's psychovisual space is
+  // SimpleGamma(v * v * v).
+  //
+  // The ratio computed here allows quantization to move from jxl's opsin
+  // space to butteraugli's log-gamma space.
+  static const float kEpsilon = 1e-2;
+  static const float kNumOffset = kEpsilon / kInputScaling / kInputScaling;
+  static const float kNumMul = kSGRetMul * 3 * kSGmul;
+  static const float kVOffset = (kSGVOffset * kLog2 + kEpsilon) / kInputScaling;
+  static const float kDenMul = kLog2 * kSGmul * kInputScaling * kInputScaling;
+
+  v = ZeroIfNegative(v);  // negative inputs are treated as zero
+  const auto v_sq = Mul(v, v);
+  const auto numer_scale = Set(d, kNumMul);
+  const auto numer_bias = Set(d, kNumOffset);
+  const auto denom_scale = Set(d, kDenMul);
+  const auto denom_bias = Set(d, kVOffset);
+
+  // numer = kNumMul * v^2 + kNumOffset; denom = kDenMul * v^3 + kVOffset.
+  const auto numer = MulAdd(numer_scale, v_sq, numer_bias);
+  const auto denom = MulAdd(Mul(denom_scale, v), v_sq, denom_bias);
+  return invert ? Div(numer, denom) : Div(denom, numer);
+}
+
+template <bool invert = false>
+static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
+  using D1 = HWY_CAPPED(float, 1);  // single-lane descriptor
+  const auto lane = Load(D1(), &v);
+  return GetLane(
+      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(D1(), lane));
+}
+
+// TODO(veluca): this function computes an approximation of the derivative of
+// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
+// exact derivatives. For reference, SimpleGamma was:
+/*
+template <typename D, typename V>
+V SimpleGamma(const D d, V v) {
+  // A simple HDR compatible gamma function.
+  const auto mul = Set(d, kSGmul);
+  const auto kRetMul = Set(d, kSGRetMul);
+  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
+  const auto kVOffset = Set(d, kSGVOffset);
+
+  v *= mul;
+
+  // This should happen rarely, but may lead to a NaN, which is rather
+  // undesirable. Since negative photons don't exist we solve the NaNs by
+  // clamping here.
+  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
+  v = ZeroIfNegative(v);
+  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
+}
+*/
+
+template <class D, class V>
+V GammaModulation(const D d, const size_t x, const size_t y,
+                  const RowBuffer<float>& input, const V out_val) {
+  static const float kBias = 0.16f / kInputScaling;
+  static const float kScale = kInputScaling / 64.0f;
+  const auto bias_v = Set(d, kBias);
+  const auto scale_v = Set(d, kScale);
+  const float* const JXL_RESTRICT top_left = input.Row(y) + x;
+  auto ratio_sum = Zero(d);
+  for (size_t row = 0; row < 8; ++row) {
+    const float* const JXL_RESTRICT src = top_left + row * input.stride();
+    for (size_t col = 0; col < 8; col += Lanes(d)) {
+      const auto px = Add(Load(d, src + col), bias_v);
+      const auto ratio =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, px);
+      ratio_sum = Add(ratio_sum, ratio);
+    }
+  }
+  // Sum over the 8x8 block, scaled by kInputScaling / 64.
+  const auto scaled_ratio = Mul(SumOfLanes(d, ratio_sum), scale_v);
+  // Ideally -1.0, but the likely optimal correction adds some entropy, so
+  // slightly less. ln(2) is folded in: we want std::log but have FastLog2f.
+  const auto gamma_mul = Set(d, -0.15526878023684174f * 0.693147180559945f);
+  return MulAdd(gamma_mul, FastLog2f(d, scaled_ratio), out_val);
+}
+
+// Change precision in 8x8 blocks that have high frequency content.
+template <class D, class V>
+V HfModulation(const D d, const size_t x, const size_t y,
+               const RowBuffer<float>& input, const V out_val) {
+  // Lane mask that zeroes the horizontal difference of the rightmost value
+  // per row, which has no valid right-hand neighbor.
+  const Rebind<uint32_t, D> du;
+  HWY_ALIGN constexpr uint32_t kMaskRight[8] = {~0u, ~0u, ~0u, ~0u,
+                                                ~0u, ~0u, ~0u, 0};
+  static const float kSumCoeff = -2.0052193233688884f * kInputScaling / 112.0;
+  const auto coeff_v = Set(d, kSumCoeff);
+
+  // Accumulates the absolute differences with the right and lower neighbors.
+  auto abs_diffs = Zero(d);
+  const float* const JXL_RESTRICT top_left = input.Row(y) + x;
+  for (size_t row = 0; row < 8; ++row) {
+    const float* JXL_RESTRICT cur = top_left + row * input.stride();
+    // The last row is compared with itself, contributing nothing vertically.
+    const float* JXL_RESTRICT below = row == 7 ? cur : cur + input.stride();
+    for (size_t col = 0; col < 8; col += Lanes(d)) {
+      const auto here = Load(d, cur + col);
+      const auto right = LoadU(d, cur + col + 1);
+      const auto keep = BitCast(d, Load(du, kMaskRight + col));
+      abs_diffs = Add(abs_diffs, And(keep, AbsDiff(here, right)));
+      const auto down = Load(d, below + col);
+      abs_diffs = Add(abs_diffs, AbsDiff(here, down));
+    }
+  }
+
+  const auto total = SumOfLanes(d, abs_diffs);
+  return MulAdd(total, coeff_v, out_val);
+}
+
+void PerBlockModulations(const float y_quant_01, const RowBuffer<float>& input,
+                         const size_t yb0, const size_t yblen,
+                         RowBuffer<float>* aq_map) {
+  static const float kAcQuant = 0.841f;
+  const float ramp_start = 9.0f;
+  const float ramp_end = 65.0f;
+  const float flat_level = 0.48f * kAcQuant;
+  // The modulation is faded out linearly while y_quant_01 goes from ramp_start
+  // to ramp_end; from ramp_end on, every block gets flat_level.
+  float dampen = 1.0f;
+  if (y_quant_01 >= ramp_start) {
+    dampen = 1.0f - ((y_quant_01 - ramp_start) / (ramp_end - ramp_start));
+    if (dampen < 0) dampen = 0;
+  }
+  const float mul = kAcQuant * dampen;
+  const float add = (1.0f - dampen) * flat_level;
+  const HWY_CAPPED(float, 8) df;
+  for (size_t iy = 0; iy < yblen; iy++) {
+    const size_t yb = yb0 + iy;
+    float* const JXL_RESTRICT row_out = aq_map->Row(yb);
+    for (size_t ix = 0; ix < aq_map->xsize(); ix++) {
+      // Top-left input sample of the 8x8 block (ix, yb).
+      const size_t x = ix * 8;
+      const size_t y = yb * 8;
+      auto exponent = Set(df, row_out[ix]);
+      exponent = ComputeMask(df, exponent);
+      exponent = HfModulation(df, x, y, input, exponent);
+      exponent = GammaModulation(df, x, y, input, exponent);
+      // Up to here we have been modulating an exponent; exp(v) is evaluated
+      // as 2^(v * log2(e)) to get the multiplicative quantization field.
+      row_out[ix] = FastPow2f(GetLane(exponent) * 1.442695041f) * mul + add;
+    }
+  }
+}
+
+template <typename D, typename V>
+V MaskingSqrt(const D d, V v) {
+  static const float kMul = 211.50759899638012f;
+  static const float kLogOffset = 28;
+  const auto bias = Set(d, kLogOffset);
+  const auto gain = Sqrt(Set(d, kMul * 1e8));
+  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, gain, bias)));  // sqrt(v*gain+bias)/4
+}
+
+template <typename V>
+void Sort4(V& min0, V& min1, V& min2, V& min3) {
+  const auto lo01 = Min(min0, min1);  // order each of the two pairs
+  const auto hi01 = Max(min0, min1);
+  const auto lo23 = Min(min2, min3);
+  const auto hi23 = Max(min2, min3);
+  const auto mid_a = Max(lo01, lo23);  // candidates for the two middle slots
+  const auto mid_b = Min(hi01, hi23);
+  min0 = Min(lo01, lo23);  // overall minimum
+  min1 = Min(mid_a, mid_b);
+  min2 = Max(mid_a, mid_b);
+  min3 = Max(hi01, hi23);  // overall maximum
+}
+
+template <typename V>
+void UpdateMin4(const V v, V& min0, V& min1, V& min2, V& min3) {
+  const auto carry0 = Max(min0, v);       // pushed out of slot 0
+  const auto carry1 = Max(min1, carry0);  // pushed out of slot 1
+  const auto carry2 = Max(min2, carry1);  // pushed out of slot 2
+  min0 = Min(min0, v);
+  min1 = Min(min1, carry0);
+  min2 = Min(min2, carry1);
+  min3 = Min(min3, carry2);
+}
+
+// Computes a linear combination of the 4 lowest values of the 3x3 neighborhood
+// of each pixel. Output is downsampled 2x.
+void FuzzyErosion(const RowBuffer<float>& pre_erosion, const size_t yb0,
+                  const size_t yblen, RowBuffer<float>* tmp,
+                  RowBuffer<float>* aq_map) {
+  int xsize_blocks = aq_map->xsize();
+  int xsize = pre_erosion.xsize();
+  HWY_FULL(float) d;
+  const auto mul0 = Set(d, 0.125f);  // weight of the smallest value
+  const auto mul1 = Set(d, 0.075f);  // weight of the 2nd smallest value
+  const auto mul2 = Set(d, 0.06f);   // weight of the 3rd smallest value
+  const auto mul3 = Set(d, 0.05f);   // weight of the 4th smallest value
+  for (size_t iy = 0; iy < 2 * yblen; ++iy) {  // 2 input rows per output row
+    size_t y = 2 * yb0 + iy;
+    const float* JXL_RESTRICT rowt = pre_erosion.Row(y - 1);  // row above
+    const float* JXL_RESTRICT rowm = pre_erosion.Row(y);
+    const float* JXL_RESTRICT rowb = pre_erosion.Row(y + 1);  // row below
+    float* row_out = tmp->Row(y);
+    for (int x = 0; x < xsize; x += Lanes(d)) {
+      int xm1 = x - 1;  // for x == 0 this reads one sample before the row
+      int xp1 = x + 1;
+      auto min0 = LoadU(d, rowm + x);
+      auto min1 = LoadU(d, rowm + xm1);
+      auto min2 = LoadU(d, rowm + xp1);
+      auto min3 = LoadU(d, rowt + xm1);
+      Sort4(min0, min1, min2, min3);  // min0 <= min1 <= min2 <= min3
+      UpdateMin4(LoadU(d, rowt + x), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowt + xp1), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + xm1), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + x), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + xp1), min0, min1, min2, min3);
+      const auto v = Add(Add(Mul(mul0, min0), Mul(mul1, min1)),
+                         Add(Mul(mul2, min2), Mul(mul3, min3)));
+      Store(v, d, row_out + x);
+    }
+    if (iy % 2 == 1) {  // both rows of the pair are in tmp: emit one aq row
+      const float* JXL_RESTRICT row_out0 = tmp->Row(y - 1);
+      float* JXL_RESTRICT aq_out = aq_map->Row(yb0 + iy / 2);
+      for (int bx = 0, x = 0; bx < xsize_blocks; ++bx, x += 2) {
+        aq_out[bx] =  // sum (not average) of the 2x2 group
+            (row_out[x] + row_out[x + 1] + row_out0[x] + row_out0[x + 1]);
+      }
+    }
+  }
+}
+
+void ComputePreErosion(const RowBuffer<float>& input, const size_t xsize,
+                       const size_t y0, const size_t ylen, int border,
+                       float* diff_buffer, RowBuffer<float>* pre_erosion) {
+  const size_t xsize_out = xsize / 4;  // output is subsampled 4x horizontally
+  const size_t y0_out = y0 / 4;        // ... and 4x vertically
+
+  // The XYB gamma is 3.0 to be able to decode faster with two muls.
+  // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
+  // We approximate the gamma difference by adding one cubic root into
+  // the adaptive quantization. This gives us a total gamma of 2.6666
+  // for quantization uses.
+  static const float match_gamma_offset = 0.019 / kInputScaling;
+
+  const HWY_CAPPED(float, 8) df;
+
+  static const float limit = 0.2f;  // upper clamp for the squared difference
+  // Computes image (padded to multiple of 8x8) of local pixel differences.
+  // Subsample both directions by 4.
+  for (size_t iy = 0; iy < ylen; ++iy) {
+    size_t y = y0 + iy;
+    const float* row_in = input.Row(y);
+    const float* row_in1 = input.Row(y + 1);  // row below
+    const float* row_in2 = input.Row(y - 1);  // row above
+    float* JXL_RESTRICT row_out = diff_buffer;  // accumulates 4 input rows
+    const auto match_gamma_offset_v = Set(df, match_gamma_offset);
+    const auto quarter = Set(df, 0.25f);
+    for (size_t x = 0; x < xsize; x += Lanes(df)) {
+      const auto in = LoadU(df, row_in + x);
+      const auto in_r = LoadU(df, row_in + x + 1);
+      const auto in_l = LoadU(df, row_in + x - 1);  // x == 0 reads index -1
+      const auto in_t = LoadU(df, row_in2 + x);
+      const auto in_b = LoadU(df, row_in1 + x);
+      const auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
+      const auto gammacv =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
+              df, Add(in, match_gamma_offset_v));
+      auto diff = Mul(gammacv, Sub(in, base));  // weighted deviation from mean
+      diff = Mul(diff, diff);
+      diff = Min(diff, Set(df, limit));
+      diff = MaskingSqrt(df, diff);
+      if ((iy & 3) != 0) {  // not the first row of a group of 4: accumulate
+        diff = Add(diff, LoadU(df, row_out + x));
+      }
+      StoreU(diff, df, row_out + x);
+    }
+    if (iy % 4 == 3) {  // group of 4 rows complete: emit one output row
+      size_t y_out = y0_out + iy / 4;
+      float* row_dout = pre_erosion->Row(y_out);
+      for (size_t x = 0; x < xsize_out; x++) {
+        row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
+                       row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
+                      0.25f;
+      }
+      pre_erosion->PadRow(y_out, xsize_out, border);
+    }
+  }
+}
+
+}  // namespace
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+HWY_EXPORT(ComputePreErosion);
+HWY_EXPORT(FuzzyErosion);
+HWY_EXPORT(PerBlockModulations);
+
+namespace {
+
+static constexpr int kPreErosionBorder = 1;
+
+}  // namespace
+
+void ComputeAdaptiveQuantField(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  if (!m->use_adaptive_quantization) {
+    return;  // nothing to compute when the feature is disabled
+  }
+  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+  int y_quant_01 = cinfo->quant_tbl_ptrs[y_comp->quant_tbl_no]->quantval[1];
+  if (m->next_iMCU_row == 0) {  // presumably fills border row -1 from row 0
+    m->input_buffer[y_channel].CopyRow(-1, 0, 1);
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {  // last iMCU row
+    size_t last_row = m->ysize_blocks * DCTSIZE - 1;
+    m->input_buffer[y_channel].CopyRow(last_row + 1, last_row, 1);
+  }
+  const RowBuffer<float>& input = m->input_buffer[y_channel];
+  const size_t xsize_blocks = y_comp->width_in_blocks;
+  const size_t xsize = xsize_blocks * DCTSIZE;
+  const size_t yb0 = m->next_iMCU_row * cinfo->max_v_samp_factor;
+  const size_t yblen = cinfo->max_v_samp_factor;
+  size_t y0 = yb0 * DCTSIZE;
+  size_t ylen = cinfo->max_v_samp_factor * DCTSIZE;
+  if (y0 == 0) {
+    ylen += 4;  // the first call also covers the 4 rows after its iMCU row
+  } else {
+    y0 += 4;  // later calls start 4 rows down, where the previous one stopped
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    ylen -= 4;  // the last call must not run past the last image row
+  }
+  HWY_DYNAMIC_DISPATCH(ComputePreErosion)
+  (input, xsize, y0, ylen, kPreErosionBorder, m->diff_buffer, &m->pre_erosion);
+  if (y0 == 0) {
+    m->pre_erosion.CopyRow(-1, 0, kPreErosionBorder);
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    size_t last_row = m->ysize_blocks * 2 - 1;
+    m->pre_erosion.CopyRow(last_row + 1, last_row, kPreErosionBorder);
+  }
+  HWY_DYNAMIC_DISPATCH(FuzzyErosion)
+  (m->pre_erosion, yb0, yblen, &m->fuzzy_erosion_tmp, &m->quant_field);
+  HWY_DYNAMIC_DISPATCH(PerBlockModulations)
+  (y_quant_01, input, yb0, yblen, &m->quant_field);
+  for (int y = 0; y < cinfo->max_v_samp_factor; ++y) {
+    float* row = m->quant_field.Row(yb0 + y);
+    for (size_t x = 0; x < xsize_blocks; ++x) {
+      row[x] = std::max(0.0f, (0.6f / row[x]) - 1.0f);  // remapped in place
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.h b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.h
new file mode 100644
index 0000000000..71f2fcc0af
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
+#define LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stddef.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ComputeAdaptiveQuantField(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/bit_writer.cc b/third_party/jpeg-xl/lib/jpegli/bit_writer.cc
new file mode 100644
index 0000000000..9788f35b8d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bit_writer.cc
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/bit_writer.h"
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void JpegBitWriterInit(j_compress_ptr cinfo) {
+  jpeg_comp_master* master = cinfo->master;
+  JpegBitWriter* writer = &master->bw;
+  const size_t capacity =
+      master->blocks_per_iMCU_row * (DCTSIZE2 * 16 + 8) + (1 << 16);
+  writer->cinfo = cinfo;
+  writer->data = Allocate<uint8_t>(cinfo, capacity, JPOOL_IMAGE);
+  writer->len = capacity;
+  writer->pos = writer->output_pos = 0;  // nothing buffered, nothing sent
+  writer->put_buffer = 0;
+  writer->free_bits = 64;  // the whole 64-bit accumulator is available
+  writer->healthy = true;
+}
+
+bool EmptyBitWriterBuffer(JpegBitWriter* bw) {
+  while (bw->output_pos < bw->pos) {
+    j_compress_ptr cinfo = bw->cinfo;
+    const bool dest_full = cinfo->dest->free_in_buffer == 0;
+    if (dest_full && !(*cinfo->dest->empty_output_buffer)(cinfo)) {
+      return false;  // destination could not make room
+    }
+    const size_t pending = bw->pos - bw->output_pos;
+    const size_t chunk = std::min<size_t>(cinfo->dest->free_in_buffer, pending);
+    memcpy(cinfo->dest->next_output_byte, bw->data + bw->output_pos, chunk);
+    bw->output_pos += chunk;
+    cinfo->dest->next_output_byte += chunk;
+    cinfo->dest->free_in_buffer -= chunk;
+  }
+  bw->output_pos = bw->pos = 0;
+  return true;
+}
+
+void JumpToByteBoundary(JpegBitWriter* bw) {
+  size_t n_bits = bw->free_bits & 7u;  // bits needed to complete the byte
+  if (n_bits > 0) {
+    WriteBits(bw, n_bits, (1u << n_bits) - 1);  // pad with 1-bits
+  }
+  // Left-align the pending bits; a 64-bit shift (empty buffer) would be UB.
+  if (bw->free_bits < 64) bw->put_buffer <<= bw->free_bits;
+  while (bw->free_bits <= 56) {
+    EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);  // whole bytes, MSB first
+    bw->put_buffer <<= 8;
+    bw->free_bits += 8;
+  }
+  bw->put_buffer = 0;
+  bw->free_bits = 64;
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/bit_writer.h b/third_party/jpeg-xl/lib/jpegli/bit_writer.h
new file mode 100644
index 0000000000..0affcdabd3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bit_writer.h
@@ -0,0 +1,107 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_BIT_WRITER_H_
+#define LIB_JPEGLI_BIT_WRITER_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stdint.h>
+#include <string.h>
+/* clang-format on */
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+// Handles the packing of bits into output bytes.
+struct JpegBitWriter {
+  j_compress_ptr cinfo;  // compressor whose destination receives the bytes
+  uint8_t* data;  // staging buffer holding the byte-stuffed output
+  size_t len;  // allocated size of |data| in bytes
+  size_t pos;  // next write position in |data|
+  size_t output_pos;  // first byte of |data| not yet copied to the destination
+  uint64_t put_buffer;  // bit accumulator, emitted 8 bytes at a time
+  int free_bits;  // unused bits left in |put_buffer| (64 when empty)
+  bool healthy;  // cleared when a zero-length code was written
+};
+
+void JpegBitWriterInit(j_compress_ptr cinfo);
+
+bool EmptyBitWriterBuffer(JpegBitWriter* bw);
+
+void JumpToByteBoundary(JpegBitWriter* bw);
+
+// Returns non-zero if and only if at least one of the eight bytes of x is
+// zero (the classic "haszero" bit trick).
+static JXL_INLINE uint64_t HasZeroByte(uint64_t x) {
+  return ~x & (x - 0x0101010101010101ULL) & 0x8080808080808080ULL;
+}
+
+/**
+ * Appends |byte| to the output; a 0xFF byte is followed by a stuffed zero.
+ *
+ * This is "careless": the caller must leave room for up to 2 more bytes.
+ */
+static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) {
+  bw->data[bw->pos] = byte;
+  ++bw->pos;
+  if (byte == 0xFF) bw->data[bw->pos++] = 0;
+}
+
+static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
+  // Emits all eight bytes of put_buffer to the output, most significant first.
+  // The JPEG format requires that after every 0xff byte in the entropy
+  // coded section, there is a zero byte, therefore we first check if any of
+  // the bytes of put_buffer is 0xFF.
+  if (HasZeroByte(~bw->put_buffer)) {
+    // We have a 0xFF byte somewhere, examine each byte and append a zero
+    // byte if necessary.
+    EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 48) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 40) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 32) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 24) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 16) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 8) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 0) & 0xFF);
+  } else {
+    // We don't have any 0xFF bytes, output all 8 bytes without checking.
+    bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF;
+    bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF;
+    bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF;
+    bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF;
+    bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF;
+    bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF;
+    bw->data[bw->pos + 6] = (bw->put_buffer >> 8) & 0xFF;
+    bw->data[bw->pos + 7] = (bw->put_buffer >> 0) & 0xFF;
+    bw->pos += 8;
+  }
+}
+
+static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) {
+  // Appends the low |nbits| bits of |bits|, MSB first; |bits| must not have
+  // higher bits set. A zero |nbits| means a Huffman symbol without a code is
+  // being encoded: instead of failing here we only clear |healthy|, which the
+  // encoder can check later.
+  if (nbits == 0) {
+    bw->healthy = false;
+    return;
+  }
+  bw->free_bits -= nbits;
+  if (bw->free_bits < 0) {  // does not fit: fill up, flush, keep the rest
+    bw->put_buffer <<= (bw->free_bits + nbits);  // room that was still free
+    bw->put_buffer |= (bits >> -bw->free_bits);  // top part of |bits|
+    DischargeBitBuffer(bw);
+    bw->free_bits += 64;
+    bw->put_buffer = nbits;  // NOTE(review): value looks arbitrary; confirm
+  }
+  bw->put_buffer <<= nbits;
+  bw->put_buffer |= bits;
+}
+
+}  // namespace jpegli
+#endif  // LIB_JPEGLI_BIT_WRITER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/bitstream.cc b/third_party/jpeg-xl/lib/jpegli/bitstream.cc
new file mode 100644
index 0000000000..0313ed3071
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bitstream.cc
@@ -0,0 +1,1136 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/bitstream.h"
+
+#include <cmath>
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/entropy_coding.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/bits.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/bitstream.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/dct-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Compress;
+using hwy::HWY_NAMESPACE::CountTrue;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::MaskFromVec;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Not;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Shl;
+using hwy::HWY_NAMESPACE::Sub;
+
+using DI = HWY_FULL(int32_t);  // int32_t vector tag type for the current target
+constexpr DI di;  // tag instance passed to the Load/Store/Set/Zero calls below
+
+int NumNonZero8x8ExceptDC(const coeff_t* block) {  // counts nonzero entries among block[1..63]
+  const HWY_CAPPED(coeff_t, 8) di;  // local tag, at most 8 lanes (one row)
+
+  const auto zero = Zero(di);
+  // Each lane comparing equal to zero contributes all-ones (i.e. -1) to the
+  // running sum, so the sum ends up as minus the number of zero coefficients.
+  auto neg_sum_zero = zero;
+  {
+    // The first row contains the DC coefficient, which has to be masked out.
+    const size_t y = 0;
+    HWY_ALIGN const coeff_t dc_mask_lanes[8] = {-1};  // all-ones in lane 0 only
+
+    for (size_t x = 0; x < 8; x += Lanes(di)) {
+      const auto dc_mask = Load(di, dc_mask_lanes + x);
+
+      // DC is forced to zero here, so it is counted as a zero and never as a nonzero.
+      const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x]));
+
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+  // Rows 1..7 contain AC coefficients only, so no mask is needed.
+  for (size_t y = 1; y < 8; y++) {
+    for (size_t x = 0; x < 8; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * 8 + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // The result is 64 - #zeros; the sum is already negative, hence the addition.
+  return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero));
+}
+
+void ZigZagShuffle(int32_t* JXL_RESTRICT block) {
+  // Reorders the block in place: output position k receives the input value
+  // found at kJPEGNaturalOrder[k]. A scratch copy is needed because the
+  // gather reads positions that have not been written yet.
+  // TODO(szabadka) SIMDify this.
+  int32_t reordered[DCTSIZE2];
+  for (int pos = DCTSIZE2 - 1; pos >= 0; --pos) {
+    reordered[pos] = block[kJPEGNaturalOrder[pos]];
+  }
+  memcpy(block, reordered, sizeof(reordered));
+}
+
+template <typename DI, class V>
+JXL_INLINE V NumBits(DI di, const V x) {  // per lane: bit length of x, correct for 0 <= x < 4096
+  // TODO(szabadka) Add faster implementations for some specific architectures.
+  const auto b1 = And(x, Set(di, 1));  // 1 if bit 0 is set, else 0
+  const auto b2 = And(x, Set(di, 2));  // 2 if bit 1 is set, else 0
+  const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1));  // 3 if bit 2 is set, else -1
+  const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4));  // 4 if bit 3 is set, else negative
+  const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11));
+  const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26));
+  const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57));
+  const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120));
+  const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247));
+  const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502));
+  const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013));
+  const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036));  // 12 if bit 11 is set
+  return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))),  // highest set bit wins
+             Max(Max(b9, b10), Max(b11, b12)));
+}
+
+// kIndexes[k] == 16 * k: coefficient positions pre-scaled for the symbol math.
+HWY_ALIGN constexpr int32_t kIndexes[64] = {
+    0,   16,  32,  48,  64,  80,  96,  112, 128, 144, 160, 176,  192,
+    208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384,  400,
+    416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592,  608,
+    624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800,  816,
+    832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008,
+};
+
+JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block,
+                            int32_t* JXL_RESTRICT nonzero_idx) {  // packs kept entries to the front; returns how many
+  const auto zero = Zero(di);
+  HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1};  // all-ones in lane 0 only
+  const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes));
+  int num_nonzeros = 0;
+  int k = 0;
+  {  // first vector: contains the DC coefficient
+    const auto coef = Load(di, block);
+    const auto idx = Load(di, kIndexes);
+    const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero)));  // lane 0 is kept even when it is zero
+    const auto nzero_coef = Compress(coef, nonzero_mask);
+    const auto nzero_idx = Compress(idx, nonzero_mask);
+    StoreU(nzero_coef, di, &block[num_nonzeros]);
+    StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
+    num_nonzeros += CountTrue(di, nonzero_mask);  // only that many leading lanes of the stores count
+    k += Lanes(di);
+  }
+  for (; k < DCTSIZE2; k += Lanes(di)) {  // remaining vectors: same, without the DC exception
+    const auto coef = Load(di, &block[k]);
+    const auto idx = Load(di, &kIndexes[k]);
+    const auto nonzero_mask = Not(Eq(coef, zero));
+    const auto nzero_coef = Compress(coef, nonzero_mask);
+    const auto nzero_idx = Compress(idx, nonzero_mask);
+    StoreU(nzero_coef, di, &block[num_nonzeros]);  // num_nonzeros <= k, so input not yet loaded is never overwritten
+    StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
+    num_nonzeros += CountTrue(di, nonzero_mask);
+  }
+  return num_nonzeros;
+}
+
+JXL_INLINE void ComputeSymbols(const int num_nonzeros,
+                               int32_t* JXL_RESTRICT nonzero_idx,
+                               int32_t* JXL_RESTRICT block,
+                               int32_t* JXL_RESTRICT symbols) {  // fills symbols[i]; block[i] becomes the extra bits
+  nonzero_idx[-1] = -16;  // sentinel "previous index"; requires a writable slot before nonzero_idx
+  const auto one = Set(di, 1);
+  const auto offset = Set(di, 16);
+  for (int i = 0; i < num_nonzeros; i += Lanes(di)) {  // whole vectors: lanes past num_nonzeros are processed too
+    const auto idx = Load(di, &nonzero_idx[i]);
+    const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]);
+    const auto coeff = Load(di, &block[i]);
+    const auto nbits = NumBits(di, Abs(coeff));
+    const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff);  // sign bit shifted down; presumably -1 for negative lanes
+    const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one));  // low nbits of (coeff + mask)
+    const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset));  // nbits + (idx - prev_idx - 16)
+    Store(symbol, di, symbols + i);
+    Store(bits, di, block + i);
+  }
+}
+
+void WriteBlock(int32_t* JXL_RESTRICT block, int32_t* JXL_RESTRICT symbols,
+                int32_t* JXL_RESTRICT nonzero_idx, HuffmanCodeTable* dc_huff,
+                HuffmanCodeTable* ac_huff, JpegBitWriter* bw) {
+  // Reorder the block, pack the kept coefficients, then derive the symbols;
+  // after ComputeSymbols() |block| holds the extra bits of each kept entry.
+  ZigZagShuffle(block);
+  const int count = CompactBlock(block, nonzero_idx);
+  ComputeSymbols(count, nonzero_idx, block, symbols);
+  // Entry 0 is always the DC coefficient (CompactBlock never drops it).
+  const int dc_sym = symbols[0];
+  WriteBits(bw, dc_huff->depth[dc_sym], dc_huff->code[dc_sym] | block[0]);
+  for (int i = 1; i < count; ++i) {
+    int sym = symbols[i];
+    // Every full 256 in the symbol value is sent as one 0xf0 symbol.
+    for (; sym > 255; sym -= 256) {
+      WriteBits(bw, ac_huff->depth[0xf0], ac_huff->code[0xf0]);
+    }
+    WriteBits(bw, ac_huff->depth[sym], ac_huff->code[sym] | block[i]);
+  }
+  // Symbol 0 closes the block unless the last kept entry sits at index 1008,
+  // which is the pre-scaled position of coefficient 63.
+  const bool needs_end_symbol = nonzero_idx[count - 1] < 1008;
+  if (needs_end_symbol) {
+    WriteBits(bw, ac_huff->depth[0], ac_huff->code[0]);
+  }
+}
+
+void WriteiMCURow(j_compress_ptr cinfo) {  // transforms and entropy-codes iMCU row m->next_iMCU_row into m->bw
+  jpeg_comp_master* m = cinfo->master;
+  JpegBitWriter* bw = &m->bw;
+  int xsize_mcus = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  int mcu_y = m->next_iMCU_row;
+  int32_t* block = m->block_tmp;  // three scratch areas carved out of block_tmp
+  int32_t* symbols = m->block_tmp + DCTSIZE2;
+  int32_t* nonzero_idx = m->block_tmp + 3 * DCTSIZE2;  // ComputeSymbols() also writes nonzero_idx[-1]
+  coeff_t* JXL_RESTRICT last_dc_coeff = m->last_dc_coeff;
+  const float* imcu_start[kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    imcu_start[c] = m->raw_data[c]->Row(mcu_y * comp->v_samp_factor * DCTSIZE);  // first sample row of this iMCU row
+  }
+  const float* qf = nullptr;
+  if (m->use_adaptive_quantization) {
+    qf = m->quant_field.Row(0);  // NOTE(review): always row 0 - presumably quant_field only holds the current iMCU row; confirm
+  }
+  const size_t qf_stride = m->quant_field.stride();
+  for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      HuffmanCodeTable* dc_huff = &m->huff_tables[comp->dc_tbl_no];
+      HuffmanCodeTable* ac_huff = &m->huff_tables[comp->ac_tbl_no + 4];  // AC tables are stored 4 entries after the DC ones
+      float* JXL_RESTRICT qmc = m->quant_mul[c];
+      const size_t stride = m->raw_data[c]->stride();
+      const int h_factor = m->h_factor[c];
+      const float* zero_bias_offset = m->zero_bias_offset[c];
+      const float* zero_bias_mul = m->zero_bias_mul[c];
+      float aq_strength = 0.0f;  // stays 0 when adaptive quantization is off
+      for (int iy = 0; iy < comp->v_samp_factor; ++iy) {
+        for (int ix = 0; ix < comp->h_samp_factor; ++ix) {
+          size_t by = mcu_y * comp->v_samp_factor + iy;
+          size_t bx = mcu_x * comp->h_samp_factor + ix;
+          if (bx >= comp->width_in_blocks || by >= comp->height_in_blocks) {  // block outside the component
+            WriteBits(bw, dc_huff->depth[0], dc_huff->code[0]);  // DC symbol 0; last_dc_coeff is left alone
+            WriteBits(bw, ac_huff->depth[0], ac_huff->code[0]);  // AC symbol 0
+            continue;
+          }
+          if (m->use_adaptive_quantization) {
+            aq_strength = qf[iy * qf_stride + bx * h_factor];
+          }
+          const float* pixels = imcu_start[c] + (iy * stride + bx) * DCTSIZE;  // top-left sample of the 8x8 block
+          ComputeCoefficientBlock(pixels, stride, qmc, aq_strength,
+                                  zero_bias_offset, zero_bias_mul,
+                                  m->dct_buffer, block);
+          block[0] -= last_dc_coeff[c];  // DC is coded as the difference to the previous block
+          last_dc_coeff[c] += block[0];  // predictor becomes this block's absolute DC again
+          WriteBlock(block, symbols, nonzero_idx, dc_huff, ac_huff, bw);
+        }
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+namespace {
+HWY_EXPORT(NumNonZero8x8ExceptDC);
+
+// State carried from one 8x8 block to the next while a progressive scan is encoded.
+struct DCTCodingState {
+  // Number of end-of-band events buffered so far and not yet written.
+  int eob_run_;
+  // Table recorded when the current run started; Flush() writes with it.
+  HuffmanCodeTable* cur_ac_huff_;
+  // Refinement bits collected for a successive approximation scan (one where
+  // Ah > 0); Flush() writes them after the run symbol.
+  std::vector<int> refinement_bits_;
+};
+
+void DCTCodingStateInit(DCTCodingState* s) {
+  // Resets |s| to "nothing buffered": the bit buffer is emptied with its
+  // capacity reserved up front, no table is selected and no run is pending.
+  s->refinement_bits_.clear();
+  s->refinement_bits_.reserve(kJPEGMaxCorrectionBits);
+  s->cur_ac_huff_ = nullptr;
+  s->eob_run_ = 0;
+}
+
+static JXL_INLINE void WriteSymbol(int symbol, const HuffmanCodeTable* table,
+                                   JpegBitWriter* bw) {
+  // Looks up the code word and its length for |symbol| and appends them.
+  const auto code_length = table->depth[symbol];
+  const auto code_word = table->code[symbol];
+  WriteBits(bw, code_length, code_word);
+}
+
+// Writes out everything still buffered in |s|: first the pending end-of-band
+// run (with the table recorded for it), then the buffered refinement bits.
+// Both buffers are empty afterwards.
+static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) {
+  const int run = s->eob_run_;
+  if (run > 0) {
+    // The symbol carries floor(log2(run)) in its upper nibble; the lower bits
+    // of the run length follow as extra bits when there are any.
+    const int nbits = jxl::FloorLog2Nonzero<uint32_t>(run);
+    WriteSymbol(nbits << 4u, s->cur_ac_huff_, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, run & ((1 << nbits) - 1));
+    }
+    s->eob_run_ = 0;
+  }
+  for (const int bit : s->refinement_bits_) {
+    WriteBits(bw, 1, bit);
+  }
+  s->refinement_bits_.clear();
+}
+
+// Records one more end-of-band event in |s| and, when |new_bits| is given,
+// appends those refinement bits to the buffered ones. The Huffman table is
+// remembered when a run starts. Everything is flushed once the run length
+// reaches 0x7FFF or the bit buffer grows beyond
+// kJPEGMaxCorrectionBits - kDCTBlockSize + 1 entries.
+static JXL_INLINE void BufferEndOfBand(DCTCodingState* s,
+                                       HuffmanCodeTable* ac_huff,
+                                       const std::vector<int>* new_bits,
+                                       JpegBitWriter* bw) {
+  const bool starts_new_run = (s->eob_run_ == 0);
+  if (starts_new_run) {
+    s->cur_ac_huff_ = ac_huff;
+  }
+  s->eob_run_ += 1;
+  std::vector<int>& pending = s->refinement_bits_;
+  if (new_bits != nullptr) {
+    pending.insert(pending.end(), new_bits->begin(), new_bits->end());
+  }
+  const bool run_is_full = (s->eob_run_ == 0x7FFF);
+  const bool bits_nearly_full =
+      pending.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1;
+  if (run_is_full || bits_nearly_full) {
+    Flush(s, bw);
+  }
+}
+
+// Fills |table| with the code word and code length of the symbols listed in
+// |huff|. The last listed symbol gets no code: its slot in the length list is
+// reused as the terminator. With |pre_shifted| the low nibble of the symbol
+// is added to the length and the code is shifted left by the same amount.
+// Returns false when the counts describe more entries than the scratch list
+// can hold; returns true without touching |table| when all counts are zero.
+bool BuildHuffmanCodeTable(const JPEGHuffmanCode& huff, HuffmanCodeTable* table,
+                           bool pre_shifted = false) {
+  int codes[kJpegHuffmanAlphabetSize];
+  // One extra slot for the terminator.
+  uint32_t lengths[kJpegHuffmanAlphabetSize + 1];
+  int total = 0;
+  for (size_t len = 1; len <= kJpegHuffmanMaxBitLength; ++len) {
+    int remaining = huff.counts[len];
+    if (total + remaining > kJpegHuffmanAlphabetSize + 1) {
+      return false;
+    }
+    for (; remaining != 0; --remaining) {
+      lengths[total++] = len;
+    }
+  }
+
+  if (total == 0) {
+    return true;
+  }
+
+  // The final entry becomes the zero terminator of the length list.
+  const int last = total - 1;
+  lengths[last] = 0;
+
+  // Codes of one length are consecutive; moving on to the next length appends
+  // a zero bit to the running code value.
+  int next_code = 0;
+  uint32_t cur_len = lengths[0];
+  for (int n = 0; lengths[n] != 0;) {
+    while (lengths[n] == cur_len) {
+      codes[n++] = next_code++;
+    }
+    next_code <<= 1;
+    ++cur_len;
+  }
+  for (int n = 0; n < last; ++n) {
+    const int value = huff.values[n];
+    table->depth[value] = lengths[n];
+    table->code[value] = codes[n];
+    if (pre_shifted) {
+      const int nbits = value & 0xf;
+      table->depth[value] += nbits;
+      table->code[value] <<= nbits;
+    }
+  }
+  return true;
+}
+
+bool EncodeDCTBlockSequential(const coeff_t* block, HuffmanCodeTable* dc_huff,
+                              HuffmanCodeTable* ac_huff, coeff_t* last_dc_coeff,
+                              JpegBitWriter* bw) {  // writes one block (DC + all AC); always returns true
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = block[0];
+  temp = temp2 - *last_dc_coeff;  // DC is coded as the difference to the previous block
+  if (temp == 0) {
+    WriteSymbol(0, dc_huff, bw);  // zero difference: symbol 0, no extra bits
+  } else {
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;  // temp: magnitude
+      temp2--;  // temp2: value - 1, whose low bits are the extra bits for negatives
+    }
+    int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;  // bit length of the magnitude
+    int dc_mask = (1 << dc_nbits) - 1;
+    WriteSymbol(dc_nbits, dc_huff, bw);
+    WriteBits(bw, dc_nbits, temp2 & dc_mask);
+  }
+  int num_nonzeros = HWY_DYNAMIC_DISPATCH(NumNonZero8x8ExceptDC)(block);  // nonzero AC count, bounds the scan below
+  for (int k = 1; k < 64; ++k) {
+    if (num_nonzeros == 0) {
+      WriteSymbol(0, ac_huff, bw);  // only zeros remain: symbol 0 ends the block
+      break;
+    }
+    int r = 0;
+    while ((temp = block[kJPEGNaturalOrder[k]]) == 0) {  // no bound on k needed: a nonzero is known to remain
+      r++;
+      k++;
+    }
+    --num_nonzeros;
+    if (temp < 0) {
+      temp = -temp;
+      temp2 = ~temp;  // ~|v| == v - 1 for negative v
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {  // one 0xf0 symbol per 16 zeros of the run
+      WriteSymbol(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int ac_mask = (1 << ac_nbits) - 1;
+    int symbol = (r << 4u) + ac_nbits;  // run in the upper nibble, bit length in the lower
+    WriteSymbol(symbol, ac_huff, bw);
+    WriteBits(bw, ac_nbits, temp2 & ac_mask);
+  }
+  return true;
+}
+
+bool EncodeDCTBlockProgressive(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                               HuffmanCodeTable* ac_huff, int Ss, int Se,
+                               int Al, DCTCodingState* coding_state,
+                               coeff_t* last_dc_coeff, JpegBitWriter* bw) {  // codes band [Ss, Se] of one block at bit position Al
+  bool eob_run_allowed = Ss > 0;  // evaluated before Ss is bumped below
+  coeff_t temp2;
+  coeff_t temp;
+  if (Ss == 0) {  // band starts with the DC coefficient
+    temp2 = coeffs[0] >> Al;
+    temp = temp2 - *last_dc_coeff;  // difference to the previous (shifted) DC
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // still negative after negation: magnitude not representable
+      temp2--;
+    }
+    int nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+    WriteSymbol(nbits, dc_huff, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    }
+    ++Ss;  // continue with the AC part of the band, if any
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int r = 0;  // length of the current zero run
+  for (int k = Ss; k <= Se; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // same representability check as for DC
+      temp >>= Al;  // the magnitude is shifted, not the signed value
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;
+      temp2 = temp;
+    }
+    if (temp == 0) {  // became zero after the shift: counts as part of the run
+      r++;
+      continue;
+    }
+    Flush(coding_state, bw);  // buffered end-of-band data must precede this coefficient
+    while (r > 15) {
+      WriteSymbol(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int symbol = (r << 4u) + nbits;
+    WriteSymbol(symbol, ac_huff, bw);
+    WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    r = 0;
+  }
+  if (r > 0) {  // the band ends in zeros: record an end-of-band
+    BufferEndOfBand(coding_state, ac_huff, nullptr, bw);
+    if (!eob_run_allowed) {
+      Flush(coding_state, bw);  // Ss was 0 on entry: write it immediately instead of accumulating a run
+    }
+  }
+  return true;
+}
+
+bool EncodeRefinementBits(const coeff_t* coeffs, HuffmanCodeTable* ac_huff,
+                          int Ss, int Se, int Al, DCTCodingState* coding_state,
+                          JpegBitWriter* bw) {  // codes bit Al of band [Ss, Se] for one block; always returns true
+  bool eob_run_allowed = Ss > 0;  // evaluated before Ss is bumped below
+  if (Ss == 0) {
+    // Emit next bit of DC component.
+    WriteBits(bw, 1, (coeffs[0] >> Al) & 1);
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int abs_values[kDCTBlockSize];  // |coeff| >> Al, indexed by position k; only [Ss, Se] is filled
+  int eob = 0;  // last position whose shifted magnitude is exactly 1
+  for (int k = Ss; k <= Se; k++) {
+    const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]);
+    abs_values[k] = abs_val >> Al;
+    if (abs_values[k] == 1) {
+      eob = k;
+    }
+  }
+  int r = 0;  // length of the current zero run
+  std::vector<int> refinement_bits;  // bits of coefficients with shifted magnitude > 1, not yet written
+  refinement_bits.reserve(kDCTBlockSize);
+  for (int k = Ss; k <= Se; k++) {
+    if (abs_values[k] == 0) {
+      r++;
+      continue;
+    }
+    while (r > 15 && k <= eob) {  // 0xf0 is only used while a magnitude-1 coefficient is still ahead
+      Flush(coding_state, bw);
+      WriteSymbol(0xf0, ac_huff, bw);
+      r -= 16;
+      for (int bit : refinement_bits) {  // bits buffered so far follow the 0xf0 symbol
+        WriteBits(bw, 1, bit);
+      }
+      refinement_bits.clear();
+    }
+    if (abs_values[k] > 1) {
+      refinement_bits.push_back(abs_values[k] & 1u);  // only the lowest bit of the shifted magnitude is buffered
+      continue;
+    }
+    Flush(coding_state, bw);  // shifted magnitude is exactly 1 from here on
+    int symbol = (r << 4u) + 1;
+    int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1;  // sign: 0 for negative, 1 otherwise
+    WriteSymbol(symbol, ac_huff, bw);
+    WriteBits(bw, 1, new_non_zero_bit);
+    for (int bit : refinement_bits) {
+      WriteBits(bw, 1, bit);
+    }
+    refinement_bits.clear();
+    r = 0;
+  }
+  if (r > 0 || !refinement_bits.empty()) {  // trailing zeros or leftover bits go into the end-of-band state
+    BufferEndOfBand(coding_state, ac_huff, &refinement_bits, bw);
+    if (!eob_run_allowed) {
+      Flush(coding_state, bw);
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+void WriteOutput(j_compress_ptr cinfo, const uint8_t* buf, size_t bufsize) {
+  // Copies |bufsize| bytes into the destination manager's buffer in chunks.
+  // Whenever the buffer is full the manager is asked to empty it; a refusal
+  // is reported as an error.
+  for (size_t pos = 0; pos < bufsize;) {
+    if (cinfo->dest->free_in_buffer == 0) {
+      const bool emptied = (*cinfo->dest->empty_output_buffer)(cinfo);
+      if (!emptied) {
+        JPEGLI_ERROR("Destination suspension is not supported in markers.");
+      }
+    }
+    const size_t room = cinfo->dest->free_in_buffer;
+    const size_t len = std::min<size_t>(room, bufsize - pos);
+    memcpy(cinfo->dest->next_output_byte, buf + pos, len);
+    cinfo->dest->next_output_byte += len;
+    cinfo->dest->free_in_buffer -= len;
+    pos += len;
+  }
+}
+
+void WriteOutput(j_compress_ptr cinfo, const std::vector<uint8_t>& bytes) {
+  // Forwards the vector's contents to the pointer/length overload.
+  const uint8_t* first = bytes.data();
+  const size_t count = bytes.size();
+  WriteOutput(cinfo, first, count);
+}
+
+void WriteOutput(j_compress_ptr cinfo, std::initializer_list<uint8_t> bytes) {
+  // Forwards the listed bytes to the pointer/length overload.
+  const uint8_t* first = bytes.begin();
+  const size_t count = bytes.size();
+  WriteOutput(cinfo, first, count);
+}
+
+void EncodeAPP0(j_compress_ptr cinfo) {
+  // APP0 segment (marker 0xFFE0, length 16): "JFIF\0", the version bytes,
+  // the density unit, X and Y density with the high byte first, and two
+  // zero bytes at the end.
+  const uint8_t x_hi = static_cast<uint8_t>(cinfo->X_density >> 8);
+  const uint8_t x_lo = static_cast<uint8_t>(cinfo->X_density & 0xff);
+  const uint8_t y_hi = static_cast<uint8_t>(cinfo->Y_density >> 8);
+  const uint8_t y_lo = static_cast<uint8_t>(cinfo->Y_density & 0xff);
+  WriteOutput(cinfo, {0xff, 0xe0, 0, 16, 'J', 'F', 'I', 'F', '\0',
+                      cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
+                      cinfo->density_unit, x_hi, x_lo, y_hi, y_lo, 0, 0});
+}
+
+void EncodeAPP14(j_compress_ptr cinfo) {
+  // APP14 segment (marker 0xFFEE, length 14) tagged "Adobe". The last byte is
+  // the color transform: 1 for JCS_YCbCr, 2 for JCS_YCCK, 0 for anything else.
+  uint8_t color_transform = 0;
+  if (cinfo->jpeg_color_space == JCS_YCbCr) {
+    color_transform = 1;
+  } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+    color_transform = 2;
+  }
+  WriteOutput(cinfo, {0xff, 0xee, 0, 14, 'A', 'd', 'o', 'b', 'e', 0, 100, 0, 0,
+                      0, 0, color_transform});
+}
+
+// Writes the start-of-frame marker: 0xc2 in progressive mode, otherwise 0xc0
+// when |is_baseline| is set and 0xc1 when it is not. The segment holds the
+// sample precision, the image height and width (16 bits each, high byte
+// first) and, per component, its id, packed sampling factors and quantization
+// table slot. An error is reported for a data precision other than
+// kJpegPrecision and for a component whose quantization table slot is out of
+// range or has no table installed.
+void EncodeSOF(j_compress_ptr cinfo, bool is_baseline) {
+  if (cinfo->data_precision != kJpegPrecision) {
+    is_baseline = false;
+    JPEGLI_ERROR("Unsupported data precision %d", cinfo->data_precision);
+  }
+  const uint8_t marker = cinfo->progressive_mode ? 0xc2
+                         : is_baseline           ? 0xc0
+                                                 : 0xc1;
+  const size_t n_comps = cinfo->num_components;
+  const size_t marker_len = 8 + 3 * n_comps;
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = marker;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = kJpegPrecision;
+  data[pos++] = cinfo->image_height >> 8u;
+  data[pos++] = cinfo->image_height & 0xFFu;
+  data[pos++] = cinfo->image_width >> 8u;
+  data[pos++] = cinfo->image_width & 0xFFu;
+  data[pos++] = n_comps;
+  for (size_t i = 0; i < n_comps; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    data[pos++] = comp->component_id;
+    data[pos++] = ((comp->h_samp_factor << 4u) | (comp->v_samp_factor));
+    // The slot number is converted to unsigned, so a negative value fails the
+    // range check as well. The check must come before the table lookup:
+    // quant_tbl_ptrs is indexed with NUM_QUANT_TBLS as the exclusive bound
+    // elsewhere in this file.
+    const uint32_t quant_idx = comp->quant_tbl_no;
+    if (quant_idx >= static_cast<uint32_t>(NUM_QUANT_TBLS) ||
+        cinfo->quant_tbl_ptrs[quant_idx] == nullptr) {
+      JPEGLI_ERROR("Invalid component quant table index %u.", quant_idx);
+    }
+    data[pos++] = quant_idx;
+  }
+  WriteOutput(cinfo, data);
+}
+
+// Writes the start-of-scan marker (0xFFDA) for scan |scan_index|: the number
+// of components, then per component its id and the packed DC/AC table
+// selectors, followed by Ss, Se and the packed Ah/Al byte.
+void EncodeSOS(j_compress_ptr cinfo, int scan_index) {
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const ScanCodingInfo& sci = cinfo->master->scan_coding_info[scan_index];
+  const size_t marker_len = 6 + 2 * scan_info->comps_in_scan;
+  std::vector<uint8_t> data;
+  data.reserve(marker_len + 2);
+  // Appends one byte; wider values are truncated to their low 8 bits.
+  const auto emit = [&data](size_t value) {
+    data.push_back(static_cast<uint8_t>(value));
+  };
+  emit(0xFF);
+  emit(0xDA);
+  emit(marker_len >> 8u);
+  emit(marker_len & 0xFFu);
+  emit(scan_info->comps_in_scan);
+  for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+    const int comp_idx = scan_info->component_index[i];
+    emit(cinfo->comp_info[comp_idx].component_id);
+    // The AC table index is stored with an offset of 4, which is removed here.
+    emit((sci.dc_tbl_idx[i] << 4u) + (sci.ac_tbl_idx[i] - 4));
+  }
+  emit(scan_info->Ss);
+  emit(scan_info->Se);
+  emit((scan_info->Ah << 4u) | (scan_info->Al));
+  WriteOutput(cinfo, data);
+}
+
+// Builds the encoder-side code table for every entry of |huffman_codes| and
+// writes a single DHT marker with the tables that are not yet marked as sent.
+// Nothing is written when every table was already sent.
+void EncodeDHT(j_compress_ptr cinfo, const JPEGHuffmanCode* huffman_codes,
+               size_t num_huffman_codes, bool pre_shifted) {
+  if (num_huffman_codes == 0) {
+    return;
+  }
+
+  // Segment length: the 2 length bytes plus, per unsent table, the 16 count
+  // bytes and the sum of its counts. That sum is one more than the number of
+  // values written below, which accounts for the slot id byte.
+  size_t marker_len = 2;
+  for (size_t t = 0; t < num_huffman_codes; ++t) {
+    const JPEGHuffmanCode& huff = huffman_codes[t];
+    if (huff.sent_table) continue;
+    marker_len += kJpegHuffmanMaxBitLength;
+    for (size_t len = 0; len <= kJpegHuffmanMaxBitLength; ++len) {
+      marker_len += huff.counts[len];
+    }
+  }
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xC4;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  for (size_t t = 0; t < num_huffman_codes; ++t) {
+    const JPEGHuffmanCode& huff = huffman_codes[t];
+    // Slot ids with bit 0x10 set live 12 entries lower in huff_tables
+    // (0x10 maps to entry 4); the others map to themselves.
+    const size_t slot = huff.slot_id;
+    const size_t table_idx = (slot & 0x10) ? slot - 12 : slot;
+    HuffmanCodeTable* huff_table = &cinfo->master->huff_tables[table_idx];
+    // TODO(eustas): cache
+    // TODO(eustas): set up non-existing symbols
+    if (!BuildHuffmanCodeTable(huff, huff_table, pre_shifted)) {
+      JPEGLI_ERROR("Failed to build Huffman code table.");
+    }
+    if (huff.sent_table) continue;
+    size_t num_values = 0;
+    size_t longest = 0;
+    for (size_t len = 0; len <= kJpegHuffmanMaxBitLength; ++len) {
+      if (huff.counts[len] != 0) {
+        longest = len;
+      }
+      num_values += huff.counts[len];
+    }
+    // The table's final entry is not written: one value fewer, and the count
+    // of the longest used code length is reduced by one.
+    --num_values;
+    data[pos++] = huff.slot_id;
+    for (size_t len = 1; len <= kJpegHuffmanMaxBitLength; ++len) {
+      data[pos++] = (len == longest ? huff.counts[len] - 1 : huff.counts[len]);
+    }
+    for (size_t v = 0; v < num_values; ++v) {
+      data[pos++] = huff.values[v];
+    }
+  }
+  if (marker_len > 2) {
+    WriteOutput(cinfo, data);
+  }
+}
+
+// Writes one DQT marker with the quantization tables that are required and
+// not yet marked as sent, then marks them as sent. With |write_all_tables|
+// every installed table is considered, otherwise only the tables referenced
+// by the components. |*is_baseline| is cleared when a considered table holds
+// a value above 255 (such a table is written with 16-bit entries). Nothing is
+// written when no table has to be sent. Errors are reported for a component
+// table slot outside [0, NUM_QUANT_TBLS), a missing table and a zero
+// quantization value.
+void EncodeDQT(j_compress_ptr cinfo, bool write_all_tables, bool* is_baseline) {
+  uint8_t data[4 + NUM_QUANT_TBLS * (1 + 2 * DCTSIZE2)];  // 520 bytes
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDB;
+  pos += 2;  // Length will be filled in later.
+
+  int send_table[NUM_QUANT_TBLS] = {};
+  if (write_all_tables) {
+    for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+      if (cinfo->quant_tbl_ptrs[i]) send_table[i] = 1;
+    }
+  } else {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      const int tbl_no = cinfo->comp_info[c].quant_tbl_no;
+      // The slot number indexes a fixed-size stack array, so it has to be
+      // validated first. As elsewhere in this file, JPEGLI_ERROR is relied on
+      // not to return.
+      if (tbl_no < 0 || tbl_no >= NUM_QUANT_TBLS) {
+        JPEGLI_ERROR("Invalid component quant table index %d.", tbl_no);
+      }
+      send_table[tbl_no] = 1;
+    }
+  }
+
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    if (!send_table[i]) continue;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[i];
+    if (quant_table == nullptr) {
+      JPEGLI_ERROR("Missing quant table %d", i);
+    }
+    // The precision scan runs even for tables that were already sent, so
+    // |*is_baseline| reflects every table in use.
+    int precision = 0;
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      if (quant_table->quantval[k] > 255) {
+        precision = 1;
+        *is_baseline = false;
+      }
+    }
+    if (quant_table->sent_table) {
+      continue;
+    }
+    // High nibble: precision flag; low nibble: table slot.
+    data[pos++] = (precision << 4) + i;
+    for (size_t j = 0; j < DCTSIZE2; ++j) {
+      // Output position j takes the value stored at kJPEGNaturalOrder[j].
+      int val_idx = kJPEGNaturalOrder[j];
+      int val = quant_table->quantval[val_idx];
+      if (val == 0) {
+        JPEGLI_ERROR("Invalid quantval 0.");
+      }
+      if (precision) {
+        data[pos++] = val >> 8;
+      }
+      data[pos++] = val & 0xFFu;
+    }
+    quant_table->sent_table = TRUE;
+  }
+  if (pos > 4) {
+    // The length field counts everything after the two marker bytes.
+    data[2] = (pos - 2) >> 8u;
+    data[3] = (pos - 2) & 0xFFu;
+    WriteOutput(cinfo, data, pos);
+  }
+}
+
+bool EncodeDRI(j_compress_ptr cinfo) {
+  // DRI segment: marker 0xFFDD, length 4, then the restart interval as two
+  // bytes with the high byte first. Always returns true.
+  const uint8_t interval_hi =
+      static_cast<uint8_t>(cinfo->restart_interval >> 8);
+  const uint8_t interval_lo =
+      static_cast<uint8_t>(cinfo->restart_interval & 0xFF);
+  WriteOutput(cinfo, {0xFF, 0xDD, 0, 4, interval_hi, interval_lo});
+  return true;
+}
+
+static JXL_INLINE void EmitMarker(JpegBitWriter* bw, int marker) {
+  // Stores the 0xFF prefix and the marker code at the current write position
+  // and advances it by two. No byte stuffing or capacity check happens here.
+  bw->data[bw->pos] = 0xFF;
+  bw->data[bw->pos + 1] = marker;
+  bw->pos += 2;
+}
+
+void ProgressMonitorEncodePass(j_compress_ptr cinfo, size_t scan_index,
+                               size_t mcu_y) {
+  // Updates the progress fields for the current scan and row, then invokes
+  // the monitor callback. Does nothing when no progress manager is installed.
+  auto* progress = cinfo->progress;
+  if (progress != nullptr) {
+    progress->completed_passes = 1 + scan_index;
+    progress->pass_counter = mcu_y;
+    progress->pass_limit = cinfo->total_iMCU_rows;
+    (*progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+  }
+}
+
+bool EncodeScan(j_compress_ptr cinfo, int scan_index) {  // entropy-codes scan |scan_index| from m->coeff_buffers; false on failure
+  jpeg_comp_master* m = cinfo->master;
+  const int restart_interval = cinfo->restart_interval;
+  int restarts_to_go = restart_interval;  // MCUs left before the next restart marker
+  int next_restart_marker = 0;  // cycles through 0..7
+
+  JpegBitWriter* bw = &m->bw;
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};  // DC predictors, one per scan component
+  DCTCodingState coding_state;
+  DCTCodingStateInit(&coding_state);
+
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const ScanCodingInfo& sci = m->scan_coding_info[scan_index];
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info->comps_in_scan > 1);
+  jpeg_component_info* base_comp =
+      &cinfo->comp_info[scan_info->component_index[0]];
+  // h_group / v_group act as numerators for converting number of blocks to
+  // number of MCU. In interleaved mode it is 1, so MCU is represented with
+  // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to
+  // be the sampling factor, consequently MCU is always represented with single
+  // block.
+  const int h_group = is_interleaved ? 1 : base_comp->h_samp_factor;
+  const int v_group = is_interleaved ? 1 : base_comp->v_samp_factor;
+  int MCUs_per_row =
+      DivCeil(cinfo->image_width * h_group, 8 * cinfo->max_h_samp_factor);
+  int MCU_rows =
+      DivCeil(cinfo->image_height * v_group, 8 * cinfo->max_v_samp_factor);
+  const bool is_progressive = cinfo->progressive_mode;
+  const int Al = scan_info->Al;
+  const int Ah = scan_info->Ah;
+  const int Ss = scan_info->Ss;
+  const int Se = scan_info->Se;
+  HWY_ALIGN constexpr coeff_t kDummyBlock[DCTSIZE2] = {0};  // all-zero block for positions outside a component
+
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    ProgressMonitorEncodePass(cinfo, scan_index, mcu_y);
+    for (int i = 0; i < scan_info->comps_in_scan; ++i) {  // fetch this MCU row's block rows for every scan component
+      int comp_idx = scan_info->component_index[i];
+      jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+      int by0 = mcu_y * n_blocks_y;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(n_blocks_y, block_rows_left);  // fewer rows at the bottom edge
+      ba[i] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx],
+          by0, max_block_rows, false);  // NOTE(review): last argument presumably requests read-only access - confirm
+    }
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Possibly emit a restart marker.
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        Flush(&coding_state, bw);
+        JumpToByteBoundary(bw);
+        EmitMarker(bw, 0xD0 + next_restart_marker);
+        next_restart_marker += 1;
+        next_restart_marker &= 0x7;
+        restarts_to_go = restart_interval;
+        memset(last_dc_coeff, 0, sizeof(last_dc_coeff));  // DC prediction starts over after a marker
+      }
+      // Encode one MCU
+      for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+        int comp_idx = scan_info->component_index[i];
+        jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+        HuffmanCodeTable* dc_huff = &m->huff_tables[sci.dc_tbl_idx[i]];
+        HuffmanCodeTable* ac_huff = &m->huff_tables[sci.ac_tbl_idx[i]];
+        int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+        int n_blocks_x = is_interleaved ? comp->h_samp_factor : 1;
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            size_t block_y = mcu_y * n_blocks_y + iy;
+            size_t block_x = mcu_x * n_blocks_x + ix;
+            const coeff_t* block;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              block = kDummyBlock;
+            } else {
+              block = &ba[i][iy][block_x][0];
+            }
+            bool ok;
+            if (!is_progressive) {
+              ok = EncodeDCTBlockSequential(block, dc_huff, ac_huff,
+                                            last_dc_coeff + i, bw);
+            } else if (Ah == 0) {  // first pass over this band
+              ok = EncodeDCTBlockProgressive(block, dc_huff, ac_huff, Ss, Se,
+                                             Al, &coding_state,
+                                             last_dc_coeff + i, bw);
+            } else {  // Ah > 0: refinement pass
+              ok = EncodeRefinementBits(block, ac_huff, Ss, Se, Al,
+                                        &coding_state, bw);
+            }
+            if (!ok) return false;
+          }
+        }
+      }
+      --restarts_to_go;
+    }
+    if (!EmptyBitWriterBuffer(bw)) {  // called once per MCU row; presumably hands buffered bytes to the destination
+      JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+    }
+  }
+  Flush(&coding_state, bw);  // write any end-of-band run still pending
+  JumpToByteBoundary(bw);
+  if (!EmptyBitWriterBuffer(bw)) {
+    JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+  }
+  if (!bw->healthy) return false;  // WriteBits() was asked for a zero-length code somewhere
+
+  return true;
+}
+
+struct Token {  // one symbol plus its extra bits, as produced by ComputeTokensForBlock()
+  uint8_t histo_idx;  // presumably selects the histogram / code the symbol belongs to - confirm
+  uint8_t symbol;
+  uint16_t bits;  // extra bits that accompany the symbol (0 when there are none)
+  Token(int i, int s, int b) : histo_idx(i), symbol(s), bits(b) {}  // int arguments are narrowed to the field widths
+};
+
+void ComputeTokensForBlock(const coeff_t* block, int histo_dc, int histo_ac,
+                           coeff_t* last_dc_coeff, Token** tokens_ptr) {  // same traversal as EncodeDCTBlockSequential(), but appends Tokens
+  Token* next_token = *tokens_ptr;  // write cursor; stored back at the end
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = block[0];
+  temp = temp2 - *last_dc_coeff;  // DC difference to the previous block
+  if (temp == 0) {
+    *next_token++ = Token(histo_dc, 0, 0);
+  } else {
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;  // temp: magnitude
+      temp2--;  // temp2: value - 1, whose low bits are the extra bits for negatives
+    }
+    int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int dc_mask = (1 << dc_nbits) - 1;
+    *next_token++ = Token(histo_dc, dc_nbits, temp2 & dc_mask);
+  }
+  int num_nonzeros = HWY_DYNAMIC_DISPATCH(NumNonZero8x8ExceptDC)(block);  // nonzero AC count, bounds the scan below
+  for (int k = 1; k < 64; ++k) {
+    if (num_nonzeros == 0) {
+      *next_token++ = Token(histo_ac, 0, 0);  // only zeros remain: symbol 0 ends the block
+      break;
+    }
+    int r = 0;
+    while ((temp = block[kJPEGNaturalOrder[k]]) == 0) {  // no bound on k needed: a nonzero is known to remain
+      r++;
+      k++;
+    }
+    --num_nonzeros;
+    if (temp < 0) {
+      temp = -temp;
+      temp2 = ~temp;  // ~|v| == v - 1 for negative v
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {  // one 0xf0 token per 16 zeros of the run
+      *next_token++ = Token(histo_ac, 0xf0, 0);
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int ac_mask = (1 << ac_nbits) - 1;
+    int symbol = (r << 4u) + ac_nbits;  // run in the upper nibble, bit length in the lower
+    *next_token++ = Token(histo_ac, symbol, temp2 & ac_mask);
+  }
+  *tokens_ptr = next_token;  // the caller must have provided room for every token written
+}
+
+// A token buffer together with the number of tokens stored in it.
+struct TokenArray {
+  Token* tokens = nullptr;  // start of the buffer; null until one is assigned
+  size_t num_tokens = 0;    // number of tokens written to `tokens`
+};
+
+// Returns kDCTBlockSize tokens for every block of every MCU in one MCU row.
+// ComputeTokens uses this value as the token budget of a single MCU row.
+size_t MaxNumTokensPerMCURow(j_compress_ptr cinfo) {
+  size_t blocks_in_one_mcu = 0;
+  for (int comp_idx = 0; comp_idx < cinfo->num_components; ++comp_idx) {
+    const jpeg_component_info& info = cinfo->comp_info[comp_idx];
+    blocks_in_one_mcu += info.h_samp_factor * info.v_samp_factor;
+  }
+  const int mcus_in_row =
+      DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  return kDCTBlockSize * blocks_in_one_mcu * mcus_in_row;
+}
+
+// Suggests how many tokens to allocate for the next token buffer.
+//
+// Before the first MCU row the projected total is 16 * max_per_row.
+// Afterwards the tokens produced so far are extrapolated to the whole image
+// with a 4/3 factor. The tokens already produced are subtracted, and the
+// result is raised to at least max_per_row and then capped at max_per_row
+// times the number of MCU rows that are still to be processed.
+size_t EstimateNumTokens(j_compress_ptr cinfo, size_t mcu_y, size_t ysize_mcus,
+                         size_t num_tokens, size_t max_per_row) {
+  const size_t projected_total =
+      (mcu_y == 0) ? 16 * max_per_row
+                   : (4 * ysize_mcus * num_tokens) / (3 * mcu_y);
+  const size_t still_needed = projected_total - num_tokens;
+  const size_t at_least_one_row = std::max(max_per_row, still_needed);
+  const size_t remaining_rows_cap = (ysize_mcus - mcu_y) * max_per_row;
+  return std::min(remaining_rows_cap, at_least_one_row);
+}
+
+// Tokenizes the coefficient blocks of all components, one MCU row at a time,
+// and appends the resulting token buffers to *token_arrays.
+//
+// Buffers come from Allocate<Token>(..., JPOOL_IMAGE). Before each MCU row a
+// new buffer is started whenever the current one might not have room for
+// max_tokens_per_mcu_row more tokens; the buffer being replaced is pushed to
+// *token_arrays first. DC tokens of component c use histogram index c, AC
+// tokens use c + 4.
+void ComputeTokens(j_compress_ptr cinfo,
+                   std::vector<TokenArray>* token_arrays) {
+  jpeg_comp_master* m = cinfo->master;
+  TokenArray ta;
+  Token* next_token = ta.tokens;  // write cursor; null until first allocation
+  size_t num_tokens = 0;          // capacity of the current buffer
+  size_t total_num_tokens = 0;    // tokens held by the buffers already pushed
+  size_t max_tokens_per_mcu_row = MaxNumTokensPerMCURow(cinfo);
+  int xsize_mcus = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  int ysize_mcus = DivCeil(cinfo->image_height, 8 * cinfo->max_v_samp_factor);
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];
+  for (int mcu_y = 0; mcu_y < ysize_mcus; ++mcu_y) {
+    ProgressMonitorEncodePass(cinfo, 0, mcu_y);
+    ta.num_tokens = next_token - ta.tokens;
+    // Start a new buffer if this row might not fit into the current one.
+    if (ta.num_tokens + max_tokens_per_mcu_row > num_tokens) {
+      if (ta.tokens) {
+        token_arrays->push_back(ta);
+        total_num_tokens += ta.num_tokens;
+      }
+      num_tokens = EstimateNumTokens(cinfo, mcu_y, ysize_mcus, total_num_tokens,
+                                     max_tokens_per_mcu_row);
+      ta.tokens = Allocate<Token>(cinfo, num_tokens, JPOOL_IMAGE);
+      next_token = ta.tokens;
+    }
+    // Fetch the block rows of every component that belong to this MCU row;
+    // the row count is clipped at the bottom edge of the component.
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      int by0 = mcu_y * comp->v_samp_factor;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+      ba[c] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[c], by0,
+          max_block_rows, false);
+    }
+    // Shortcut when both maximum sampling factors are 1: one block per
+    // component per MCU, indexed directly by mcu_x.
+    if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
+      for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+        for (int c = 0; c < cinfo->num_components; ++c) {
+          ComputeTokensForBlock(&ba[c][0][mcu_x][0], c, c + 4,
+                                &last_dc_coeff[c], &next_token);
+        }
+      }
+      continue;
+    }
+    // General case: each MCU holds h_samp_factor x v_samp_factor blocks of
+    // every component.
+    for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        jpeg_component_info* comp = &cinfo->comp_info[c];
+        for (int iy = 0; iy < comp->v_samp_factor; ++iy) {
+          for (int ix = 0; ix < comp->h_samp_factor; ++ix) {
+            size_t block_y = mcu_y * comp->v_samp_factor + iy;
+            size_t block_x = mcu_x * comp->h_samp_factor + ix;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              // Block lies outside the component: emit a DC token and an AC
+              // token with symbol 0 instead of reading coefficients.
+              *next_token++ = Token(c, 0, 0);
+              *next_token++ = Token(c + 4, 0, 0);
+              continue;
+            }
+            ComputeTokensForBlock(&ba[c][iy][block_x][0], c, c + 4,
+                                  &last_dc_coeff[c], &next_token);
+          }
+        }
+      }
+    }
+  }
+  // Hand over the last buffer. If the loop never ran (ysize_mcus == 0) this
+  // pushes an entry with a null buffer and zero tokens.
+  ta.num_tokens = next_token - ta.tokens;
+  token_arrays->push_back(ta);
+}
+
+// Writes num_tokens tokens to the bit writer.
+//
+// Each token's symbol is written with
+// huff_tables[context_map[t.histo_idx]]. When the low nibble of the symbol is
+// nonzero, that many bits of t.bits are written after it. After every
+// bw->len / 8 tokens the bit writer buffer is emptied; failing to empty it
+// is reported through JPEGLI_ERROR.
+void WriteTokens(j_compress_ptr cinfo, const Token* tokens, size_t num_tokens,
+                 const HuffmanCodeTable* huff_tables, const int* context_map,
+                 JpegBitWriter* bw) {
+  // Number of tokens written between two buffer flushes.
+  // NOTE(review): if bw->len < 8 this is 0 and --next_cycle below wraps
+  // around, so the periodic flush effectively never triggers.
+  size_t cycle_len = bw->len / 8;
+  size_t next_cycle = cycle_len;
+  for (size_t i = 0; i < num_tokens; ++i) {
+    Token t = tokens[i];
+    // The low nibble of the symbol is the number of extra bits that follow.
+    int nbits = t.symbol & 0xf;
+    WriteSymbol(t.symbol, &huff_tables[context_map[t.histo_idx]], bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, t.bits);
+    }
+    if (--next_cycle == 0) {
+      if (!EmptyBitWriterBuffer(bw)) {
+        JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+      }
+      next_cycle = cycle_len;
+    }
+  }
+}
+
+// Counts every token's symbol in the histogram selected by its histo_idx.
+void BuildHistograms(const Token* tokens, size_t num_tokens,
+                     Histogram* histograms) {
+  const Token* const end = tokens + num_tokens;
+  for (const Token* it = tokens; it != end; ++it) {
+    Histogram& target = histograms[it->histo_idx];
+    ++target.count[it->symbol];
+  }
+}
+
+// Encodes the image as a single scan.
+//
+// The coefficients are tokenized first, the token statistics are clustered
+// into Huffman codes, then the DQT, SOF, DHT and SOS segments are emitted via
+// the corresponding Encode* helpers and finally the tokens are written with
+// the resulting code tables. Failures are reported through JPEGLI_ERROR.
+void EncodeSingleScan(j_compress_ptr cinfo) {
+  std::vector<TokenArray> token_arrays;
+  ComputeTokens(cinfo, &token_arrays);
+  // Histograms 0..3 collect the DC tokens of components 0..3, histograms
+  // 4..7 the AC tokens (ComputeTokens uses indexes c and c + 4).
+  Histogram histograms[8] = {};
+  for (size_t i = 0; i < token_arrays.size(); ++i) {
+    Token* tokens = token_arrays[i].tokens;
+    size_t num_tokens = token_arrays[i].num_tokens;
+    BuildHistograms(tokens, num_tokens, histograms);
+  }
+  // DC and AC histograms are clustered independently of each other.
+  JpegClusteredHistograms dc_clusters;
+  ClusterJpegHistograms(histograms, 4, &dc_clusters);
+  JpegClusteredHistograms ac_clusters;
+  ClusterJpegHistograms(histograms + 4, 4, &ac_clusters);
+
+  // One Huffman code per cluster: DC clusters are added with slot id i, AC
+  // clusters with slot id 0x10 + i.
+  JPEGHuffmanCode* huffman_codes =
+      Allocate<JPEGHuffmanCode>(cinfo, 8, JPOOL_IMAGE);
+  size_t num_huffman_codes = 0;
+  for (size_t i = 0; i < dc_clusters.histograms.size(); ++i) {
+    AddJpegHuffmanCode(dc_clusters.histograms[i], i, huffman_codes,
+                       &num_huffman_codes);
+  }
+  for (size_t i = 0; i < ac_clusters.histograms.size(); ++i) {
+    AddJpegHuffmanCode(ac_clusters.histograms[i], 0x10 + i, huffman_codes,
+                       &num_huffman_codes);
+  }
+
+  bool is_baseline = true;
+  // Maps a token's histo_idx to an index into huff_tables (see WriteTokens).
+  int context_map[8];
+  ScanCodingInfo sci = {};
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    // A component whose DC or AC cluster index is above 1 clears the
+    // baseline flag.
+    if (dc_clusters.histogram_indexes[c] > 1 ||
+        ac_clusters.histogram_indexes[c] > 1) {
+      is_baseline = false;
+    }
+    // AC table indexes are stored with an offset of 4.
+    sci.dc_tbl_idx[c] = dc_clusters.histogram_indexes[c];
+    sci.ac_tbl_idx[c] = ac_clusters.histogram_indexes[c] + 4;
+    context_map[c] = sci.dc_tbl_idx[c];
+    context_map[c + 4] = sci.ac_tbl_idx[c];
+  }
+  sci.num_huffman_codes = num_huffman_codes;
+  memcpy(cinfo->master->scan_coding_info, &sci, sizeof(sci));
+  EncodeDQT(cinfo, /*write_all_tables=*/false, &is_baseline);
+  EncodeSOF(cinfo, is_baseline);
+  EncodeDHT(cinfo, huffman_codes, num_huffman_codes);
+  EncodeSOS(cinfo, 0);
+
+  // Entropy-coded data: write all token buffers in the order they were
+  // produced.
+  JpegBitWriter* bw = &cinfo->master->bw;
+  HuffmanCodeTable* huff_tables = cinfo->master->huff_tables;
+  for (size_t i = 0; i < token_arrays.size(); ++i) {
+    Token* tokens = token_arrays[i].tokens;
+    size_t num_tokens = token_arrays[i].num_tokens;
+    WriteTokens(cinfo, tokens, num_tokens, huff_tables, context_map, bw);
+  }
+  JumpToByteBoundary(bw);
+  if (!EmptyBitWriterBuffer(bw)) {
+    JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+  }
+  if (!bw->healthy) {
+    JPEGLI_ERROR("Failed to encode scan.");
+  }
+}
+
+HWY_EXPORT(WriteiMCURow);
+// Forwards to the WriteiMCURow implementation selected by
+// HWY_DYNAMIC_DISPATCH.
+void WriteiMCURow(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(WriteiMCURow)(cinfo);
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/bitstream.h b/third_party/jpeg-xl/lib/jpegli/bitstream.h
new file mode 100644
index 0000000000..18b6a09c29
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bitstream.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_BITSTREAM_H_
+#define LIB_JPEGLI_BITSTREAM_H_
+
+#include <initializer_list>
+#include <vector>
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void WriteOutput(j_compress_ptr cinfo, const uint8_t* buf, size_t bufsize);
+void WriteOutput(j_compress_ptr cinfo, const std::vector<uint8_t>& bytes);
+void WriteOutput(j_compress_ptr cinfo, std::initializer_list<uint8_t> bytes);
+
+void EncodeAPP0(j_compress_ptr cinfo);
+void EncodeAPP14(j_compress_ptr cinfo);
+void EncodeSOF(j_compress_ptr cinfo, bool is_baseline);
+void EncodeSOS(j_compress_ptr cinfo, int scan_index);
+void EncodeDHT(j_compress_ptr cinfo, const JPEGHuffmanCode* huffman_codes,
+               size_t num_huffman_codes, bool pre_shifted = false);
+void EncodeDQT(j_compress_ptr cinfo, bool write_all_tables, bool* is_baseline);
+bool EncodeDRI(j_compress_ptr cinfo);
+
+bool EncodeScan(j_compress_ptr cinfo, int scan_index);
+
+void EncodeSingleScan(j_compress_ptr cinfo);
+
+void WriteiMCURow(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_BITSTREAM_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/color_quantize.cc b/third_party/jpeg-xl/lib/jpegli/color_quantize.cc
new file mode 100644
index 0000000000..1079c45c9f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_quantize.cc
@@ -0,0 +1,533 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/color_quantize.h"
+
+#include <cmath>
+#include <limits>
+#include <unordered_map>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+
+namespace jpegli {
+
+namespace {
+
+// Per-component number of bits used to select a cell of the inverse colormap
+// grid.
+static constexpr int kNumColorCellBits[kMaxComponents] = {3, 4, 3, 3};
+// Per-component weight applied to component differences when colors are
+// compared against the colormap.
+static constexpr int kCompW[kMaxComponents] = {2, 3, 1, 1};
+
+// Integer power: multiplies `a` with itself `b` times. Returns 1 when b <= 0.
+int Pow(int a, int b) {
+  int result = 1;
+  for (int remaining = b; remaining > 0; --remaining) {
+    result *= a;
+  }
+  return result;
+}
+
+// Maps position i to a component index. With three output components every
+// i < 2 is mapped to 1 - i, which swaps the first two positions (0 -> 1,
+// 1 -> 0); in all other cases i is returned unchanged.
+int ComponentOrder(j_decompress_ptr cinfo, int i) {
+  const bool swap_first_two = cinfo->out_color_components == 3 && i < 2;
+  return swap_first_two ? 1 - i : i;
+}
+
+// Returns level i of N evenly spaced levels spanning 0..255, rounded to the
+// nearest integer. N == 1 would divide by zero.
+int GetColorComponent(int i, int N) {
+  const int steps = N - 1;
+  const int scaled = i * 255 + steps / 2;
+  return scaled / steps;
+}
+
+}  // namespace
+
+// Builds a colormap that is the Cartesian product of evenly spaced levels per
+// output component, using at most min(desired_number_of_colors, 256) colors.
+//
+// Results: m->num_colors_[c] (levels per component),
+// cinfo->actual_number_of_colors, cinfo->colormap, and m->colormap_lut_, a
+// table with 256 entries per component whose entries for a pixel's samples
+// add up to a colormap index. Reports an error through JPEGLI_ERROR when not
+// even two levels per component fit into the requested number of colors.
+void ChooseColorMap1Pass(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  int components = cinfo->out_color_components;
+  int desired = std::min(cinfo->desired_number_of_colors, 256);
+  // Largest num such that num^components still fits into `desired`.
+  int num = 1;
+  while (Pow(num + 1, components) <= desired) {
+    ++num;
+  }
+  if (num == 1) {
+    JPEGLI_ERROR("Too few colors (%d) in requested colormap", desired);
+  }
+  int actual = Pow(num, components);
+  for (int i = 0; i < components; ++i) {
+    m->num_colors_[i] = num;
+  }
+  // Use the remaining budget: give single components one more level, in
+  // ComponentOrder order, as long as the product stays within `desired`.
+  // Stop once a full round adds nothing.
+  while (actual < desired) {
+    int total = actual;
+    for (int i = 0; i < components; ++i) {
+      int c = ComponentOrder(cinfo, i);
+      int new_total = (actual / m->num_colors_[c]) * (m->num_colors_[c] + 1);
+      if (new_total <= desired) {
+        ++m->num_colors_[c];
+        actual = new_total;
+      }
+    }
+    if (actual == total) {
+      break;
+    }
+  }
+  cinfo->actual_number_of_colors = actual;
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+      reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE, actual, components);
+  // Enumerate all level combinations; next_color is a counter in which the
+  // last component advances fastest.
+  int next_color[kMaxComponents] = {0};
+  for (int i = 0; i < actual; ++i) {
+    for (int c = 0; c < components; ++c) {
+      cinfo->colormap[c][i] =
+          GetColorComponent(next_color[c], m->num_colors_[c]);
+    }
+    int c = components - 1;
+    while (c > 0 && next_color[c] + 1 == m->num_colors_[c]) {
+      next_color[c--] = 0;
+    }
+    ++next_color[c];
+  }
+  if (!m->colormap_lut_) {
+    m->colormap_lut_ = Allocate<uint8_t>(cinfo, components * 256, JPOOL_IMAGE);
+  }
+  // For each component, stride is the product of the level counts of all
+  // later components, matching the enumeration order used above.
+  int stride = actual;
+  for (int c = 0; c < components; ++c) {
+    int N = m->num_colors_[c];
+    stride /= N;
+    for (int i = 0; i < 256; ++i) {
+      // Level index assigned to sample value i, pre-multiplied by stride.
+      int index = ((2 * i - 1) * (N - 1) + 254) / 510;
+      m->colormap_lut_[c * 256 + i] = index * stride;
+    }
+  }
+}
+
+namespace {
+
+// 2^13 priority levels for the PQ seems to be a good compromise between
+// accuracy, running time and stack space usage.
+static const int kMaxPriority = 1 << 13;
+static const int kMaxLevel = 3;
+
+// Interleaves the bits of r, g and b, most significant bit first, into one
+// 24-bit key in which every bit triple is ordered (r, g, b). With this layout
+// the keys for the coarser resolutions of the multi-resolution grid are
+// obtained by just shifting the key to the right.
+inline int InterlaceBitsRGB(uint8_t r, uint8_t g, uint8_t b) {
+  int key = 0;
+  for (int bit = 7; bit >= 0; --bit) {
+    const int r_bit = (r >> bit) & 1;
+    const int g_bit = (g >> bit) & 1;
+    const int b_bit = (b >> bit) & 1;
+    key = (key << 3) | (r_bit << 2) | (g_bit << 1) | b_bit;
+  }
+  return key;
+}
+
+// Computes the priority of a color from its current distance to the palette
+// (d), its population count (n) and the signals of the multi-resolution
+// grid: for every level whose radius is smaller than d, density[level] *
+// (d - radius[level]) is added to d * n. The sum is divided by 16 and capped
+// at kMaxPriority - 1.
+//
+// `density` and `radius` must each point to kMaxLevel entries.
+inline int Priority(int d, int n, const int* density, const int* radius) {
+  // Accumulate in 64 bits: for large images d * n and density * (d - radius)
+  // can exceed the range of int, and signed overflow is undefined behavior
+  // (a wrapped, negative priority would index outside the bucket array).
+  int64_t p = static_cast<int64_t>(d) * n;
+  for (int level = 0; level < kMaxLevel; ++level) {
+    if (d > radius[level]) {
+      p += static_cast<int64_t>(density[level]) * (d - radius[level]);
+    }
+  }
+  return static_cast<int>(std::min<int64_t>(kMaxPriority - 1, p >> 4));
+}
+
+// Squared RGB distance between two colors plus a squared intensity
+// difference, where the intensity difference is the weighted sum
+// 2 * dr + 5 * dg + 1 * db normalized by 2^3.
+inline int ColorIntQuadDistanceRGB(uint8_t r1, uint8_t g1, uint8_t b1,
+                                   uint8_t r2, uint8_t g2, uint8_t b2) {
+  // Weights of red, green and blue in the intensity calculation.
+  static constexpr int kWeight[3] = {2, 5, 1};
+  // Normalization of the intensity (2^kShift); applied to the squared value.
+  static constexpr int kShift = 3;
+  const int delta[3] = {r1 - r2, g1 - g2, b1 - b2};
+  int squared_sum = 0;
+  int intensity_delta = 0;
+  for (int c = 0; c < 3; ++c) {
+    squared_sum += delta[c] * delta[c];
+    intensity_delta += kWeight[c] * delta[c];
+  }
+  return squared_sum + ((intensity_delta * intensity_delta) >> (2 * kShift));
+}
+
+// Returns half of the square root of d, rounded to the nearest integer.
+inline int ScaleQuadDistanceRGB(int d) {
+  const double half_root = std::sqrt(0.25 * d);
+  return static_cast<int>(half_root + 0.5);
+}
+
+// Inserts color `index` into the palette as entry k and updates the minimal
+// distances, the clustering and the quantization error accordingly: every
+// color that is closer to the new entry than to its previous nearest entry
+// is reassigned to k.
+void AddToRGBPalette(const uint8_t* red, const uint8_t* green,
+                     const uint8_t* blue,
+                     const int* count,  // histogram of colors
+                     const int index,   // index of color to be added
+                     const int k,       // size of current palette
+                     const int n,       // number of colors
+                     int* dist,         // array of distances from palette
+                     int* cluster,      // mapping of color indices to palette
+                     int* center,       // the inverse mapping
+                     int64_t* error) {  // measure of the quantization error
+  // Register the new entry; it no longer contributes to the error.
+  center[k] = index;
+  cluster[index] = k;
+  *error -= static_cast<int64_t>(dist[index]) * count[index];
+  dist[index] = 0;
+
+  const uint8_t new_r = red[index];
+  const uint8_t new_g = green[index];
+  const uint8_t new_b = blue[index];
+  for (int other = 0; other < n; ++other) {
+    // Colors at distance 0 (palette entries) cannot get any closer.
+    if (dist[other] <= 0) continue;
+    const int candidate = ColorIntQuadDistanceRGB(
+        new_r, new_g, new_b, red[other], green[other], blue[other]);
+    if (candidate >= dist[other]) continue;
+    const int improvement = candidate - dist[other];
+    *error += static_cast<int64_t>(improvement) * count[other];
+    dist[other] = candidate;
+    cluster[other] = k;
+  }
+}
+
+struct RGBPixelHasher {
+  // A quick but good-enough hash: the bits above bit 12 of the 24-bit RGB
+  // value are folded into the lower 12 bits, then the result is scrambled by
+  // a 32-bit multiplication.
+  size_t operator()(uint32_t a) const {
+    const uint32_t folded = a ^ (a >> 12);
+    return folded * 0x9e3779b9;
+  }
+};
+
+struct WangHasher {
+  // Thomas Wang's hash: nearly perfect and still quite fast. The pixel hash
+  // above is deliberately simpler, because it is called once per pixel and
+  // dominates the running time, and it starts with a large table. This hash
+  // is used for the histogram, where the number of calls is proportional to
+  // the number of unique colors in the image, which is hopefully much
+  // smaller. The difference is slight anyway; e.g. replacing RGBPixelHasher
+  // with WangHasher only slows things down by 5% on an Opteron.
+  size_t operator()(uint32_t a) const {
+    uint32_t h = (a ^ 61) ^ (a >> 16);
+    h += h << 3;
+    h ^= h >> 4;
+    h *= 0x27d4eb2d;
+    h ^= h >> 15;
+    return h;
+  }
+};
+
+// Build an index of all the different colors in the input
+// image. To do this we map the 24 bit RGB representation of the colors
+// to a unique integer index assigned to the different colors in order of
+// appearance in the image.  Return the number of unique colors found.
+// The colors are pre-quantized to 3 * 6 bits precision.
+//
+//   image       num_pixels interleaved 3-byte pixels, read in r, g, b order.
+//   count       in/out: count[index] is incremented once per pixel; the
+//               caller provides the initial values.
+//   red, green, blue
+//               out: pre-quantized components of the color with each index.
+// The output arrays are not bounds-checked; they need one entry per distinct
+// pre-quantized color.
+static int BuildRGBColorIndex(const uint8_t* const image, int const num_pixels,
+                              int* const count, uint8_t* const red,
+                              uint8_t* const green, uint8_t* const blue) {
+  // Impossible because rgb are in the low 24 bits, and the upper 8 bits is 0.
+  const uint32_t impossible_pixel_value = 0x10000000;
+  std::unordered_map<uint32_t, int, RGBPixelHasher> index_map(1 << 12);
+  std::unordered_map<uint32_t, int, RGBPixelHasher>::iterator index_map_lookup;
+  const uint8_t* imagep = &image[0];
+  uint32_t prev_pixel = impossible_pixel_value;
+  int index = 0;
+  int n = 0;
+  for (int i = 0; i < num_pixels; ++i) {
+    // Keep the upper 6 bits of every component and add 2, which leaves 64
+    // possible values per component.
+    uint8_t r = ((*imagep++) & 0xfc) + 2;
+    uint8_t g = ((*imagep++) & 0xfc) + 2;
+    uint8_t b = ((*imagep++) & 0xfc) + 2;
+    uint32_t pixel = (b << 16) | (g << 8) | r;
+    // A pixel equal to its predecessor reuses `index` without a map lookup.
+    if (pixel != prev_pixel) {
+      prev_pixel = pixel;
+      index_map_lookup = index_map.find(pixel);
+      if (index_map_lookup != index_map.end()) {
+        index = index_map_lookup->second;
+      } else {
+        // First occurrence: assign the next free index and record the color.
+        index_map[pixel] = index = n++;
+        red[index] = r;
+        green[index] = g;
+        blue[index] = b;
+      }
+    }
+    ++count[index];
+  }
+  return n;
+}
+
+}  // namespace
+
+// Chooses a palette of at most min(desired_number_of_colors, 256) colors for
+// the RGB pixels in m->pixels_ and stores it in cinfo->colormap and
+// cinfo->actual_number_of_colors.
+//
+// Outline:
+//   1. Index the distinct pre-quantized colors and count their pixels.
+//   2. Seed the palette with every color whose count exceeds
+//      4 * num_pixels / max_palette_size, or with the most frequent color if
+//      there is no such color.
+//   3. Compute density and radius values per color on a kMaxLevel-level
+//      multi-resolution grid.
+//   4. Keep the remaining colors in buckets indexed by priority and add the
+//      highest-priority color to the palette until the palette is full, no
+//      candidates are left, or the quantization error has stayed below the
+//      threshold for long enough.
+//
+// Errors are reported through JPEGLI_ERROR: the output color space must be
+// JCS_RGB and at least one color must be requested.
+void ChooseColorMap2Pass(j_decompress_ptr cinfo) {
+  if (cinfo->out_color_space != JCS_RGB) {
+    JPEGLI_ERROR("Two-pass quantizer must use RGB output color space.");
+  }
+  // A palette size of zero would divide by zero below.
+  if (cinfo->desired_number_of_colors < 1) {
+    JPEGLI_ERROR("Too few colors (%d) in requested colormap",
+                 cinfo->desired_number_of_colors);
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  // Widen before multiplying so that large images do not wrap in 32 bits.
+  const size_t num_pixels =
+      static_cast<size_t>(cinfo->output_width) * cinfo->output_height;
+  // BuildRGBColorIndex keeps 6 bits per component, so there can never be
+  // more than 2^18 distinct colors, regardless of the image size.
+  const int max_color_count = 1 << 18;
+  // center[] below has room for 256 palette entries; never grow past that.
+  const int max_palette_size = std::min(cinfo->desired_number_of_colors, 256);
+  std::unique_ptr<uint8_t[]> red(new uint8_t[max_color_count]);
+  std::unique_ptr<uint8_t[]> green(new uint8_t[max_color_count]);
+  std::unique_ptr<uint8_t[]> blue(new uint8_t[max_color_count]);
+  std::vector<int> count(max_color_count, 0);
+  // number of colors
+  int n = BuildRGBColorIndex(m->pixels_, num_pixels, &count[0], &red[0],
+                             &green[0], &blue[0]);
+
+  std::vector<int> dist(n, std::numeric_limits<int>::max());
+  std::vector<int> cluster(n);
+  std::vector<bool> in_palette(n, false);
+  int center[256];
+  int k = 0;  // palette size
+  const int count_threshold = (num_pixels * 4) / max_palette_size;
+  static constexpr int kAveragePixelErrorThreshold = 1;
+  const int64_t error_threshold = num_pixels * kAveragePixelErrorThreshold;
+  int64_t error = 0;  // quantization error
+
+  // Seed the palette with all dominant colors. Fewer than
+  // max_palette_size / 4 colors can exceed the threshold, so center[] cannot
+  // overflow here.
+  int max_count = 0;
+  int winner = 0;
+  for (int i = 0; i < n; ++i) {
+    if (count[i] > max_count) {
+      max_count = count[i];
+      winner = i;
+    }
+    if (!in_palette[i] && count[i] > count_threshold) {
+      AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], i, k++, n,
+                      &dist[0], &cluster[0], &center[0], &error);
+      in_palette[i] = true;
+    }
+  }
+  // No dominant color: start from the most frequent one.
+  if (k == 0) {
+    AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], winner, k++, n,
+                    &dist[0], &cluster[0], &center[0], &error);
+    in_palette[winner] = true;
+  }
+
+  // Calculation of the multi-resolution density grid.
+  std::vector<int> density(n * kMaxLevel);
+  std::vector<int> radius(n * kMaxLevel);
+  std::unordered_map<uint32_t, int, WangHasher> histogram[kMaxLevel];
+
+  // Population of every grid cell, per level, over the colors that are not
+  // in the palette. Each level drops three more bits of the interleaved key.
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      const int key = InterlaceBitsRGB(red[i], green[i], blue[i]) >> 6;
+      for (int level = 0; level < kMaxLevel; ++level) {
+        histogram[level][key >> (3 * level)] += count[i];
+      }
+    }
+  }
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      for (int level = 0; level < kMaxLevel; ++level) {
+        // Scaled distance from the color to the farthest corner of its cell.
+        const int mask = (4 << level) - 1;
+        const int rd = std::max(red[i] & mask, mask - (red[i] & mask));
+        const int gd = std::max(green[i] & mask, mask - (green[i] & mask));
+        const int bd = std::max(blue[i] & mask, mask - (blue[i] & mask));
+        radius[i * kMaxLevel + level] =
+            ScaleQuadDistanceRGB(ColorIntQuadDistanceRGB(0, 0, 0, rd, gd, bd));
+      }
+      // Level 0 excludes the color itself; every higher level counts only
+      // what the next finer cell did not already contain.
+      const int key = InterlaceBitsRGB(red[i], green[i], blue[i]) >> 6;
+      if (kMaxLevel > 0) {
+        density[i * kMaxLevel] = histogram[0][key] - count[i];
+      }
+      for (int level = 1; level < kMaxLevel; ++level) {
+        density[i * kMaxLevel + level] =
+            (histogram[level][key >> (3 * level)] -
+             histogram[level - 1][key >> (3 * level - 3)]);
+      }
+    }
+  }
+
+  // Calculate the initial error now that the palette has been initialized.
+  error = 0;
+  for (int i = 0; i < n; ++i) {
+    error += static_cast<int64_t>(dist[i]) * static_cast<int64_t>(count[i]);
+  }
+
+  // Bucket queue: bucket_array[p] holds the colors whose priority was p when
+  // they were last evaluated.
+  std::unique_ptr<std::vector<int>[]> bucket_array(
+      new std::vector<int>[kMaxPriority]);
+  int top_priority = -1;
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      int priority = Priority(ScaleQuadDistanceRGB(dist[i]), count[i],
+                              &density[i * kMaxLevel], &radius[i * kMaxLevel]);
+      bucket_array[priority].push_back(i);
+      top_priority = std::max(priority, top_priority);
+    }
+  }
+  double error_accum = 0;
+  while (top_priority >= 0 && k < max_palette_size) {
+    // Once the error is below the threshold, only a limited amount of
+    // further refinement is done before stopping early.
+    if (error < error_threshold) {
+      error_accum += std::min(error_threshold, error_threshold - error);
+      if (error_accum >= 10 * error_threshold) {
+        break;
+      }
+    }
+    // Priorities only become stale downwards here: re-evaluate the candidate
+    // and either move it to its current bucket or add it to the palette.
+    int i = bucket_array[top_priority].back();
+    int priority = Priority(ScaleQuadDistanceRGB(dist[i]), count[i],
+                            &density[i * kMaxLevel], &radius[i * kMaxLevel]);
+    if (priority < top_priority) {
+      bucket_array[priority].push_back(i);
+    } else {
+      AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], i, k++, n,
+                      &dist[0], &cluster[0], &center[0], &error);
+    }
+    bucket_array[top_priority].pop_back();
+    while (top_priority >= 0 && bucket_array[top_priority].empty()) {
+      --top_priority;
+    }
+  }
+
+  cinfo->actual_number_of_colors = k;
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+      reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE, k, 3);
+  for (int i = 0; i < k; ++i) {
+    int index = center[i];
+    cinfo->colormap[0][i] = red[index];
+    cinfo->colormap[1][i] = green[index];
+    cinfo->colormap[2][i] = blue[index];
+  }
+}
+
+namespace {
+
+// Appends to *candidates the indexes of all colormap entries that can be the
+// nearest entry for some color inside the given cell.
+//
+//   ncomp  number of components to consider.
+//   cell   per-component cell coordinates; component c is divided into
+//          2^kNumColorCellBits[c] cells.
+//
+// An entry is kept when its smallest possible weighted squared distance to
+// the cell is below the smallest, over all entries, of the largest possible
+// distance to the cell.
+void FindCandidatesForCell(j_decompress_ptr cinfo, int ncomp, int cell[],
+                           std::vector<uint8_t>* candidates) {
+  int cell_min[kMaxComponents];
+  int cell_max[kMaxComponents];
+  int cell_center[kMaxComponents];
+  // Range of sample values covered by the cell, per component.
+  for (int c = 0; c < ncomp; ++c) {
+    cell_min[c] = cell[c] << (8 - kNumColorCellBits[c]);
+    cell_max[c] = cell_min[c] + (1 << (8 - kNumColorCellBits[c])) - 1;
+    cell_center[c] = (cell_min[c] + cell_max[c]) >> 1;
+  }
+  int min_maxdist = std::numeric_limits<int>::max();
+  // NOTE(review): assumes actual_number_of_colors <= 256 — confirm.
+  int mindist[256];
+  for (int i = 0; i < cinfo->actual_number_of_colors; ++i) {
+    int dmin = 0;
+    int dmax = 0;
+    for (int c = 0; c < ncomp; ++c) {
+      int palette_c = cinfo->colormap[c][i];
+      // dminc / dmaxc: distance from the entry to the nearest / farthest
+      // value of the cell in this component. dminc stays 0 when the entry
+      // lies within the cell's range.
+      int dminc = 0, dmaxc;
+      if (palette_c < cell_min[c]) {
+        dminc = cell_min[c] - palette_c;
+        dmaxc = cell_max[c] - palette_c;
+      } else if (palette_c > cell_max[c]) {
+        dminc = palette_c - cell_max[c];
+        dmaxc = palette_c - cell_min[c];
+      } else if (palette_c > cell_center[c]) {
+        dmaxc = palette_c - cell_min[c];
+      } else {
+        dmaxc = cell_max[c] - palette_c;
+      }
+      dminc *= kCompW[c];
+      dmaxc *= kCompW[c];
+      dmin += dminc * dminc;
+      dmax += dmaxc * dmaxc;
+    }
+    mindist[i] = dmin;
+    min_maxdist = std::min(dmax, min_maxdist);
+  }
+  // An entry whose best case is not better than the worst case of some other
+  // entry is never selected for this cell.
+  for (int i = 0; i < cinfo->actual_number_of_colors; ++i) {
+    if (mindist[i] < min_maxdist) {
+      candidates->push_back(i);
+    }
+  }
+}
+
+}  // namespace
+
+// Rebuilds m->candidate_lists_: one list of candidate colormap entries for
+// every cell of the color grid defined by kNumColorCellBits, then clears
+// m->regenerate_inverse_colormap_.
+void CreateInverseColorMap(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  const int ncomp = cinfo->out_color_components;
+  int total_cells = 1;
+  for (int c = 0; c < ncomp; ++c) {
+    total_cells <<= kNumColorCellBits[c];
+  }
+  m->candidate_lists_.resize(total_cells);
+
+  // Cell coordinates of the list being filled; the last component advances
+  // fastest.
+  int cell[kMaxComponents] = {0};
+  for (auto& candidates : m->candidate_lists_) {
+    candidates.clear();
+    FindCandidatesForCell(cinfo, ncomp, cell, &candidates);
+    int c = ncomp - 1;
+    for (; c > 0 && cell[c] + 1 == (1 << kNumColorCellBits[c]); --c) {
+      cell[c] = 0;
+    }
+    ++cell[c];
+  }
+  m->regenerate_inverse_colormap_ = false;
+}
+
+// Returns the colormap index chosen for `pixel`, which must hold
+// cinfo->out_color_components samples.
+//
+// When m->quant_mode_ == 1 the index is the sum of the per-component entries
+// of m->colormap_lut_. Otherwise the pixel's grid cell is determined and the
+// candidate list of that cell is searched for the entry with the smallest
+// weighted squared distance.
+int LookupColorIndex(j_decompress_ptr cinfo, JSAMPLE* pixel) {
+  jpeg_decomp_master* m = cinfo->master;
+  int num_channels = cinfo->out_color_components;
+  int index = 0;
+  if (m->quant_mode_ == 1) {
+    for (int c = 0; c < num_channels; ++c) {
+      index += m->colormap_lut_[c * 256 + pixel[c]];
+    }
+  } else {
+    // Cell index: the top kNumColorCellBits[c] bits of every component, with
+    // the last component in the lowest bits.
+    size_t cell_idx = 0;
+    size_t stride = 1;
+    for (int c = num_channels - 1; c >= 0; --c) {
+      cell_idx += (pixel[c] >> (8 - kNumColorCellBits[c])) * stride;
+      stride <<= kNumColorCellBits[c];
+    }
+    JXL_ASSERT(cell_idx < m->candidate_lists_.size());
+    int mindist = std::numeric_limits<int>::max();
+    const auto& candidates = m->candidate_lists_[cell_idx];
+    // Search the candidates; on equal distance the earlier candidate wins.
+    // index stays 0 if the list is empty.
+    for (uint8_t i : candidates) {
+      int dist = 0;
+      for (int c = 0; c < num_channels; ++c) {
+        int d = (cinfo->colormap[c][i] - pixel[c]) * kCompW[c];
+        dist += d * d;
+      }
+      if (dist < mindist) {
+        mindist = dist;
+        index = i;
+      }
+    }
+  }
+  JXL_ASSERT(index < cinfo->actual_number_of_colors);
+  return index;
+}
+
+// Fills the per-component dither tables m->dither_[c] (allocated on first
+// use) and sets m->dither_size_ / m->dither_mask_ for a 4x4 table.
+//
+// With spread = 1 / (num_colors_[c] - 1), entry idx of component c is
+// kBaseDitherMatrix[idx] * spread / 16 - spread / 2.
+void CreateOrderedDitherTables(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  static constexpr size_t kDitherSize = 4;
+  static constexpr size_t kDitherMask = kDitherSize - 1;
+  // 4x4 matrix holding each of the values 0..15 exactly once.
+  static constexpr float kBaseDitherMatrix[] = {
+      0,  8,  2,  10,  //
+      12, 4,  14, 6,   //
+      3,  11, 1,  9,   //
+      15, 7,  13, 5,   //
+  };
+  m->dither_size_ = kDitherSize;
+  m->dither_mask_ = kDitherMask;
+  size_t ncells = m->dither_size_ * m->dither_size_;
+  for (int c = 0; c < cinfo->out_color_components; ++c) {
+    // NOTE(review): divides by zero if num_colors_[c] == 1 — presumably
+    // excluded by the code that sets num_colors_; confirm.
+    float spread = 1.0f / (m->num_colors_[c] - 1);
+    float mul = spread / ncells;
+    float offset = 0.5f * spread;
+    if (m->dither_[c] == nullptr) {
+      m->dither_[c] = Allocate<float>(cinfo, ncells, JPOOL_IMAGE_ALIGNED);
+    }
+    for (size_t idx = 0; idx < ncells; ++idx) {
+      m->dither_[c][idx] = kBaseDitherMatrix[idx] * mul - offset;
+    }
+  }
+}
+
+// Prepares the error rows of every output component: two rows of
+// cinfo->output_width floats each, stored at m->error_row_[c] and
+// m->error_row_[c + kMaxComponents]. The rows are allocated the first time
+// they are needed and zeroed on every call.
+// NOTE(review): "FS" presumably stands for Floyd-Steinberg dithering.
+void InitFSDitherState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  const size_t row_bytes = cinfo->output_width * sizeof(float);
+  for (int c = 0; c < cinfo->out_color_components; ++c) {
+    if (m->error_row_[c] == nullptr) {
+      m->error_row_[c] =
+          Allocate<float>(cinfo, cinfo->output_width, JPOOL_IMAGE_ALIGNED);
+      m->error_row_[c + kMaxComponents] =
+          Allocate<float>(cinfo, cinfo->output_width, JPOOL_IMAGE_ALIGNED);
+    }
+    // memset takes its fill value as an int that is converted to a byte;
+    // pass the integer 0 rather than a floating-point literal.
+    memset(m->error_row_[c], 0, row_bytes);
+    memset(m->error_row_[c + kMaxComponents], 0, row_bytes);
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/color_quantize.h b/third_party/jpeg-xl/lib/jpegli/color_quantize.h
new file mode 100644
index 0000000000..36a92d2f77
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_quantize.h
@@ -0,0 +1,30 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COLOR_QUANTIZE_H_
+#define LIB_JPEGLI_COLOR_QUANTIZE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ChooseColorMap1Pass(j_decompress_ptr cinfo);
+
+void ChooseColorMap2Pass(j_decompress_ptr cinfo);
+
+void CreateInverseColorMap(j_decompress_ptr cinfo);
+
+void CreateOrderedDitherTables(j_decompress_ptr cinfo);
+
+void InitFSDitherState(j_decompress_ptr cinfo);
+
+int LookupColorIndex(j_decompress_ptr cinfo, JSAMPLE* pixel);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COLOR_QUANTIZE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/color_transform.cc b/third_party/jpeg-xl/lib/jpegli/color_transform.cc
new file mode 100644
index 0000000000..020a6fd80c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_transform.cc
@@ -0,0 +1,281 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/color_transform.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/color_transform.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+void YCbCrToRGB(float* row[kMaxComponents], size_t xsize) {
+  // In-place conversion: planes 0/1/2 hold Y/Cb/Cr on entry, R/G/B on return.
+  const HWY_CAPPED(float, 8) d;
+  float* JXL_RESTRICT plane0 = row[0];
+  float* JXL_RESTRICT plane1 = row[1];
+  float* JXL_RESTRICT plane2 = row[2];
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto r_from_cr = Set(d, 1.402f);
+  const auto g_from_cb = Set(d, -0.114f * 1.772f / 0.587f);
+  const auto g_from_cr = Set(d, -0.299f * 1.402f / 0.587f);
+  const auto b_from_cb = Set(d, 1.772f);
+
+  // Whole vectors are processed, so the last step may run past xsize.
+  for (size_t i = 0; i < xsize; i += Lanes(d)) {
+    const auto luma = Load(d, plane0 + i);
+    const auto cb = Load(d, plane1 + i);
+    const auto cr = Load(d, plane2 + i);
+    const auto g_partial = MulAdd(g_from_cb, cb, luma);
+    Store(MulAdd(r_from_cr, cr, luma), d, plane0 + i);
+    Store(MulAdd(g_from_cr, cr, g_partial), d, plane1 + i);
+    Store(MulAdd(b_from_cb, cb, luma), d, plane2 + i);
+  }
+}
+
+void YCCKToCMYK(float* row[kMaxComponents], size_t xsize) {
+  // Converts planes 0..2 to RGB in place, then replaces each sample v of those
+  // planes by (-1/255 - v); plane 3 is not touched here.
+  YCbCrToRGB(row, xsize);
+  const HWY_CAPPED(float, 8) d;
+  const auto neg_unit = Set(d, -1.0f / 255.0f);
+  for (int c = 0; c < 3; ++c) {
+    float* JXL_RESTRICT plane = row[c];
+    for (size_t i = 0; i < xsize; i += Lanes(d)) {
+      Store(Sub(neg_unit, Load(d, plane + i)), d, plane + i);
+    }
+  }
+}
+
+void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) {
+  // In-place conversion: planes 0/1/2 hold R/G/B on entry, Y/Cb/Cr on return.
+  const HWY_CAPPED(float, 8) d;
+  float* JXL_RESTRICT plane0 = row[0];
+  float* JXL_RESTRICT plane1 = row[1];
+  float* JXL_RESTRICT plane2 = row[2];
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto chroma_bias = Set(d, 128.0f);
+  const auto wr = Set(d, 0.299f);  // NTSC luma
+  const auto wg = Set(d, 0.587f);
+  const auto wb = Set(d, 0.114f);
+  const auto amp_r = Set(d, 0.701f);
+  const auto amp_b = Set(d, 0.886f);
+  const auto diff_r = Add(amp_r, wr);
+  const auto diff_b = Add(amp_b, wb);
+  const auto one = Set(d, 1.0f);
+  const auto norm_r = Div(one, Add(amp_r, Add(wg, wb)));
+  const auto norm_b = Div(one, Add(wr, Add(wg, amp_b)));
+
+  for (size_t i = 0; i < xsize; i += Lanes(d)) {
+    const auto red = Load(d, plane0 + i);
+    const auto green = Load(d, plane1 + i);
+    const auto blue = Load(d, plane2 + i);
+    // Luma is the weighted channel sum; each chroma value is the normalized
+    // difference between a scaled channel and luma, offset by 128.
+    const auto luma = Add(Mul(red, wr), Add(Mul(green, wg), Mul(blue, wb)));
+    const auto cb = MulAdd(Sub(Mul(blue, diff_b), luma), norm_b, chroma_bias);
+    const auto cr = MulAdd(Sub(Mul(red, diff_r), luma), norm_r, chroma_bias);
+    Store(luma, d, plane0 + i);
+    Store(cb, d, plane1 + i);
+    Store(cr, d, plane2 + i);
+  }
+}
+
+void CMYKToYCCK(float* row[kMaxComponents], size_t xsize) {
+  // Replaces each sample v of planes 0..2 by (255 - v), then converts those
+  // planes from RGB to YCbCr in place; plane 3 is not touched here.
+  const HWY_CAPPED(float, 8) d;
+  const auto full_scale = Set(d, 255.0f);
+  for (int c = 0; c < 3; ++c) {
+    float* JXL_RESTRICT plane = row[c];
+    for (size_t i = 0; i < xsize; i += Lanes(d)) {
+      Store(Sub(full_scale, Load(d, plane + i)), d, plane + i);
+    }
+  }
+  RGBToYCbCr(row, xsize);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(CMYKToYCCK);
+HWY_EXPORT(YCCKToCMYK);
+HWY_EXPORT(YCbCrToRGB);
+HWY_EXPORT(RGBToYCbCr);
+
+bool CheckColorSpaceComponents(int num_components, J_COLOR_SPACE colorspace) {
+  // Returns whether num_components is a valid channel count for colorspace.
+  if (colorspace == JCS_GRAYSCALE) {
+    return num_components == 1;
+  }
+
+  const bool has_three = colorspace == JCS_RGB || colorspace == JCS_YCbCr ||
+                         colorspace == JCS_EXT_RGB ||
+                         colorspace == JCS_EXT_BGR;
+  if (has_three) {
+    return num_components == 3;
+  }
+
+  static constexpr J_COLOR_SPACE kFourChannel[] = {
+      JCS_CMYK,     JCS_YCCK,     JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+      JCS_EXT_XRGB, JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ABGR, JCS_EXT_ARGB};
+  for (J_COLOR_SPACE four_channel_space : kFourChannel) {
+    if (colorspace == four_channel_space) {
+      return num_components == 4;
+    }
+  }
+
+  // Unrecognized colorspaces can have any number of channels, since no
+  // color transform will be performed on them.
+  return true;
+}
+
+void NullTransform(float* row[kMaxComponents], size_t len) {}  // no-op
+
+void GrayscaleToRGB(float* row[kMaxComponents], size_t len) {
+  // Replicate the first plane (row[0]) into planes 1 and 2.
+  for (int c = 1; c <= 2; ++c) memcpy(row[c], row[0], len * sizeof(float));
+}
+
+void GrayscaleToYCbCr(float* row[kMaxComponents], size_t len) {
+  memset(row[1], 0, len * sizeof(row[1][0]));  // zero-fill plane 1
+  memset(row[2], 0, len * sizeof(row[2][0]));  // zero-fill plane 2
+}  // NOTE(review): RGBToYCbCr yields 128 for gray input; confirm 0 here.
+
+void ChooseColorTransform(j_compress_ptr cinfo) {
+  // Encoder side: validates the channel counts of the input and JPEG
+  // colorspaces, then stores in cinfo->master->color_transform the in-place
+  // row transform that maps in_color_space to jpeg_color_space.
+  jpeg_comp_master* m = cinfo->master;
+  if (!CheckColorSpaceComponents(cinfo->input_components,
+                                 cinfo->in_color_space)) {
+    JPEGLI_ERROR("Invalid number of input components %d for colorspace %d",
+                 cinfo->input_components, cinfo->in_color_space);
+  }
+  if (!CheckColorSpaceComponents(cinfo->num_components,
+                                 cinfo->jpeg_color_space)) {
+    JPEGLI_ERROR("Invalid number of components %d for colorspace %d",
+                 cinfo->num_components, cinfo->jpeg_color_space);
+  }
+  if (cinfo->jpeg_color_space == cinfo->in_color_space) {
+    if (cinfo->num_components != cinfo->input_components) {
+      JPEGLI_ERROR("Input/output components mismatch:  %d vs %d",
+                   cinfo->input_components, cinfo->num_components);
+    }
+    // No color transform requested.
+    m->color_transform = NullTransform;
+    return;
+  }
+
+  if (cinfo->in_color_space == JCS_RGB && m->xyb_mode) {
+    JPEGLI_ERROR("Color transform on XYB colorspace is not supported.");
+  }
+
+  // True iff the requested conversion is exactly `from` -> `to`.
+  const auto converts = [cinfo](J_COLOR_SPACE from, J_COLOR_SPACE to) {
+    return cinfo->in_color_space == from && cinfo->jpeg_color_space == to;
+  };
+
+  // Stays null when no supported conversion matches; reported below.
+  m->color_transform = nullptr;
+  if (converts(JCS_RGB, JCS_GRAYSCALE) || converts(JCS_RGB, JCS_YCbCr)) {
+    // Same transform for both targets; it writes luma to the first plane.
+    m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+  } else if (converts(JCS_YCbCr, JCS_GRAYSCALE) ||
+             converts(JCS_YCCK, JCS_GRAYSCALE)) {
+    // Since the first luminance channel is the grayscale version of the
+    // image, nothing to do here
+    m->color_transform = NullTransform;
+  } else if (converts(JCS_GRAYSCALE, JCS_RGB)) {
+    m->color_transform = GrayscaleToRGB;
+  } else if (converts(JCS_GRAYSCALE, JCS_YCbCr)) {
+    m->color_transform = GrayscaleToYCbCr;
+  } else if (converts(JCS_CMYK, JCS_YCCK)) {
+    m->color_transform = HWY_DYNAMIC_DISPATCH(CMYKToYCCK);
+  }
+
+  if (m->color_transform == nullptr) {
+    // TODO(szabadka) Support more color transforms.
+    JPEGLI_ERROR("Unsupported color transform %d -> %d", cinfo->in_color_space,
+                 cinfo->jpeg_color_space);
+  }
+}
+
+void ChooseColorTransform(j_decompress_ptr cinfo) {
+  // Decoder side: validates the channel counts of the JPEG and output
+  // colorspaces, then stores in cinfo->master->color_transform the in-place
+  // row transform that maps jpeg_color_space to out_color_space.
+  jpeg_decomp_master* m = cinfo->master;
+  if (!CheckColorSpaceComponents(cinfo->out_color_components,
+                                 cinfo->out_color_space)) {
+    JPEGLI_ERROR("Invalid number of output components %d for colorspace %d",
+                 cinfo->out_color_components, cinfo->out_color_space);
+  }
+  if (!CheckColorSpaceComponents(cinfo->num_components,
+                                 cinfo->jpeg_color_space)) {
+    JPEGLI_ERROR("Invalid number of components %d for colorspace %d",
+                 cinfo->num_components, cinfo->jpeg_color_space);
+  }
+  if (cinfo->jpeg_color_space == cinfo->out_color_space) {
+    if (cinfo->num_components != cinfo->out_color_components) {
+      JPEGLI_ERROR("Input/output components mismatch:  %d vs %d",
+                   cinfo->num_components, cinfo->out_color_components);
+    }
+    // No color transform requested.
+    m->color_transform = NullTransform;
+    return;
+  }
+
+  // True iff the requested conversion is exactly `from` -> `to`.
+  const auto converts = [cinfo](J_COLOR_SPACE from, J_COLOR_SPACE to) {
+    return cinfo->jpeg_color_space == from && cinfo->out_color_space == to;
+  };
+  // Stays null when no supported conversion matches; reported below.
+  m->color_transform = nullptr;
+  if (converts(JCS_GRAYSCALE, JCS_RGB)) {
+    m->color_transform = GrayscaleToRGB;
+  } else if (converts(JCS_RGB, JCS_GRAYSCALE)) {
+    m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+  } else if (converts(JCS_YCbCr, JCS_RGB)) {
+    m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGB);
+  } else if (converts(JCS_YCbCr, JCS_GRAYSCALE)) {
+    m->color_transform = NullTransform;
+  } else if (converts(JCS_YCCK, JCS_CMYK)) {
+    m->color_transform = HWY_DYNAMIC_DISPATCH(YCCKToCMYK);
+  }
+
+  if (m->color_transform == nullptr) {
+    // TODO(szabadka) Support more color transforms.
+    JPEGLI_ERROR("Unsupported color transform %d -> %d",
+                 cinfo->jpeg_color_space, cinfo->out_color_space);
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/color_transform.h b/third_party/jpeg-xl/lib/jpegli/color_transform.h
new file mode 100644
index 0000000000..27570858f7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_transform.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COLOR_TRANSFORM_H_
+#define LIB_JPEGLI_COLOR_TRANSFORM_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void ChooseColorTransform(j_compress_ptr cinfo);  // encoder-side selection
+
+void ChooseColorTransform(j_decompress_ptr cinfo);  // decoder-side selection
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COLOR_TRANSFORM_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/common.cc b/third_party/jpeg-xl/lib/jpegli/common.cc
new file mode 100644
index 0000000000..5f34372f3e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/common.cc
@@ -0,0 +1,59 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/common.h"
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/memory_manager.h"
+
+void jpegli_abort(j_common_ptr cinfo) {
+  if (cinfo->mem == nullptr) return;
+  // Free every pool except the permanent one, in increasing pool order.
+  for (int pool = 0; pool < JPOOL_NUMPOOLS; ++pool) {
+    if (pool != JPOOL_PERMANENT) {
+      (*cinfo->mem->free_pool)(cinfo, pool);
+    }
+  }
+  // Put the state machine back to the start state of this object's kind.
+  cinfo->global_state =
+      cinfo->is_decompressor ? jpegli::kDecStart : jpegli::kEncStart;
+}
+
+void jpegli_destroy(j_common_ptr cinfo) {
+  if (cinfo->mem == nullptr) return;
+  (*cinfo->mem->self_destruct)(cinfo);  // memory manager tears itself down
+  if (!cinfo->is_decompressor) {
+    cinfo->global_state = jpegli::kEncNull;
+    return;
+  }
+  cinfo->global_state = jpegli::kDecNull;
+  delete reinterpret_cast<j_decompress_ptr>(cinfo)->master;  // heap-owned
+}
+
+JQUANT_TBL* jpegli_alloc_quant_table(j_common_ptr cinfo) {
+  JQUANT_TBL* const qtable = jpegli::Allocate<JQUANT_TBL>(cinfo, 1);
+  qtable->sent_table = FALSE;  // a fresh table starts with sent_table cleared
+  return qtable;
+}
+
+JHUFF_TBL* jpegli_alloc_huff_table(j_common_ptr cinfo) {
+  JHUFF_TBL* const htable = jpegli::Allocate<JHUFF_TBL>(cinfo, 1);
+  htable->sent_table = FALSE;  // a fresh table starts with sent_table cleared
+  return htable;
+}
+
+int jpegli_bytes_per_sample(JpegliDataType data_type) {
+  // Bytes per sample of `data_type`; 0 for values outside the enum.
+  int num_bytes = 0;
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    num_bytes = 1;
+  } else if (data_type == JPEGLI_TYPE_UINT16) {
+    num_bytes = 2;
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    num_bytes = 4;
+  }
+  return num_bytes;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/common.h b/third_party/jpeg-xl/lib/jpegli/common.h
new file mode 100644
index 0000000000..f46b751018
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/common.h
@@ -0,0 +1,67 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains the C API of the common encoder/decoder part of the
+// libjpegli library, which is based on the C API of libjpeg, with the function
+// names changed from jpeg_* to jpegli_*, while compressor and decompressor
+// object definitions are included directly from jpeglib.h.
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_COMMON_H_
+#define LIB_JPEGLI_COMMON_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct jpeg_error_mgr* jpegli_std_error(struct jpeg_error_mgr* err);
+
+void jpegli_abort(j_common_ptr cinfo);  // frees all non-permanent pools
+
+void jpegli_destroy(j_common_ptr cinfo);  // runs the memory manager teardown
+
+JQUANT_TBL* jpegli_alloc_quant_table(j_common_ptr cinfo);
+
+JHUFF_TBL* jpegli_alloc_huff_table(j_common_ptr cinfo);
+
+//
+// New API structs and functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+typedef enum {
+  JPEGLI_TYPE_FLOAT = 0,   // 4 bytes per sample
+  JPEGLI_TYPE_UINT8 = 2,   // 1 byte per sample
+  JPEGLI_TYPE_UINT16 = 3,  // 2 bytes per sample
+} JpegliDataType;
+
+typedef enum {
+  JPEGLI_NATIVE_ENDIAN = 0,
+  JPEGLI_LITTLE_ENDIAN = 1,
+  JPEGLI_BIG_ENDIAN = 2,
+} JpegliEndianness;
+
+int jpegli_bytes_per_sample(JpegliDataType data_type);  // 0 if unknown type
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/common_internal.h b/third_party/jpeg-xl/lib/jpegli/common_internal.h
new file mode 100644
index 0000000000..248d3154e1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/common_internal.h
@@ -0,0 +1,150 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COMMON_INTERNAL_H_
+#define LIB_JPEGLI_COMMON_INTERNAL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <hwy/aligned_allocator.h>
+
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/simd.h"
+#include "lib/jxl/base/compiler_specific.h"  // for ssize_t
+#include "lib/jxl/base/status.h"             // for JXL_CHECK
+
+namespace jpegli {
+
+enum State {  // values stored in cinfo->global_state
+  kDecNull,   // set by jpegli_destroy() on a decompressor
+  kDecStart,  // set by jpegli_abort() on a decompressor
+  kDecInHeader,
+  kDecHeaderDone,
+  kDecProcessMarkers,
+  kDecProcessScan,
+  kEncNull,   // set by jpegli_destroy() on a compressor
+  kEncStart,  // set by jpegli_abort() on a compressor
+  kEncHeader,
+  kEncReadImage,
+  kEncWriteCoeffs,
+};
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;  // ceil(a / b) for a >= 0 and b > 0, returned as T1
+}
+
+template <typename T1, typename T2>
+constexpr inline T1 RoundUpTo(T1 a, T2 b) {
+  return DivCeil(a, b) * b;  // smallest multiple of b that is >= a
+}
+
+constexpr size_t kDCTBlockSize = 64;  // 8 x 8 coefficients per block
+// This is set to the same value as MAX_COMPS_IN_SCAN, because that is the
+// maximum number of channels the libjpeg-turbo decoder can decode.
+constexpr int kMaxComponents = 4;
+constexpr int kMaxQuantTables = 4;
+constexpr int kJpegPrecision = 8;
+constexpr int kMaxHuffmanTables = 4;
+constexpr size_t kJpegHuffmanMaxBitLength = 16;
+constexpr int kJpegHuffmanAlphabetSize = 256;
+constexpr int kJpegDCAlphabetSize = 12;
+constexpr int kMaxDHTMarkers = 512;
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kApp1 = 0xE1;
+constexpr uint8_t kApp2 = 0xE2;
+const uint8_t kIccProfileTag[12] = "ICC_PROFILE";  // 11 chars + NUL
+const uint8_t kExifTag[6] = "Exif\0";  // "Exif" followed by two NUL bytes
+const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/";  // 28 chars + NUL
+
+/* clang-format off */
+constexpr uint32_t kJPEGNaturalOrder[80] = {  // zigzag index -> raster index
+  0,   1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  // extra entries for safety in decoder
+  63, 63, 63, 63, 63, 63, 63, 63,
+  63, 63, 63, 63, 63, 63, 63, 63
+};
+
+constexpr uint32_t kJPEGZigZagOrder[64] = {  // raster index -> zigzag index
+  0,   1,  5,  6, 14, 15, 27, 28,
+  2,   4,  7, 13, 16, 26, 29, 42,
+  3,   8, 12, 17, 25, 30, 41, 43,
+  9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54,
+  20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61,
+  35, 36, 48, 49, 57, 58, 62, 63
+};
+/* clang-format on */
+
+// Rows of T samples with a shared stride; Row() indices wrap modulo ysize().
+template <typename T>
+class RowBuffer {
+ public:
+  // Allocates num_rows rows of rowsize samples from the JPOOL_IMAGE_ALIGNED
+  // pool. Every row gets `alignment` bytes of slack in front of it and at
+  // least one vector of slack behind it.
+  template <typename CInfoType>
+  void Allocate(CInfoType cinfo, size_t num_rows, size_t rowsize) {
+    size_t vec_size = std::max(VectorSize(), sizeof(T));
+    JXL_CHECK(vec_size % sizeof(T) == 0);
+    size_t alignment = std::max<size_t>(HWY_ALIGNMENT, vec_size);
+    size_t min_memstride = alignment + rowsize * sizeof(T) + vec_size;
+    size_t memstride = RoundUpTo(min_memstride, alignment);
+    xsize_ = rowsize;
+    ysize_ = num_rows;
+    stride_ = memstride / sizeof(T);
+    offset_ = alignment / sizeof(T);
+    data_ = ::jpegli::Allocate<T>(cinfo, ysize_ * stride_, JPOOL_IMAGE_ALIGNED);
+  }
+
+  // Pointer to sample 0 of row y modulo ysize(); requires y >= -ysize().
+  T* Row(ssize_t y) const {
+    return &data_[((ysize_ + y) % ysize_) * stride_ + offset_];
+  }
+
+  size_t xsize() const { return xsize_; }
+  size_t ysize() const { return ysize_; }
+  size_t stride() const { return stride_; }
+
+  // Copies sample 0 into the `border` samples before the row, and sample
+  // from - 1 into samples [from, xsize() + border).
+  void PadRow(size_t y, size_t from, int border) {
+    T* row = Row(y);  // T rather than float, so every instantiation compiles
+    for (int offset = -border; offset < 0; ++offset) row[offset] = row[0];
+    const T last_val = row[from - 1];
+    for (size_t x = from; x < xsize_ + border; ++x) row[x] = last_val;
+  }
+
+  // Copies row src_row to dst_row, including `border` samples on each side.
+  void CopyRow(ssize_t dst_row, ssize_t src_row, int border) {
+    memcpy(Row(dst_row) - border, Row(src_row) - border,
+           (xsize_ + 2 * border) * sizeof(T));
+  }
+
+  // Sets the first len samples of row y to val.
+  void FillRow(ssize_t y, T val, size_t len) { std::fill_n(Row(y), len, val); }
+
+ private:
+  size_t xsize_;
+  size_t ysize_;
+  size_t stride_;
+  size_t offset_;
+  T* data_;
+};
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COMMON_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/dct-inl.h b/third_party/jpeg-xl/lib/jpegli/dct-inl.h
new file mode 100644
index 0000000000..0524e220d6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/dct-inl.h
@@ -0,0 +1,266 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_DCT_INL_H_
+#undef LIB_JPEGLI_DCT_INL_H_
+#else
+#define LIB_JPEGLI_DCT_INL_H_
+#endif
+
+#include "lib/jpegli/transpose-inl.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::DemoteTo;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Round;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+
+template <size_t N>
+void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
+                float* JXL_RESTRICT aout) {
+  // aout row i = ain1 row i + ain2 row (N - 1 - i); rows are 8 floats apart.
+  HWY_CAPPED(float, 8) d8;
+  for (size_t row = 0; row < N; row++) {
+    const auto mirrored = Load(d8, ain2 + (N - row - 1) * 8);
+    Store(Add(Load(d8, ain1 + row * 8), mirrored), d8, aout + row * 8);
+  }
+}
+
+template <size_t N>
+void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
+                float* JXL_RESTRICT aout) {
+  // aout row i = ain1 row i - ain2 row (N - 1 - i); rows are 8 floats apart.
+  HWY_CAPPED(float, 8) d8;
+  for (size_t row = 0; row < N; row++) {
+    const auto mirrored = Load(d8, ain2 + (N - row - 1) * 8);
+    Store(Sub(Load(d8, ain1 + row * 8), mirrored), d8, aout + row * 8);
+  }
+}
+
+template <size_t N>
+void B(float* JXL_RESTRICT coeff) {
+  // In place, on rows of 8 floats: row 0 becomes row0 * sqrt(2) + row1, and
+  // rows 1..N-2 each get the following row added; row N-1 is left as is.
+  HWY_CAPPED(float, 8) d8;
+  const auto sqrt2 = Set(d8, 1.41421356237f);
+  const auto second_row = Load(d8, coeff + 8);
+  Store(MulAdd(Load(d8, coeff), sqrt2, second_row), d8, coeff);
+  for (size_t i = 1; i + 1 < N; i++) {
+    float* cur = coeff + i * 8;
+    const auto next_row = Load(d8, cur + 8);
+    Store(Add(Load(d8, cur), next_row), d8, cur);
+  }
+}
+
+// Ideally optimized away by compiler (except the multiply).
+template <size_t N>
+void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) {
+  // Even output rows take the first N/2 input rows in order; odd output rows
+  // take the remaining input rows in order.
+  HWY_CAPPED(float, 8) d8;
+  for (size_t src = 0; src < N / 2; src++) {
+    Store(Load(d8, ain + src * 8), d8, aout + 2 * src * 8);
+  }
+  for (size_t src = N / 2; src < N; src++) {
+    Store(Load(d8, ain + src * 8), d8, aout + (2 * (src - N / 2) + 1) * 8);
+  }
+}
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;  // only the N = 4 and N = 8 specializations exist
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,   // i = 0
+      1.3065629648763764,  // i = 1
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,  // i = 0
+      0.6013448869350453,  // i = 1
+      0.8999762231364156,  // i = 2
+      2.5629154477415055,  // i = 3
+  };
+};
+
+constexpr float WcMultipliers<4>::kMultipliers[];  // needed before C++17
+constexpr float WcMultipliers<8>::kMultipliers[];  // needed before C++17
+
+// Invoked on full vector.
+template <size_t N>
+void Multiply(float* JXL_RESTRICT coeff) {
+  // Scales rows N/2..N-1 (8 floats each) by WcMultipliers<N>::kMultipliers.
+  HWY_CAPPED(float, 8) d8;
+  for (size_t row = N / 2; row < N; row++) {
+    const auto scale = Set(d8, WcMultipliers<N>::kMultipliers[row - N / 2]);
+    Store(Mul(Load(d8, coeff + row * 8), scale), d8, coeff + row * 8);
+  }
+}
+
+void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                   size_t off, float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < 8; i++) {  // one vector per pixel row, at column off
+    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
+  }
+}
+
+void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
+                          size_t off) {
+  HWY_CAPPED(float, 8) d8;
+  auto mul = Set(d8, 1.0f / 8);  // every stored value is scaled by 1/8
+  for (size_t i = 0; i < 8; i++) {  // coeff row i -> output row i, column off
+    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
+  }
+}
+
+template <size_t N>
+struct DCT1DImpl;  // in-place transform of N rows of 8 floats each
+
+template <>
+struct DCT1DImpl<1> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}  // nothing to do
+};
+
+template <>
+struct DCT1DImpl<2> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+    HWY_CAPPED(float, 8) d8;
+    auto in1 = Load(d8, mem);
+    auto in2 = Load(d8, mem + 8);
+    Store(Add(in1, in2), d8, mem);      // row 0 = sum
+    Store(Sub(in1, in2), d8, mem + 8);  // row 1 = difference
+  }
+};
+
+template <size_t N>
+struct DCT1DImpl {
+  void operator()(float* JXL_RESTRICT mem) {
+    HWY_ALIGN float tmp[N * 8];  // N rows of 8 floats
+    AddReverse<N / 2>(mem, mem + N * 4, tmp);  // sums go to the first half
+    DCT1DImpl<N / 2>()(tmp);                   // recurse on the sums
+    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);  // differences
+    Multiply<N>(tmp);                 // scale the second half of tmp
+    DCT1DImpl<N / 2>()(tmp + N * 4);  // recurse on the scaled differences
+    B<N / 2>(tmp + N * 4);
+    InverseEvenOdd<N>(tmp, mem);  // interleave both halves back into mem
+  }
+};
+
+void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+           float* JXL_RESTRICT output) {
+  HWY_CAPPED(float, 8) d8;
+  HWY_ALIGN float tmp[64];  // 8 rows of 8 floats
+  for (size_t i = 0; i < 8; i += Lanes(d8)) {  // Lanes(d8) columns per pass
+    // TODO(veluca): consider removing the temporary memory here (as is done in
+    // IDCT), if it turns out that some compilers don't optimize away the loads
+    // and this is performance-critical.
+    LoadFromBlock(pixels, pixels_stride, i, tmp);
+    DCT1DImpl<8>()(tmp);
+    StoreToBlockAndScale(tmp, output, i);  // output rows are 8 floats apart
+  }
+}
+
+void TransformFromPixels(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                         float* JXL_RESTRICT coefficients,
+                         float* JXL_RESTRICT scratch_space) {
+  DCT1D(pixels, pixels_stride, scratch_space);     // first 1-D pass
+  Transpose8x8Block(scratch_space, coefficients);  // swap rows and columns
+  DCT1D(coefficients, 8, scratch_space);           // second 1-D pass
+  Transpose8x8Block(scratch_space, coefficients);  // result in coefficients
+}
+
+void StoreQuantizedValue(const Vec<DI>& ival, int16_t* out) {
+  Rebind<int16_t, DI> di16;
+  Store(DemoteTo(di16, ival), di16, out);  // narrow int32 lanes to int16
+}
+
+void StoreQuantizedValue(const Vec<DI>& ival, int32_t* out) {
+  DI di;
+  Store(ival, di, out);  // lanes are already int32; store them unchanged
+}
+
+template <typename T>
+void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
+                   const float* zero_bias_offset, const float* zero_bias_mul,
+                   T* block) {
+  // Quantizes dct[k] * qmc[k] to the nearest integer, but stores 0 when its
+  // magnitude is below zero_bias_offset[k] + zero_bias_mul[k] * aq_strength.
+  const D df;
+  const DI di32;
+  const auto strength = Set(df, aq_strength);
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(df)) {
+    const auto scaled = Mul(Load(df, dct + k), Load(df, qmc + k));
+    const auto bias_off = Load(df, zero_bias_offset + k);
+    const auto bias_mul = Load(df, zero_bias_mul + k);
+    const auto min_abs = Add(bias_off, Mul(bias_mul, strength));
+    const auto keep = Ge(Abs(scaled), min_abs);
+    const auto rounded = IfThenElseZero(keep, Round(scaled));
+    StoreQuantizedValue(ConvertTo(di32, rounded), block + k);
+  }
+}
+
+template <typename T>
+void QuantizeBlockNoAQ(const float* dct, const float* qmc, T* block) {
+  // block[k] = dct[k] * qmc[k] rounded to an integer, for all DCTSIZE2 values.
+  const D df;
+  const DI di32;
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(df)) {
+    const auto scaled = Mul(Load(df, dct + k), Load(df, qmc + k));
+    const auto rounded = Round(scaled);
+    StoreQuantizedValue(ConvertTo(di32, rounded), block + k);
+  }
+}
+
+template <typename T>
+void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
+                             const float* JXL_RESTRICT qmc, float aq_strength,
+                             const float* zero_bias_offset,
+                             const float* zero_bias_mul,
+                             float* JXL_RESTRICT tmp, T* block) {
+  // tmp holds the DCT output in its first DCTSIZE2 floats; the rest is scratch.
+  float* JXL_RESTRICT coeffs = tmp;
+  TransformFromPixels(pixels, stride, coeffs, tmp + DCTSIZE2);
+  if (aq_strength > 0.0f) {
+    QuantizeBlock(coeffs, qmc, aq_strength, zero_bias_offset, zero_bias_mul,
+                  block);
+  } else {
+    QuantizeBlockNoAQ(coeffs, qmc, block);
+  }
+  // Center DC values around zero.
+  static constexpr float kDCBias = 128.0f;
+  block[0] = std::round((coeffs[0] - kDCBias) * qmc[0]);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_DCT_INL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/dct.cc b/third_party/jpeg-xl/lib/jpegli/dct.cc
new file mode 100644
index 0000000000..4320abe4c6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/dct.cc
@@ -0,0 +1,75 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/dct.h"
+
+#include <cmath>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/dct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/dct-inl.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/memory_manager.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+void ComputeDCTCoefficients(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  float* tmp = m->dct_buffer;  // scratch passed to ComputeCoefficientBlock
+  for (int c = 0; c < cinfo->num_components; c++) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    int by0 = m->next_iMCU_row * comp->v_samp_factor;  // first block row
+    int block_rows_left = comp->height_in_blocks - by0;
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[c], by0,
+        max_block_rows, true);
+    float* qmc = m->quant_mul[c];
+    RowBuffer<float>* plane = m->raw_data[c];
+    const int h_factor = m->h_factor[c];
+    const int v_factor = m->v_factor[c];
+    const float* zero_bias_offset = m->zero_bias_offset[c];
+    const float* zero_bias_mul = m->zero_bias_mul[c];
+    float aq_strength = 0.0f;  // stays 0 unless adaptive quantization is on
+    for (int iy = 0; iy < comp->v_samp_factor; iy++) {
+      size_t by = by0 + iy;
+      if (by >= comp->height_in_blocks) continue;  // past the last block row
+      JBLOCKROW brow = ba[iy];
+      const float* row = plane->Row(8 * by);  // top pixel row of block row by
+      for (size_t bx = 0; bx < comp->width_in_blocks; bx++) {
+        JCOEF* block = &brow[bx][0];
+        if (m->use_adaptive_quantization) {
+          aq_strength = m->quant_field.Row(by * v_factor)[bx * h_factor];
+        }
+        ComputeCoefficientBlock(row + 8 * bx, plane->stride(), qmc, aq_strength,
+                                zero_bias_offset, zero_bias_mul, tmp, block);
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(ComputeDCTCoefficients);  // per-target function table
+
+void ComputeDCTCoefficients(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)(cinfo);  // run chosen target
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/dct.h b/third_party/jpeg-xl/lib/jpegli/dct.h
new file mode 100644
index 0000000000..9ae5f9f7c2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/dct.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DCT_H_
+#define LIB_JPEGLI_DCT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ComputeDCTCoefficients(j_compress_ptr cinfo);  // DCT + quantization
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DCT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode.cc b/third_party/jpeg-xl/lib/jpegli/decode.cc
new file mode 100644
index 0000000000..cf87673705
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode.cc
@@ -0,0 +1,981 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+
+#include <string.h>
+
+#include <vector>
+
+#include "lib/jpegli/color_quantize.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/decode_marker.h"
+#include "lib/jpegli/decode_scan.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/render.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+
+// Resets all per-image state of the decompressor, both the public libjpeg
+// fields in `cinfo` and the private fields in `cinfo->master`, so that a new
+// JPEG stream can be parsed from scratch.
+void InitializeImage(j_decompress_ptr cinfo) {
+  // Public fields: marker-derived metadata gets its default values.
+  cinfo->restart_interval = 0;
+  cinfo->saw_JFIF_marker = FALSE;
+  cinfo->JFIF_major_version = 1;
+  cinfo->JFIF_minor_version = 1;
+  cinfo->density_unit = 0;
+  cinfo->X_density = 1;
+  cinfo->Y_density = 1;
+  cinfo->saw_Adobe_marker = FALSE;
+  cinfo->Adobe_transform = 0;
+  cinfo->CCIR601_sampling = FALSE;  // not used
+  cinfo->marker_list = nullptr;
+  cinfo->comp_info = nullptr;
+  // Public fields: input/output progress counters.
+  cinfo->input_scan_number = 0;
+  cinfo->input_iMCU_row = 0;
+  cinfo->output_scan_number = 0;
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  cinfo->unread_marker = 0;
+  cinfo->coef_bits = nullptr;
+  // We set all these to zero since we don't yet support arithmetic coding.
+  memset(cinfo->arith_dc_L, 0, sizeof(cinfo->arith_dc_L));
+  memset(cinfo->arith_dc_U, 0, sizeof(cinfo->arith_dc_U));
+  memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
+  // Initialize the private fields.
+  jpeg_decomp_master* m = cinfo->master;
+  m->input_buffer_.clear();
+  m->input_buffer_pos_ = 0;
+  m->codestream_bits_ahead_ = 0;
+  m->is_multiscan_ = false;
+  m->found_soi_ = false;
+  m->found_dri_ = false;
+  m->found_sof_ = false;
+  m->found_eoi_ = false;
+  m->icc_index_ = 0;
+  m->icc_total_ = 0;
+  m->icc_profile_.clear();
+  // Zero the whole lookup tables first; the loop below then marks every
+  // entry as invalid.
+  memset(m->dc_huff_lut_, 0, sizeof(m->dc_huff_lut_));
+  memset(m->ac_huff_lut_, 0, sizeof(m->ac_huff_lut_));
+  // Initialize the values to an invalid symbol so that we can recognize it
+  // when reading the bit stream using a Huffman code with space > 0.
+  for (size_t i = 0; i < kAllHuffLutSize; ++i) {
+    m->dc_huff_lut_[i].bits = 0;
+    m->dc_huff_lut_[i].value = 0xffff;
+    m->ac_huff_lut_[i].bits = 0;
+    m->ac_huff_lut_[i].value = 0xffff;
+  }
+  // Output-side buffers and color quantization state; the pointers are only
+  // cleared here, nothing is freed.
+  m->colormap_lut_ = nullptr;
+  m->pixels_ = nullptr;
+  m->scanlines_ = nullptr;
+  m->regenerate_inverse_colormap_ = true;
+  for (int i = 0; i < kMaxComponents; ++i) {
+    m->dither_[i] = nullptr;
+    m->error_row_[i] = nullptr;
+  }
+  m->output_passes_done_ = 0;
+  m->xoffset_ = 0;
+  m->dequant_ = nullptr;
+}
+
+// Resets the user-settable decompression parameters in `cinfo` to their
+// default values (no scaling, no color quantization, unknown color spaces).
+void InitializeDecompressParams(j_decompress_ptr cinfo) {
+  cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->out_color_space = JCS_UNKNOWN;
+  // Default is 1/1 scaling, i.e. output at full size.
+  cinfo->scale_num = 1;
+  cinfo->scale_denom = 1;
+  cinfo->output_gamma = 0.0f;
+  cinfo->buffered_image = FALSE;
+  cinfo->raw_data_out = FALSE;
+  cinfo->dct_method = JDCT_DEFAULT;
+  cinfo->do_fancy_upsampling = TRUE;
+  cinfo->do_block_smoothing = TRUE;
+  // Color quantization defaults: disabled, but pre-configured for two-pass
+  // quantization with Floyd-Steinberg dithering and 256 colors.
+  cinfo->quantize_colors = FALSE;
+  cinfo->dither_mode = JDITHER_FS;
+  cinfo->two_pass_quantize = TRUE;
+  cinfo->desired_number_of_colors = 256;
+  cinfo->enable_1pass_quant = FALSE;
+  cinfo->enable_external_quant = FALSE;
+  cinfo->enable_2pass_quant = FALSE;
+  cinfo->actual_number_of_colors = 0;
+  cinfo->colormap = nullptr;
+}
+
+// Sets up the counters of the application's progress monitor (if one is
+// installed) at the start of decompression. `coef_only` selects the
+// coefficient-reading mode, which always consists of a single pass.
+void InitProgressMonitor(j_decompress_ptr cinfo, bool coef_only) {
+  auto* progress = cinfo->progress;
+  if (progress == nullptr) return;
+  const jpeg_decomp_master* master = cinfo->master;
+  // Rough guess of how many scans the input contains.
+  int scan_estimate = 1;
+  if (cinfo->progressive_mode) {
+    scan_estimate = 2 + 3 * cinfo->num_components;
+  } else if (master->is_multiscan_) {
+    scan_estimate = cinfo->num_components;
+  }
+  progress->pass_limit = cinfo->total_iMCU_rows * scan_estimate;
+  progress->pass_counter = 0;
+  int passes = 1;
+  if (!coef_only) {
+    // A multi-scan input that is not decoded in buffered-image mode needs a
+    // separate pass for consuming the whole input.
+    const int input_passes =
+        (!cinfo->buffered_image && master->is_multiscan_) ? 1 : 0;
+    const bool two_pass_quant = cinfo->quantize_colors && !cinfo->colormap &&
+                                cinfo->two_pass_quantize &&
+                                cinfo->enable_2pass_quant;
+    passes = input_passes + (two_pass_quant ? 2 : 1);
+  }
+  progress->total_passes = passes;
+  progress->completed_passes = 0;
+}
+
+// Updates the pass counters of the progress monitor (if any) when a new
+// output pass is started in buffered-image mode.
+void InitProgressMonitorForOutput(j_decompress_ptr cinfo) {
+  auto* progress = cinfo->progress;
+  if (progress == nullptr) return;
+  const jpeg_decomp_master* master = cinfo->master;
+  // Two-pass quantization doubles the number of passes per output.
+  const int per_output = cinfo->enable_2pass_quant ? 2 : 1;
+  // While the end of input was not reached, expect one more output pass after
+  // the current one.
+  const int outputs_left =
+      (cinfo->buffered_image && !master->found_eoi_) ? 2 : 1;
+  progress->completed_passes = master->output_passes_done_;
+  progress->total_passes =
+      master->output_passes_done_ + per_output * outputs_left;
+}
+
+// Reports input progress to the progress monitor (if any): the counter is the
+// number of iMCU rows consumed so far over all scans, and the limit is raised
+// when the counter would exceed the current estimate.
+void ProgressMonitorInputPass(j_decompress_ptr cinfo) {
+  auto* progress = cinfo->progress;
+  if (progress == nullptr) return;
+  progress->pass_counter =
+      ((cinfo->input_scan_number - 1) * cinfo->total_iMCU_rows +
+       cinfo->input_iMCU_row);
+  if (progress->pass_limit < progress->pass_counter) {
+    progress->pass_limit = cinfo->input_scan_number * cinfo->total_iMCU_rows;
+  }
+  (*progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+// Reports output progress to the progress monitor (if any), measured in
+// output scanlines of the current pass.
+void ProgressMonitorOutputPass(j_decompress_ptr cinfo) {
+  auto* progress = cinfo->progress;
+  if (progress == nullptr) return;
+  const jpeg_decomp_master* master = cinfo->master;
+  // The input pass only counts as completed when the whole multi-scan input
+  // had to be consumed before output could start.
+  const int finished_input_passes =
+      (!cinfo->buffered_image && master->is_multiscan_) ? 1 : 0;
+  progress->pass_counter = cinfo->output_scanline;
+  progress->pass_limit = cinfo->output_height;
+  progress->completed_passes =
+      finished_input_passes + master->output_passes_done_;
+  (*progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+// Validates the Huffman table `table` and builds the derived lookup table
+// `huff_lut` from it.
+//
+// Raises a JPEGLI_ERROR when the table has more code values than the alphabet
+// size, when a code value occurs twice, or when the code lengths oversubscribe
+// the code space.
+void BuildHuffmanLookupTable(j_decompress_ptr cinfo, JHUFF_TBL* table,
+                             HuffmanTableEntry* huff_lut) {
+  uint32_t counts[kJpegHuffmanMaxBitLength + 1] = {};
+  int total_count = 0;
+  // Number of still unassigned codes of maximal length.
+  int space = 1 << kJpegHuffmanMaxBitLength;
+  int max_depth = 1;
+  for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+    int count = table->bits[i];
+    if (count != 0) {
+      max_depth = i;
+    }
+    counts[i] = count;
+    total_count += count;
+    space -= count * (1 << (kJpegHuffmanMaxBitLength - i));
+  }
+  // The per-length counts can add up to more symbols than the alphabet holds;
+  // reject such tables before indexing huffval[] and values[] with them.
+  if (total_count > kJpegHuffmanAlphabetSize) {
+    return JPEGLI_ERROR("Too many Huffman code values: %d", total_count);
+  }
+  // One extra slot for the invalid symbol added below.
+  uint32_t values[kJpegHuffmanAlphabetSize + 1] = {};
+  uint8_t values_seen[256] = {0};
+  for (int i = 0; i < total_count; ++i) {
+    int value = table->huffval[i];
+    if (values_seen[value]) {
+      return JPEGLI_ERROR("Duplicate Huffman code value %d", value);
+    }
+    values_seen[value] = 1;
+    values[i] = value;
+  }
+  // Add an invalid symbol that will have the all 1 code.
+  ++counts[max_depth];
+  values[total_count] = kJpegHuffmanAlphabetSize;
+  space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
+  if (space < 0) {
+    JPEGLI_ERROR("Invalid Huffman code lengths.");
+  } else if (space > 0 && huff_lut[0].value != 0xffff) {
+    // Re-initialize the values to an invalid symbol so that we can recognize
+    // it when reading the bit stream using a Huffman code with space > 0.
+    for (int i = 0; i < kJpegHuffmanLutSize; ++i) {
+      huff_lut[i].bits = 0;
+      huff_lut[i].value = 0xffff;
+    }
+  }
+  BuildJpegHuffmanTable(&counts[0], &values[0], huff_lut);
+}
+
+// Prepares the decoder state for decoding the scan whose parameters (Ss, Se,
+// Al, cur_comp_info, ...) are currently stored in `cinfo`:
+//   * updates the coefficient precision bookkeeping in cinfo->coef_bits,
+//   * builds the Huffman lookup tables needed by the scan,
+//   * snapshots the quantization tables into the component infos,
+//   * computes the MCU layout of the scan and resets the entropy decoder.
+// On success the decoder is left in the kDecProcessScan state.
+void PrepareForScan(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int comp_idx = cinfo->cur_comp_info[i]->component_index;
+    // The second half of coef_bits (rows num_components..) receives the values
+    // from before this scan.
+    int* prev_coef_bits = cinfo->coef_bits[comp_idx + cinfo->num_components];
+    for (int k = std::min(cinfo->Ss, 1); k <= std::max(cinfo->Se, 9); k++) {
+      prev_coef_bits[k] =
+          (cinfo->input_scan_number > 0) ? cinfo->coef_bits[comp_idx][k] : 0;
+    }
+    for (int k = cinfo->Ss; k <= cinfo->Se; ++k) {
+      cinfo->coef_bits[comp_idx][k] = cinfo->Al;
+    }
+  }
+  AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                           /*is_dc=*/false);
+  AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                           /*is_dc=*/true);
+  // Check that all the Huffman tables needed for this scan are defined and
+  // build derived lookup tables.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    if (cinfo->Ss == 0) {
+      int dc_tbl_idx = cinfo->cur_comp_info[i]->dc_tbl_no;
+      JHUFF_TBL* table = cinfo->dc_huff_tbl_ptrs[dc_tbl_idx];
+      HuffmanTableEntry* huff_lut =
+          &m->dc_huff_lut_[dc_tbl_idx * kJpegHuffmanLutSize];
+      if (!table) {
+        return JPEGLI_ERROR("DC Huffman table %d not found", dc_tbl_idx);
+      }
+      BuildHuffmanLookupTable(cinfo, table, huff_lut);
+    }
+    if (cinfo->Se > 0) {
+      int ac_tbl_idx = cinfo->cur_comp_info[i]->ac_tbl_no;
+      JHUFF_TBL* table = cinfo->ac_huff_tbl_ptrs[ac_tbl_idx];
+      HuffmanTableEntry* huff_lut =
+          &m->ac_huff_lut_[ac_tbl_idx * kJpegHuffmanLutSize];
+      if (!table) {
+        return JPEGLI_ERROR("AC Huffman table %d not found", ac_tbl_idx);
+      }
+      BuildHuffmanLookupTable(cinfo, table, huff_lut);
+    }
+  }
+  // Copy quantization tables into comp_info.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    if (comp->quant_table == nullptr) {
+      // The referenced table may never have been defined in the stream; fail
+      // with a proper error instead of copying from a null pointer.
+      JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[comp->quant_tbl_no];
+      if (quant_table == nullptr) {
+        return JPEGLI_ERROR("Quantization table with index %d not found",
+                            comp->quant_tbl_no);
+      }
+      comp->quant_table = Allocate<JQUANT_TBL>(cinfo, 1, JPOOL_IMAGE);
+      memcpy(comp->quant_table, quant_table, sizeof(JQUANT_TBL));
+    }
+  }
+  if (cinfo->comps_in_scan == 1) {
+    // Non-interleaved scan: an MCU is a single block of the component.
+    const auto& comp = *cinfo->cur_comp_info[0];
+    cinfo->MCUs_per_row = DivCeil(cinfo->image_width * comp.h_samp_factor,
+                                  cinfo->max_h_samp_factor * DCTSIZE);
+    cinfo->MCU_rows_in_scan = DivCeil(cinfo->image_height * comp.v_samp_factor,
+                                      cinfo->max_v_samp_factor * DCTSIZE);
+    m->mcu_rows_per_iMCU_row_ = cinfo->cur_comp_info[0]->v_samp_factor;
+  } else {
+    // Interleaved scan: MCUs coincide with iMCUs.
+    cinfo->MCU_rows_in_scan = cinfo->total_iMCU_rows;
+    cinfo->MCUs_per_row = m->iMCU_cols_;
+    m->mcu_rows_per_iMCU_row_ = 1;
+    size_t mcu_size = 0;
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      jpeg_component_info* comp = cinfo->cur_comp_info[i];
+      mcu_size += comp->h_samp_factor * comp->v_samp_factor;
+    }
+    if (mcu_size > D_MAX_BLOCKS_IN_MCU) {
+      JPEGLI_ERROR("MCU size too big");
+    }
+  }
+  // Reset the entropy decoder state for the new scan.
+  memset(m->last_dc_coeff_, 0, sizeof(m->last_dc_coeff_));
+  m->restarts_to_go_ = cinfo->restart_interval;
+  m->next_restart_marker_ = 0;
+  m->eobrun_ = -1;
+  m->scan_mcu_row_ = 0;
+  m->scan_mcu_col_ = 0;
+  m->codestream_bits_ahead_ = 0;
+  ++cinfo->input_scan_number;
+  cinfo->input_iMCU_row = 0;
+  PrepareForiMCURow(cinfo);
+  cinfo->global_state = kDecProcessScan;
+}
+
+// Consumes input until a reportable event happens and returns its status code
+// (e.g. JPEG_SUSPENDED, JPEG_REACHED_SOS, JPEG_SCAN_COMPLETED, or whatever
+// else ProcessScan() / ProcessMarkers() return).
+//
+// Input is normally parsed directly from the source manager's buffer. When
+// that buffer ends in the middle of a parsing unit, the unconsumed bytes are
+// accumulated in m->input_buffer_ together with the refilled source buffers,
+// and parsing continues from there until it can switch back to the source
+// buffer.
+int ConsumeInput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state == kDecProcessScan && m->streaming_mode_ &&
+      cinfo->input_iMCU_row > cinfo->output_iMCU_row) {
+    // Prevent input from getting ahead of output in streaming mode.
+    return JPEG_SUSPENDED;
+  }
+  jpeg_source_mgr* src = cinfo->src;
+  int status;
+  for (;;) {
+    // Select where to parse from: the source buffer or our internal buffer.
+    const uint8_t* data;
+    size_t len;
+    if (m->input_buffer_.empty()) {
+      data = cinfo->src->next_input_byte;
+      len = cinfo->src->bytes_in_buffer;
+    } else {
+      data = &m->input_buffer_[m->input_buffer_pos_];
+      len = m->input_buffer_.size() - m->input_buffer_pos_;
+    }
+    // `pos` receives the number of bytes consumed from `data`.
+    size_t pos = 0;
+    if (cinfo->global_state == kDecProcessScan) {
+      status = ProcessScan(cinfo, data, len, &pos, &m->codestream_bits_ahead_);
+    } else {
+      status = ProcessMarkers(cinfo, data, len, &pos);
+    }
+    if (m->input_buffer_.empty()) {
+      cinfo->src->next_input_byte += pos;
+      cinfo->src->bytes_in_buffer -= pos;
+    } else {
+      m->input_buffer_pos_ += pos;
+      size_t bytes_left = m->input_buffer_.size() - m->input_buffer_pos_;
+      // The tail of the internal buffer is a copy of the current source
+      // buffer, so once all remaining bytes fall into that tail we can drop
+      // the internal buffer and continue from the source buffer.
+      if (bytes_left <= src->bytes_in_buffer) {
+        src->next_input_byte += (src->bytes_in_buffer - bytes_left);
+        src->bytes_in_buffer = bytes_left;
+        m->input_buffer_.clear();
+        m->input_buffer_pos_ = 0;
+      }
+    }
+    if (status == kHandleRestart) {
+      JXL_DASSERT(m->input_buffer_.size() <=
+                  m->input_buffer_pos_ + src->bytes_in_buffer);
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      if (cinfo->unread_marker == 0xd0 + m->next_restart_marker_) {
+        // The expected restart marker was found.
+        cinfo->unread_marker = 0;
+      } else {
+        // Let the source manager try to recover from the unexpected marker.
+        if (!(*cinfo->src->resync_to_restart)(cinfo, m->next_restart_marker_)) {
+          return JPEG_SUSPENDED;
+        }
+      }
+      // Restart markers cycle through RST0..RST7.
+      m->next_restart_marker_ += 1;
+      m->next_restart_marker_ &= 0x7;
+      m->restarts_to_go_ = cinfo->restart_interval;
+      if (cinfo->unread_marker != 0) {
+        JPEGLI_WARN("Failed to resync to next restart marker, skipping scan.");
+        return JPEG_SCAN_COMPLETED;
+      }
+      continue;
+    }
+    if (status == kHandleMarkerProcessor) {
+      JXL_DASSERT(m->input_buffer_.size() <=
+                  m->input_buffer_pos_ + src->bytes_in_buffer);
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      // Hand the marker over to the application-supplied marker processor.
+      if (!(*GetMarkerProcessor(cinfo))(cinfo)) {
+        return JPEG_SUSPENDED;
+      }
+      cinfo->unread_marker = 0;
+      continue;
+    }
+    if (status != kNeedMoreInput) {
+      break;
+    }
+    // More input is needed: save the unconsumed part of the source buffer
+    // before asking the source manager to refill it.
+    if (m->input_buffer_.empty()) {
+      JXL_DASSERT(m->input_buffer_pos_ == 0);
+      m->input_buffer_.assign(src->next_input_byte,
+                              src->next_input_byte + src->bytes_in_buffer);
+    }
+    if (!(*cinfo->src->fill_input_buffer)(cinfo)) {
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      return JPEG_SUSPENDED;
+    }
+    if (src->bytes_in_buffer == 0) {
+      JPEGLI_ERROR("Empty input.");
+    }
+    m->input_buffer_.insert(m->input_buffer_.end(), src->next_input_byte,
+                            src->next_input_byte + src->bytes_in_buffer);
+  }
+  // Update the decoder state according to the event that ended the loop.
+  if (status == JPEG_SCAN_COMPLETED) {
+    cinfo->global_state = kDecProcessMarkers;
+  } else if (status == JPEG_REACHED_SOS) {
+    if (cinfo->global_state == kDecInHeader) {
+      // The first SOS only ends the header; the scan is prepared later.
+      cinfo->global_state = kDecHeaderDone;
+    } else {
+      PrepareForScan(cinfo);
+    }
+  }
+  return status;
+}
+
+// Returns true if enough input was consumed to produce the next piece of
+// output, i.e. the output stage does not have to wait for more input.
+bool IsInputReady(j_decompress_ptr cinfo) {
+  const jpeg_decomp_master* master = cinfo->master;
+  // Nothing more will arrive after EOI.
+  if (master->found_eoi_) return true;
+  // When input and output are in different scans, input is ready exactly when
+  // it is ahead of the output.
+  if (cinfo->input_scan_number != cinfo->output_scan_number) {
+    return cinfo->input_scan_number > cinfo->output_scan_number;
+  }
+  // Same scan, and all of its rows were consumed.
+  if (cinfo->input_iMCU_row == cinfo->total_iMCU_rows) return true;
+  // Outside of streaming mode the input must be more than two iMCU rows
+  // ahead of the output.
+  const int required_lead = master->streaming_mode_ ? 0 : 2;
+  return cinfo->input_iMCU_row > cinfo->output_iMCU_row + required_lead;
+}
+
+// Renders the whole image into an internal pixel buffer (allocated on first
+// use), consuming input as needed. Returns false if the data source
+// suspended; on success the output position is rewound to the top of the
+// image.
+bool ReadOutputPass(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->pixels_ == nullptr) {
+    const size_t row_stride = cinfo->out_color_components * cinfo->output_width;
+    const size_t total_samples = cinfo->output_height * row_stride;
+    m->pixels_ = Allocate<uint8_t>(cinfo, total_samples, JPOOL_IMAGE);
+    m->scanlines_ =
+        Allocate<JSAMPROW>(cinfo, cinfo->output_height, JPOOL_IMAGE);
+    // Row pointers into the contiguous pixel buffer.
+    for (size_t y = 0; y < cinfo->output_height; ++y) {
+      m->scanlines_[y] = &m->pixels_[y * row_stride];
+    }
+  }
+  size_t rows_done = 0;
+  while (rows_done < cinfo->output_height) {
+    if (!IsInputReady(cinfo)) {
+      if (ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return false;
+      }
+      continue;
+    }
+    ProgressMonitorOutputPass(cinfo);
+    ProcessOutput(cinfo, &rows_done, m->scanlines_, cinfo->output_height);
+  }
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  return true;
+}
+
+// Sets up color quantization for the upcoming output pass. Returns FALSE if
+// the data source suspended while reading the image for two-pass
+// quantization, TRUE otherwise.
+boolean PrepareQuantizedOutput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->raw_data_out) {
+    JPEGLI_ERROR("Color quantization is not supported in raw data mode.");
+  }
+  if (m->output_data_type_ != JPEGLI_TYPE_UINT8) {
+    JPEGLI_ERROR("Color quantization must use 8-bit mode.");
+  }
+  // Select the quantization mode: 3 = externally supplied colormap,
+  // 2 = two-pass quantization, 1 = one-pass quantization.
+  if (cinfo->colormap) {
+    m->quant_mode_ = 3;
+  } else if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) {
+    m->quant_mode_ = 2;
+  } else if (cinfo->enable_1pass_quant) {
+    m->quant_mode_ = 1;
+  } else {
+    JPEGLI_ERROR("Invalid quantization mode change");
+  }
+  // Ordered dithering is only used in one-pass mode; the other modes fall
+  // back to Floyd-Steinberg dithering.
+  if (m->quant_mode_ > 1 && cinfo->dither_mode == JDITHER_ORDERED) {
+    cinfo->dither_mode = JDITHER_FS;
+  }
+  if (m->quant_mode_ == 1) {
+    ChooseColorMap1Pass(cinfo);
+  } else if (m->quant_mode_ == 2) {
+    // First pass: read the whole image to be able to choose the colormap.
+    m->quant_pass_ = 0;
+    if (!ReadOutputPass(cinfo)) {
+      return FALSE;
+    }
+    ChooseColorMap2Pass(cinfo);
+  }
+  if (m->quant_mode_ == 2 ||
+      (m->quant_mode_ == 3 && m->regenerate_inverse_colormap_)) {
+    CreateInverseColorMap(cinfo);
+  }
+  if (cinfo->dither_mode == JDITHER_ORDERED) {
+    CreateOrderedDitherTables(cinfo);
+  } else if (cinfo->dither_mode == JDITHER_FS) {
+    InitFSDitherState(cinfo);
+  }
+  m->quant_pass_ = 1;
+  return TRUE;
+}
+
+// Requests and realizes one virtual block array per component for storing the
+// DCT coefficients. In streaming mode only v_samp_factor block rows are kept
+// per component, otherwise the whole component is buffered.
+void AllocateCoefficientBuffer(j_decompress_ptr cinfo) {
+  j_common_ptr common = reinterpret_cast<j_common_ptr>(cinfo);
+  jpeg_decomp_master* master = cinfo->master;
+  jvirt_barray_ptr* arrays = jpegli::Allocate<jvirt_barray_ptr>(
+      cinfo, cinfo->num_components, JPOOL_IMAGE);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info& comp = cinfo->comp_info[c];
+    const size_t num_block_rows =
+        master->streaming_mode_ ? comp.v_samp_factor : comp.height_in_blocks;
+    arrays[c] = (*cinfo->mem->request_virt_barray)(
+        common, JPOOL_IMAGE, TRUE, comp.width_in_blocks, num_block_rows,
+        comp.v_samp_factor);
+  }
+  master->coef_arrays = arrays;
+  (*cinfo->mem->realize_virt_arrays)(common);
+}
+
+}  // namespace jpegli
+
+// Initializes the decompression object `cinfo`. `structsize` must equal the
+// size of jpeg_decompress_struct that this library was compiled with;
+// `version` is not examined here. Normally called through the
+// jpegli_create_decompress() macro.
+void jpegli_CreateDecompress(j_decompress_ptr cinfo, int version,
+                             size_t structsize) {
+  // Clear the memory manager pointer before anything can fail.
+  cinfo->mem = nullptr;
+  if (structsize != sizeof(*cinfo)) {
+    JPEGLI_ERROR("jpeg_decompress_struct has wrong size.");
+  }
+  jpegli::InitMemoryManager(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->is_decompressor = TRUE;
+  cinfo->progress = nullptr;
+  cinfo->src = nullptr;
+  for (int i = 0; i < NUM_QUANT_TBLS; i++) {
+    cinfo->quant_tbl_ptrs[i] = nullptr;
+  }
+  for (int i = 0; i < NUM_HUFF_TBLS; i++) {
+    cinfo->dc_huff_tbl_ptrs[i] = nullptr;
+    cinfo->ac_huff_tbl_ptrs[i] = nullptr;
+  }
+  cinfo->global_state = jpegli::kDecStart;
+  cinfo->sample_range_limit = nullptr;  // not used
+  cinfo->rec_outbuf_height = 1;         // output works with any buffer height
+  // Private decoder state; NOTE(review): presumably released by
+  // jpegli_destroy() - confirm.
+  cinfo->master = new jpeg_decomp_master;
+  jpeg_decomp_master* m = cinfo->master;
+  // No custom marker processors and no saved markers by default.
+  for (int i = 0; i < 16; ++i) {
+    m->app_marker_parsers[i] = nullptr;
+  }
+  m->com_marker_parser = nullptr;
+  memset(m->markers_to_save_, 0, sizeof(m->markers_to_save_));
+  jpegli::InitializeDecompressParams(cinfo);
+  jpegli::InitializeImage(cinfo);
+}
+
+// Destroys the decompression object; thin wrapper around jpegli_destroy().
+void jpegli_destroy_decompress(j_decompress_ptr cinfo) {
+  jpegli_destroy(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+// Aborts the current decompression; thin wrapper around jpegli_abort().
+void jpegli_abort_decompress(j_decompress_ptr cinfo) {
+  jpegli_abort(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+// Requests that markers with code `marker_code` are saved while parsing.
+// `marker_code` must be at least 0xe0 (APP0) and must fit into the
+// markers_to_save_ table; `length_limit` is currently ignored.
+void jpegli_save_markers(j_decompress_ptr cinfo, int marker_code,
+                         unsigned int length_limit) {
+  // TODO(szabadka) Limit our memory usage by taking into account length_limit.
+  jpeg_decomp_master* m = cinfo->master;
+  const int num_slots = static_cast<int>(sizeof(m->markers_to_save_) /
+                                         sizeof(m->markers_to_save_[0]));
+  // Reject codes on both sides of the table, otherwise the store below would
+  // write out of bounds.
+  if (marker_code < 0xe0 || marker_code - 0xe0 >= num_slots) {
+    return JPEGLI_ERROR("jpegli_save_markers: invalid marker code %d",
+                        marker_code);
+  }
+  m->markers_to_save_[marker_code - 0xe0] = 1;
+}
+
+// Installs `routine` as the processor of the COM marker (0xfe) or of one of
+// the APP0..APP15 markers (0xe0..0xef); any other marker code is an error.
+void jpegli_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                                 jpeg_marker_parser_method routine) {
+  jpeg_decomp_master* m = cinfo->master;
+  const bool is_app_marker = marker_code >= 0xe0 && marker_code <= 0xef;
+  if (is_app_marker) {
+    m->app_marker_parsers[marker_code - 0xe0] = routine;
+    return;
+  }
+  if (marker_code == 0xfe) {
+    m->com_marker_parser = routine;
+    return;
+  }
+  JPEGLI_ERROR("jpegli_set_marker_processor: invalid marker code %d",
+               marker_code);
+}
+
+// Public input-consuming entry point. On the first call it initializes the
+// error manager, the data source and the per-image state, then delegates to
+// jpegli::ConsumeInput() while the decoder is in a parsing state.
+int jpegli_consume_input(j_decompress_ptr cinfo) {
+  if (cinfo->global_state == jpegli::kDecStart) {
+    (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+    // NOTE(review): cinfo->src is dereferenced without a null check here;
+    // jpegli_read_header() checks it before calling this function.
+    (*cinfo->src->init_source)(cinfo);
+    jpegli::InitializeDecompressParams(cinfo);
+    jpegli::InitializeImage(cinfo);
+    cinfo->global_state = jpegli::kDecInHeader;
+  }
+  // The header is complete, but decompression was not started yet: keep
+  // reporting the SOS without consuming more input.
+  if (cinfo->global_state == jpegli::kDecHeaderDone) {
+    return JPEG_REACHED_SOS;
+  }
+  if (cinfo->master->found_eoi_) {
+    return JPEG_REACHED_EOI;
+  }
+  if (cinfo->global_state == jpegli::kDecInHeader ||
+      cinfo->global_state == jpegli::kDecProcessMarkers ||
+      cinfo->global_state == jpegli::kDecProcessScan) {
+    return jpegli::ConsumeInput(cinfo);
+  }
+  JPEGLI_ERROR("Unexpected state %d", cinfo->global_state);
+  return JPEG_REACHED_EOI;  // return value does not matter
+}
+
+// Reads the JPEG stream up to the first SOS marker. Returns JPEG_HEADER_OK
+// when the header was read, JPEG_SUSPENDED when the data source ran dry, and
+// JPEG_HEADER_TABLES_ONLY when EOI was reached without an image and
+// `require_image` is false (an error is raised if it is true).
+int jpegli_read_header(j_decompress_ptr cinfo, boolean require_image) {
+  const bool state_ok = cinfo->global_state == jpegli::kDecStart ||
+                        cinfo->global_state == jpegli::kDecInHeader;
+  if (!state_ok) {
+    JPEGLI_ERROR("jpegli_read_header: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->src == nullptr) {
+    JPEGLI_ERROR("Missing source.");
+  }
+  while (true) {
+    const int status = jpegli_consume_input(cinfo);
+    if (status == JPEG_SUSPENDED) {
+      return status;
+    }
+    if (status == JPEG_REACHED_SOS) {
+      return JPEG_HEADER_OK;
+    }
+    if (status == JPEG_REACHED_EOI) {
+      if (require_image) {
+        JPEGLI_ERROR("jpegli_read_header: unexpected EOI marker.");
+      }
+      // Tables-only stream: reset the object so it can be reused.
+      jpegli_abort_decompress(cinfo);
+      return JPEG_HEADER_TABLES_ONLY;
+    }
+    // Any other status: keep consuming input.
+  }
+}
+
+// Returns a malloc()-ed copy of the ICC profile found in the stream through
+// `icc_data_ptr` / `icc_data_len`. Returns FALSE (and null / zero outputs) if
+// there is no profile. Must be called after the header was read.
+boolean jpegli_read_icc_profile(j_decompress_ptr cinfo, JOCTET** icc_data_ptr,
+                                unsigned int* icc_data_len) {
+  const bool header_pending = cinfo->global_state == jpegli::kDecStart ||
+                              cinfo->global_state == jpegli::kDecInHeader;
+  if (header_pending) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (icc_data_ptr == nullptr || icc_data_len == nullptr) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: invalid output buffer");
+  }
+  const auto& profile = cinfo->master->icc_profile_;
+  if (profile.empty()) {
+    *icc_data_len = 0;
+    *icc_data_ptr = nullptr;
+    return FALSE;
+  }
+  *icc_data_len = profile.size();
+  // The copy is owned by the caller.
+  *icc_data_ptr = static_cast<JOCTET*>(malloc(*icc_data_len));
+  if (*icc_data_ptr == nullptr) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: Out of memory");
+  }
+  memcpy(*icc_data_ptr, profile.data(), *icc_data_len);
+  return TRUE;
+}
+
+// Computes cinfo->output_width / output_height and the scaled DCT size of
+// every component from the requested scale_num / scale_denom ratio.
+// Requires that the SOF marker was already parsed; scaling is not allowed in
+// raw data output mode.
+void jpegli_core_output_dimensions(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_sof_) {
+    JPEGLI_ERROR("No SOF marker found.");
+  }
+  if (cinfo->raw_data_out) {
+    if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+      JPEGLI_ERROR("Output scaling is not supported in raw output mode");
+    }
+  }
+  if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+    // Find the largest scaled DCT size in [1, 16] that satisfies
+    // scale_num * DCTSIZE > scale_denom * (dctsize - 1). The lower bound on
+    // dctsize keeps the search finite when the left-hand side is zero (e.g.
+    // scale_num == 0), in which case the smallest scale is selected.
+    int dctsize = 16;
+    while (dctsize > 1 &&
+           cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * (dctsize - 1)) {
+      --dctsize;
+    }
+    m->min_scaled_dct_size = dctsize;
+    cinfo->output_width =
+        jpegli::DivCeil(cinfo->image_width * dctsize, DCTSIZE);
+    cinfo->output_height =
+        jpegli::DivCeil(cinfo->image_height * dctsize, DCTSIZE);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->scaled_dct_size[c] = m->min_scaled_dct_size;
+    }
+  } else {
+    // No scaling: output has the size of the image.
+    cinfo->output_width = cinfo->image_width;
+    cinfo->output_height = cinfo->image_height;
+    m->min_scaled_dct_size = DCTSIZE;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->scaled_dct_size[c] = DCTSIZE;
+    }
+  }
+}
+
+// Computes all output-related dimensions: the output image size, the
+// per-component upsampling factors and scaled DCT sizes, and the number of
+// output color components.
+void jpegli_calc_output_dimensions(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  jpegli_core_output_dimensions(cinfo);
+  const bool scaled = cinfo->scale_num != 1 || cinfo->scale_denom != 1;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info& comp = cinfo->comp_info[c];
+    m->h_factor[c] = cinfo->max_h_samp_factor / comp.h_samp_factor;
+    m->v_factor[c] = cinfo->max_v_samp_factor / comp.v_samp_factor;
+    if (!scaled) continue;
+    // Prefer IDCT scaling over 2x upsampling.
+    while (m->scaled_dct_size[c] < DCTSIZE && (m->v_factor[c] % 2) == 0 &&
+           (m->h_factor[c] % 2) == 0) {
+      m->scaled_dct_size[c] *= 2;
+      m->v_factor[c] /= 2;
+      m->h_factor[c] /= 2;
+    }
+  }
+  switch (cinfo->out_color_space) {
+    case JCS_GRAYSCALE:
+      cinfo->out_color_components = 1;
+      break;
+    case JCS_RGB:
+    case JCS_YCbCr:
+      cinfo->out_color_components = 3;
+      break;
+    case JCS_CMYK:
+    case JCS_YCCK:
+      cinfo->out_color_components = 4;
+      break;
+    default:
+      cinfo->out_color_components = cinfo->num_components;
+      break;
+  }
+  // Quantized output consists of colormap indices, one per pixel.
+  cinfo->output_components =
+      cinfo->quantize_colors ? 1 : cinfo->out_color_components;
+  cinfo->rec_outbuf_height = 1;
+}
+
+// Returns whether the decoder flagged the stream as multi-scan. Raises an
+// error if no scan was started yet (input_scan_number is still zero).
+boolean jpegli_has_multiple_scans(j_decompress_ptr cinfo) {
+  if (cinfo->input_scan_number == 0) {
+    JPEGLI_ERROR("No SOS marker found.");
+  }
+  return cinfo->master->is_multiscan_;
+}
+
+// Returns whether the EOI marker was already reached in the input.
+boolean jpegli_input_complete(j_decompress_ptr cinfo) {
+  return cinfo->master->found_eoi_;
+}
+
+// Starts decompression after the header was read. Returns FALSE if the data
+// source suspended before the output could be prepared; in that case the
+// function has to be called again when more input is available.
+boolean jpegli_start_decompress(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state == jpegli::kDecHeaderDone) {
+    // First call: set up the decoder for the first scan.
+    // Streaming (row-by-row) decoding is only possible for single-scan
+    // images that do not need buffering for other reasons.
+    m->streaming_mode_ = !m->is_multiscan_ && !cinfo->buffered_image &&
+                         (!cinfo->quantize_colors || !cinfo->two_pass_quantize);
+    jpegli::AllocateCoefficientBuffer(cinfo);
+    jpegli_calc_output_dimensions(cinfo);
+    jpegli::PrepareForScan(cinfo);
+    if (cinfo->quantize_colors) {
+      // Choose exactly one of the color quantization methods.
+      if (cinfo->colormap != nullptr) {
+        cinfo->enable_external_quant = TRUE;
+      } else if (cinfo->two_pass_quantize &&
+                 cinfo->out_color_space == JCS_RGB) {
+        cinfo->enable_2pass_quant = TRUE;
+      } else {
+        cinfo->enable_1pass_quant = TRUE;
+      }
+    }
+    jpegli::InitProgressMonitor(cinfo, /*coef_only=*/false);
+    if (cinfo->buffered_image == TRUE) {
+      // In buffered-image mode the output is prepared by
+      // jpegli_start_output().
+      cinfo->output_scan_number = 0;
+      return TRUE;
+    }
+  } else if (!m->is_multiscan_) {
+    // Only multi-scan input can be re-entered here after a suspension.
+    JPEGLI_ERROR("jpegli_start_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (m->is_multiscan_) {
+    if (cinfo->global_state != jpegli::kDecProcessScan &&
+        cinfo->global_state != jpegli::kDecProcessMarkers) {
+      JPEGLI_ERROR("jpegli_start_decompress: unexpected state %d",
+                   cinfo->global_state);
+    }
+    // Consume the whole input before producing any output.
+    while (!m->found_eoi_) {
+      jpegli::ProgressMonitorInputPass(cinfo);
+      if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return FALSE;
+      }
+    }
+  }
+  cinfo->output_scan_number = cinfo->input_scan_number;
+  jpegli::PrepareForOutput(cinfo);
+  if (cinfo->quantize_colors) {
+    return jpegli::PrepareQuantizedOutput(cinfo);
+  } else {
+    return TRUE;
+  }
+}
+
+// Starts an output pass for scan `scan_number` in buffered-image mode.
+// Returns FALSE if preparing the quantized output was suspended.
+boolean jpegli_start_output(j_decompress_ptr cinfo, int scan_number) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_start_output: buffered image mode was not set");
+  }
+  const bool decoding = cinfo->global_state == jpegli::kDecProcessScan ||
+                        cinfo->global_state == jpegli::kDecProcessMarkers;
+  if (!decoding) {
+    JPEGLI_ERROR("jpegli_start_output: unexpected state %d",
+                 cinfo->global_state);
+  }
+  // Scan numbers start at 1, and once the input is complete they cannot
+  // exceed the number of scans read.
+  int target_scan = std::max(1, scan_number);
+  if (m->found_eoi_) {
+    target_scan = std::min(target_scan, cinfo->input_scan_number);
+  }
+  cinfo->output_scan_number = target_scan;
+  jpegli::InitProgressMonitorForOutput(cinfo);
+  // TODO(szabadka): Figure out how much we can reuse.
+  jpegli::PrepareForOutput(cinfo);
+  if (!cinfo->quantize_colors) {
+    return TRUE;
+  }
+  return jpegli::PrepareQuantizedOutput(cinfo);
+}
+
+// Finishes the current output pass in buffered-image mode by consuming input
+// until the next scan starts or the input ends. Returns FALSE on suspension.
+boolean jpegli_finish_output(j_decompress_ptr cinfo) {
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_finish_output: buffered image mode was not set");
+  }
+  const bool decoding = cinfo->global_state == jpegli::kDecProcessScan ||
+                        cinfo->global_state == jpegli::kDecProcessMarkers;
+  if (!decoding) {
+    JPEGLI_ERROR("jpegli_finish_output: unexpected state %d",
+                 cinfo->global_state);
+  }
+  // Advance input to the start of the next scan, or to the end of input.
+  for (;;) {
+    const bool input_ahead =
+        cinfo->input_scan_number > cinfo->output_scan_number;
+    if (input_ahead || cinfo->master->found_eoi_) {
+      break;
+    }
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return FALSE;
+    }
+  }
+  return TRUE;
+}
+
+// Decodes up to `max_lines` scanlines into `scanlines` and returns the number
+// of rows actually produced, which can be smaller if the data source suspends
+// or the bottom of the image is reached.
+JDIMENSION jpegli_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                                 JDIMENSION max_lines) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_read_scanlines: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->buffered_image) {
+    if (cinfo->output_scan_number == 0) {
+      JPEGLI_ERROR(
+          "jpegli_read_scanlines: "
+          "jpegli_start_output() was not called");
+    }
+  } else if (m->is_multiscan_ && !m->found_eoi_) {
+    JPEGLI_ERROR(
+        "jpegli_read_scanlines: "
+        "jpegli_start_decompress() did not finish");
+  }
+  // Clamp the request to the rows that are left. The comparison is done on
+  // the remaining row count so that a huge max_lines cannot wrap around in
+  // the unsigned addition and bypass the clamp.
+  const JDIMENSION lines_left =
+      cinfo->output_height > cinfo->output_scanline
+          ? cinfo->output_height - cinfo->output_scanline
+          : 0;
+  if (max_lines > lines_left) {
+    max_lines = lines_left;
+  }
+  jpegli::ProgressMonitorOutputPass(cinfo);
+  size_t num_output_rows = 0;
+  while (num_output_rows < max_lines) {
+    if (jpegli::IsInputReady(cinfo)) {
+      jpegli::ProcessOutput(cinfo, &num_output_rows, scanlines, max_lines);
+    } else if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      break;
+    }
+  }
+  return num_output_rows;
+}
+
+// Skips `num_lines` scanlines by running the regular scanline reader with a
+// null output buffer; returns the number of rows actually skipped.
+JDIMENSION jpegli_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines) {
+  // TODO(szabadka) Skip the IDCT for skipped over blocks.
+  return jpegli_read_scanlines(cinfo, nullptr, num_lines);
+}
+
+// Restricts the output to the horizontal range [*xoffset, *xoffset + *width).
+// The left edge is moved down to the nearest iMCU boundary, and both
+// `*xoffset` and `*width` are updated to the range that will actually be
+// output. Must be called before the first scanline is read.
+void jpegli_crop_scanline(j_decompress_ptr cinfo, JDIMENSION* xoffset,
+                          JDIMENSION* width) {
+  jpeg_decomp_master* m = cinfo->master;
+  if ((cinfo->global_state != jpegli::kDecProcessScan &&
+       cinfo->global_state != jpegli::kDecProcessMarkers) ||
+      cinfo->output_scanline != 0) {
+    JPEGLI_ERROR("jpegli_crop_scanline: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->raw_data_out) {
+    JPEGLI_ERROR("Output cropping is not supported in raw data mode");
+  }
+  // The range check is written without computing *xoffset + *width, which
+  // could wrap around for large arguments and pass the validation.
+  if (xoffset == nullptr || width == nullptr || *width == 0 ||
+      *width > cinfo->output_width ||
+      *xoffset > cinfo->output_width - *width) {
+    JPEGLI_ERROR("jpegli_crop_scanline: Invalid arguments");
+  }
+  // TODO(szabadka) Skip the IDCT for skipped over blocks.
+  size_t xend = *xoffset + *width;
+  size_t iMCU_width = m->min_scaled_dct_size * cinfo->max_h_samp_factor;
+  // Round the left edge down to an iMCU boundary and keep the right edge.
+  *xoffset = (*xoffset / iMCU_width) * iMCU_width;
+  *width = xend - *xoffset;
+  cinfo->master->xoffset_ = *xoffset;
+  cinfo->output_width = *width;
+}
+
+// Reads one iMCU row of raw (not upsampled, not color converted) data into
+// `data`. `max_lines` must be at least the height of an iMCU row. Returns the
+// number of lines read, or 0 if the data source suspended or there are no
+// more rows.
+JDIMENSION jpegli_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                JDIMENSION max_lines) {
+  const bool decoding = cinfo->global_state == jpegli::kDecProcessScan ||
+                        cinfo->global_state == jpegli::kDecProcessMarkers;
+  if (!decoding || !cinfo->raw_data_out) {
+    JPEGLI_ERROR("jpegli_read_raw_data: unexpected state %d",
+                 cinfo->global_state);
+  }
+  const size_t lines_per_iMCU_row = cinfo->max_v_samp_factor * DCTSIZE;
+  if (max_lines < lines_per_iMCU_row) {
+    JPEGLI_ERROR("jpegli_read_raw_data: output buffer too small");
+  }
+  jpegli::ProgressMonitorOutputPass(cinfo);
+  // Make sure that the input for the next iMCU row is available.
+  while (!jpegli::IsInputReady(cinfo)) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return 0;
+    }
+  }
+  if (cinfo->output_iMCU_row >= cinfo->total_iMCU_rows) {
+    return 0;
+  }
+  jpegli::ProcessRawOutput(cinfo, data);
+  return lines_per_iMCU_row;
+}
+
+// Returns the virtual arrays holding the DCT coefficients of the image.
+// Outside of buffered-image mode the whole input is consumed first; nullptr
+// is returned if the data source suspends before that is finished.
+jvirt_barray_ptr* jpegli_read_coefficients(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  // All coefficients have to be kept, so streaming mode cannot be used.
+  m->streaming_mode_ = false;
+  if (!cinfo->buffered_image && cinfo->global_state == jpegli::kDecHeaderDone) {
+    // First call after reading the header: set up decoding of the first scan.
+    jpegli::AllocateCoefficientBuffer(cinfo);
+    jpegli_calc_output_dimensions(cinfo);
+    jpegli::InitProgressMonitor(cinfo, /*coef_only=*/true);
+    jpegli::PrepareForScan(cinfo);
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_read_coefficients: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image) {
+    while (!m->found_eoi_) {
+      jpegli::ProgressMonitorInputPass(cinfo);
+      if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return nullptr;
+      }
+    }
+    // Mark the output as complete so that jpegli_finish_decompress() does not
+    // report "Incomplete output".
+    cinfo->output_scanline = cinfo->output_height;
+  }
+  return m->coef_arrays;
+}
+
+// Finishes decompression: consumes the rest of the input up to EOI,
+// terminates the data source and resets the decompression object. Returns
+// FALSE if the data source suspended before EOI was reached.
+boolean jpegli_finish_decompress(j_decompress_ptr cinfo) {
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_finish_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  // Outside of buffered-image mode all scanlines must have been read.
+  if (!cinfo->buffered_image && cinfo->output_scanline < cinfo->output_height) {
+    JPEGLI_ERROR("Incomplete output");
+  }
+  while (!cinfo->master->found_eoi_) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return FALSE;
+    }
+  }
+  (*cinfo->src->term_source)(cinfo);
+  jpegli_abort_decompress(cinfo);
+  return TRUE;
+}
+
+// Default handler for an unexpected restart marker: `desired` is the index
+// (0..7) of the restart marker that was expected. Only emits a warning and
+// always returns TRUE.
+boolean jpegli_resync_to_restart(j_decompress_ptr cinfo, int desired) {
+  JPEGLI_WARN("Invalid restart marker found: 0x%02x vs 0x%02x.",
+              cinfo->unread_marker, 0xd0 + desired);
+  // This is a trivial implementation, we just let the decoder skip the entire
+  // scan and attempt to render the partial input.
+  return TRUE;
+}
+
+// Notifies the decoder that the application changed the external colormap, so
+// that the inverse colormap is rebuilt before the next output pass. Only
+// valid in buffered-image mode with external colormap quantization.
+void jpegli_new_colormap(j_decompress_ptr cinfo) {
+  const bool decoding = cinfo->global_state == jpegli::kDecProcessScan ||
+                        cinfo->global_state == jpegli::kDecProcessMarkers;
+  if (!decoding) {
+    JPEGLI_ERROR("jpegli_new_colormap: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_new_colormap: not in  buffered image mode");
+  }
+  if (!cinfo->enable_external_quant) {
+    JPEGLI_ERROR("external colormap quantizer was not enabled");
+  }
+  const bool has_external_colormap =
+      cinfo->quantize_colors && cinfo->colormap != nullptr;
+  if (!has_external_colormap) {
+    JPEGLI_ERROR("jpegli_new_colormap: not in external colormap mode");
+  }
+  cinfo->master->regenerate_inverse_colormap_ = true;
+}
+
+// Selects the sample type and the byte order of the output samples.
+// Unsupported values raise an error.
+void jpegli_set_output_format(j_decompress_ptr cinfo, JpegliDataType data_type,
+                              JpegliEndianness endianness) {
+  jpeg_decomp_master* master = cinfo->master;
+  const bool type_supported = data_type == JPEGLI_TYPE_UINT8 ||
+                              data_type == JPEGLI_TYPE_UINT16 ||
+                              data_type == JPEGLI_TYPE_FLOAT;
+  if (type_supported) {
+    master->output_data_type_ = data_type;
+  } else {
+    JPEGLI_ERROR("Unsupported data type %d", data_type);
+  }
+  // Bytes are swapped exactly when the requested order differs from the
+  // native one.
+  if (endianness == JPEGLI_NATIVE_ENDIAN) {
+    master->swap_endianness_ = false;
+  } else if (endianness == JPEGLI_LITTLE_ENDIAN) {
+    master->swap_endianness_ = !IsLittleEndian();
+  } else if (endianness == JPEGLI_BIG_ENDIAN) {
+    master->swap_endianness_ = IsLittleEndian();
+  } else {
+    JPEGLI_ERROR("Unsupported endianness %d", endianness);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/decode.h b/third_party/jpeg-xl/lib/jpegli/decode.h
new file mode 100644
index 0000000000..7b7a2034ad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains the C API of the decoder part of the libjpegli library,
+// which is based on the C API of libjpeg, with the function names changed from
+// jpeg_* to jpegli_*, while decompressor object definitions are included
+// directly from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_DECODE_H_
+#define LIB_JPEGLI_DECODE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Convenience wrapper around jpegli_CreateDecompress() that passes along the
+// JPEG_LIB_VERSION and the size of struct jpeg_decompress_struct as seen by
+// the code that expands this macro.
+#define jpegli_create_decompress(cinfo)              \
+  jpegli_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                          (size_t)sizeof(struct jpeg_decompress_struct))
+
+void jpegli_CreateDecompress(j_decompress_ptr cinfo, int version,
+                             size_t structsize);
+
+void jpegli_stdio_src(j_decompress_ptr cinfo, FILE *infile);
+
+void jpegli_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                    unsigned long insize);
+
+int jpegli_read_header(j_decompress_ptr cinfo, boolean require_image);
+
+boolean jpegli_start_decompress(j_decompress_ptr cinfo);
+
+JDIMENSION jpegli_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                                 JDIMENSION max_lines);
+
+JDIMENSION jpegli_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+void jpegli_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                          JDIMENSION *width);
+
+boolean jpegli_finish_decompress(j_decompress_ptr cinfo);
+
+JDIMENSION jpegli_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                JDIMENSION max_lines);
+
+jvirt_barray_ptr *jpegli_read_coefficients(j_decompress_ptr cinfo);
+
+boolean jpegli_has_multiple_scans(j_decompress_ptr cinfo);
+
+boolean jpegli_start_output(j_decompress_ptr cinfo, int scan_number);
+
+boolean jpegli_finish_output(j_decompress_ptr cinfo);
+
+boolean jpegli_input_complete(j_decompress_ptr cinfo);
+
+int jpegli_consume_input(j_decompress_ptr cinfo);
+
+#if JPEG_LIB_VERSION >= 80
+void jpegli_core_output_dimensions(j_decompress_ptr cinfo);
+#endif
+void jpegli_calc_output_dimensions(j_decompress_ptr cinfo);
+
+void jpegli_save_markers(j_decompress_ptr cinfo, int marker_code,
+                         unsigned int length_limit);
+
+void jpegli_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                                 jpeg_marker_parser_method routine);
+
+boolean jpegli_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+boolean jpegli_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                                unsigned int *icc_data_len);
+
+void jpegli_abort_decompress(j_decompress_ptr cinfo);
+
+void jpegli_destroy_decompress(j_decompress_ptr cinfo);
+
+void jpegli_new_colormap(j_decompress_ptr cinfo);
+
+//
+// New API functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+void jpegli_set_output_format(j_decompress_ptr cinfo, JpegliDataType data_type,
+                              JpegliEndianness endianness);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_DECODE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc
new file mode 100644
index 0000000000..0bb7321db4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc
@@ -0,0 +1,1305 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+// Two-byte marker (0xff 0xd9) that SourceManager::fill_input_buffer() hands to
+// the decoder once the real input has been used up.
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+// Number of chunk buffers that SourceManager allocates and cycles through.
+static constexpr size_t kNumSourceBuffers = 4;
+
+// Custom source manager that refills the input buffer in chunks, simulating
+// a file reader with a fixed buffer size.
+class SourceManager {
+ public:
+  // Serves the `len` bytes at `data` to the decoder in chunks of at most
+  // `max_chunk_size` bytes; a `max_chunk_size` of 0 selects a single chunk
+  // covering the whole input. Only the pointer is stored here, the bytes are
+  // copied chunk by chunk in fill_input_buffer().
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size)
+      : data_(data), len_(len), max_chunk_size_(max_chunk_size) {
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+    buffers_.resize(kNumSourceBuffers, std::vector<uint8_t>(max_chunk_size_));
+    Reset();
+  }
+
+  // Empties the public buffer and rewinds the read position and chunk counter
+  // to the start of the input.
+  void Reset() {
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pos_ = 0;
+    chunk_idx_ = 0;
+  }
+
+  // Expects that no bytes are left in the buffer and that the read position
+  // has reached the (possibly extended, see fill_input_buffer) input length.
+  ~SourceManager() {
+    EXPECT_EQ(0, pub_.bytes_in_buffer);
+    EXPECT_EQ(len_, pos_);
+  }
+
+ private:
+  // The tests install this object via
+  // reinterpret_cast<jpeg_source_mgr*>(&src) and the callbacks below cast
+  // cinfo->src back to SourceManager*, so pub_ has to stay the first member.
+  jpeg_source_mgr pub_;
+  // Start of the input bytes.
+  const uint8_t* data_;
+  // Input length; grows by 2 every time the fake EOI marker is handed out.
+  size_t len_;
+  // Running chunk counter, used to pick the next entry of buffers_.
+  size_t chunk_idx_;
+  // Number of input bytes handed out or skipped so far.
+  size_t pos_;
+  // Upper bound on the number of bytes handed out per fill_input_buffer call.
+  size_t max_chunk_size_;
+  // kNumSourceBuffers buffers of max_chunk_size_ bytes each.
+  std::vector<std::vector<uint8_t>> buffers_;
+
+  static void init_source(j_decompress_ptr cinfo) {}
+
+  // Copies the next chunk of input into the next buffer of the rotation and
+  // publishes it. Once the input is exhausted, publishes kFakeEoiMarker
+  // instead and extends len_ by the same 2 bytes that pos_ advances by, which
+  // keeps the destructor's len_ == pos_ expectation satisfiable.
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (src->pos_ < src->len_) {
+      size_t chunk_size = std::min(src->len_ - src->pos_, src->max_chunk_size_);
+      size_t next_idx = ++src->chunk_idx_ % kNumSourceBuffers;
+      uint8_t* next_buffer = src->buffers_[next_idx].data();
+      memcpy(next_buffer, src->data_ + src->pos_, chunk_size);
+      src->pub_.next_input_byte = next_buffer;
+      src->pub_.bytes_in_buffer = chunk_size;
+    } else {
+      src->pub_.next_input_byte = kFakeEoiMarker;
+      src->pub_.bytes_in_buffer = 2;
+      src->len_ += 2;
+    }
+    src->pos_ += src->pub_.bytes_in_buffer;
+    return TRUE;
+  }
+
+  // Skips `num_bytes` of input; non-positive counts are ignored. If the
+  // current buffer holds enough bytes the skip happens inside it, otherwise
+  // the buffer is emptied and pos_ is advanced by the part of the skip that
+  // the buffer did not cover, so the next fill resumes after the skipped data.
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+// Log of the marker codes passed to test_marker_processor(), in call order.
+uint8_t markers_seen[kMarkerSequenceLen];
+// Number of test_marker_processor() calls so far; TestAPINonBuffered() resets
+// it to 0 before reading a header when jparams.add_marker is set.
+size_t num_markers_seen = 0;
+
+// Pops one byte from the source manager of `cinfo`, asking the source to
+// refill its buffer first when that buffer is empty.
+uint8_t get_next_byte(j_decompress_ptr cinfo) {
+  if (cinfo->src->bytes_in_buffer == 0) {
+    cinfo->src->fill_input_buffer(cinfo);
+  }
+  // Look the source up only after the refill callback has run.
+  jpeg_source_mgr* source = cinfo->src;
+  const uint8_t value = *source->next_input_byte;
+  ++source->next_input_byte;
+  --source->bytes_in_buffer;
+  return value;
+}
+
+// Marker processor installed by the tests. Logs the code of the marker that is
+// being processed, reads its two-byte big-endian length field, checks the
+// length against the value expected for the current position in the marker
+// sequence, and skips the marker payload. Always returns TRUE.
+boolean test_marker_processor(j_decompress_ptr cinfo) {
+  // Never write past the end of the fixed-size log. The call is still counted
+  // below, so callers that compare num_markers_seen against the expected
+  // sequence length will notice the surplus markers.
+  if (num_markers_seen < sizeof(markers_seen) / sizeof(markers_seen[0])) {
+    markers_seen[num_markers_seen] = cinfo->unread_marker;
+  }
+  // Read the two length bytes in separate statements: the operands of a single
+  // `+` expression may be evaluated in either order, which could swap the high
+  // and the low byte.
+  const size_t len_hi = get_next_byte(cinfo);
+  const size_t len_lo = get_next_byte(cinfo);
+  const size_t marker_len = (len_hi << 8) + len_lo;
+  EXPECT_EQ(2 + ((num_markers_seen + 2) % sizeof(kMarkerData)), marker_len);
+  // The length field counts its own two bytes, which were consumed above.
+  if (marker_len > 2) {
+    (*cinfo->src->skip_input_data)(cinfo, marker_len - 2);
+  }
+  ++num_markers_seen;
+  return TRUE;
+}
+
+// Reads the output of the decompression that was already started on `cinfo`
+// into `output`, either as raw component planes (cinfo->raw_data_out) or as
+// interleaved pixel rows. With dparams.crop_output only a window starting at
+// one third of the width/height and one third of the image in size is
+// requested. The number of rows asked for per read call is limited by
+// dparams.max_output_lines (0 means no limit).
+void ReadOutputImage(const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     TestImage* output) {
+  JDIMENSION xoffset = 0;
+  JDIMENSION yoffset = 0;
+  JDIMENSION xsize_cropped = cinfo->output_width;
+  JDIMENSION ysize_cropped = cinfo->output_height;
+  if (dparams.crop_output) {
+    xoffset = xsize_cropped = cinfo->output_width / 3;
+    yoffset = ysize_cropped = cinfo->output_height / 3;
+    // Horizontal cropping is delegated to the library, which receives the
+    // requested window by pointer; vertical cropping is done further below by
+    // skipping scanlines.
+    jpegli_crop_scanline(cinfo, &xoffset, &xsize_cropped);
+  }
+  output->ysize = ysize_cropped;
+  // Read back from cinfo rather than from xsize_cropped.
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->out_color_components;
+  output->data_type = dparams.data_type;
+  output->endianness = dparams.endianness;
+  size_t bytes_per_sample = jpegli_bytes_per_sample(dparams.data_type);
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    // One plane per component, sized in whole DCT blocks.
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize * bytes_per_sample);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+  } else {
+    output->color_space = cinfo->out_color_space;
+    output->AllocatePixels();
+  }
+  size_t total_output_lines = 0;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    size_t max_lines;
+    size_t num_output_lines;
+    if (cinfo->raw_data_out) {
+      // Raw data is read one iMCU row at a time.
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      EXPECT_EQ(cinfo->output_scanline, cinfo->output_iMCU_row * iMCU_height);
+      max_lines = iMCU_height;
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        // Rows that would fall below the end of the plane get a null pointer.
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      num_output_lines = jpegli_read_raw_data(cinfo, &data[0], max_lines);
+    } else {
+      size_t max_output_lines = dparams.max_output_lines;
+      if (max_output_lines == 0) max_output_lines = cinfo->output_height;
+      if (cinfo->output_scanline < yoffset) {
+        // Above the crop window: skip up to its first row.
+        max_lines = yoffset - cinfo->output_scanline;
+        num_output_lines = jpegli_skip_scanlines(cinfo, max_lines);
+      } else if (cinfo->output_scanline >= yoffset + ysize_cropped) {
+        // Below the crop window: skip the rest of the image.
+        max_lines = cinfo->output_height - cinfo->output_scanline;
+        num_output_lines = jpegli_skip_scanlines(cinfo, max_lines);
+      } else {
+        // Inside the crop window: read rows directly into output->pixels.
+        size_t lines_left = yoffset + ysize_cropped - cinfo->output_scanline;
+        max_lines = std::min<size_t>(max_output_lines, lines_left);
+        size_t stride = cinfo->output_width * cinfo->out_color_components *
+                        bytes_per_sample;
+        std::vector<JSAMPROW> scanlines(max_lines);
+        for (size_t i = 0; i < max_lines; ++i) {
+          size_t yidx = cinfo->output_scanline - yoffset + i;
+          scanlines[i] = &output->pixels[yidx * stride];
+        }
+        num_output_lines =
+            jpegli_read_scanlines(cinfo, &scanlines[0], max_lines);
+        if (cinfo->quantize_colors) {
+          // NOTE(review): UnmapColors presumably replaces colormap indices by
+          // the colors they stand for - confirm in test_utils.
+          for (size_t i = 0; i < num_output_lines; ++i) {
+            UnmapColors(scanlines[i], cinfo->output_width,
+                        cinfo->out_color_components, cinfo->colormap,
+                        cinfo->actual_number_of_colors);
+          }
+        }
+      }
+    }
+    // Every call is expected to deliver exactly the number of rows asked for.
+    total_output_lines += num_output_lines;
+    EXPECT_EQ(total_output_lines, cinfo->output_scanline);
+    EXPECT_EQ(num_output_lines, max_lines);
+  }
+  EXPECT_EQ(cinfo->total_iMCU_rows,
+            DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+// Parameters of one parameterized decoder test case.
+struct TestConfig {
+  // Test data file to decode; when empty, GetTestJpegData() generates and
+  // encodes `input` instead.
+  std::string fn;
+  // Short label for `fn` (NOTE(review): presumably used for test names - its
+  // consumer is not visible in this part of the file).
+  std::string fn_desc;
+  // Source image for generated test cases.
+  TestImage input;
+  // Encoder settings used when the input is generated.
+  CompressParams jparams;
+  // Decoder settings of the test case.
+  DecompressParams dparams;
+  // If true, both decoders are compared by their RMS distance to `input`
+  // rather than directly against each other.
+  bool compare_to_orig = false;
+  // Allowed ratio between the two RMS distances when compare_to_orig is set.
+  float max_tolerance_factor = 1.01f;
+  // Limits forwarded to VerifyOutputImage() when compare_to_orig is not set.
+  float max_rms_dist = 1.0f;
+  float max_diff = 35.0f;
+};
+
+// Produces the JPEG bytes for `config`: either the generated input image
+// encoded with jpegli (when no file name is set; this also fills in
+// config.input) or the contents of the configured test data file. A
+// dparams.size_factor below 1 truncates the stream to that fraction of its
+// size.
+std::vector<uint8_t> GetTestJpegData(TestConfig& config) {
+  std::vector<uint8_t> jpeg_bytes;
+  if (config.fn.empty()) {
+    GeneratePixels(&config.input);
+    JXL_CHECK(EncodeWithJpegli(config.input, config.jparams, &jpeg_bytes));
+  } else {
+    jpeg_bytes = ReadTestData(config.fn.c_str());
+  }
+  if (config.dparams.size_factor < 1.0f) {
+    const size_t truncated_size =
+        jpeg_bytes.size() * config.dparams.size_factor;
+    jpeg_bytes.resize(truncated_size);
+  }
+  return jpeg_bytes;
+}
+
+// Decodes one image in non-buffered mode from the source that is already
+// attached to `cinfo` and stores the result in `output`.
+//
+// Along the way it checks the marker processor callbacks (when the stream was
+// written with jparams.add_marker), the embedded ICC profile (when jparams.icc
+// is not empty), the header fields and the output width, which must not be
+// smaller than expected_output.xsize and must equal it unless the output is
+// cropped. With dparams.output_mode == COEFFICIENTS the DCT coefficients are
+// copied instead of decoded pixels.
+void TestAPINonBuffered(const CompressParams& jparams,
+                        const DecompressParams& dparams,
+                        const TestImage& expected_output,
+                        j_decompress_ptr cinfo, TestImage* output) {
+  if (jparams.add_marker) {
+    jpegli_save_markers(cinfo, kSpecialMarker0, 0xffff);
+    jpegli_save_markers(cinfo, kSpecialMarker1, 0xffff);
+    num_markers_seen = 0;
+    jpegli_set_marker_processor(cinfo, 0xe6, test_marker_processor);
+    jpegli_set_marker_processor(cinfo, 0xe7, test_marker_processor);
+    jpegli_set_marker_processor(cinfo, 0xe8, test_marker_processor);
+  }
+  if (!jparams.icc.empty()) {
+    jpegli_save_markers(cinfo, JPEG_APP0 + 2, 0xffff);
+  }
+  jpegli_read_header(cinfo, /*require_image=*/TRUE);
+  if (jparams.add_marker) {
+    EXPECT_EQ(num_markers_seen, kMarkerSequenceLen);
+    // Compare the logged codes only when the count is right, so that memcmp
+    // can never read past the end of either array.
+    if (num_markers_seen == kMarkerSequenceLen) {
+      EXPECT_EQ(0, memcmp(markers_seen, kMarkerSequence, kMarkerSequenceLen));
+    }
+  }
+  if (!jparams.icc.empty()) {
+    uint8_t* icc_data = nullptr;
+    unsigned int icc_len = 0;
+    JXL_CHECK(jpegli_read_icc_profile(cinfo, &icc_data, &icc_len));
+    JXL_CHECK(icc_data);
+    // The lengths have to match as well: comparing only icc_len bytes would
+    // accept a truncated profile and could read past the end of jparams.icc
+    // if the decoder returned a longer one.
+    EXPECT_EQ(jparams.icc.size(), icc_len);
+    if (jparams.icc.size() == icc_len) {
+      EXPECT_EQ(0, memcmp(jparams.icc.data(), icc_data, icc_len));
+    }
+    free(icc_data);
+  }
+  // Check that jpegli_calc_output_dimensions can be called multiple times
+  // even with different parameters.
+  if (!cinfo->raw_data_out) {
+    cinfo->scale_num = 1;
+    cinfo->scale_denom = 2;
+  }
+  jpegli_calc_output_dimensions(cinfo);
+  SetDecompressParams(dparams, cinfo, /*is_jpegli=*/true);
+  VerifyHeader(jparams, cinfo);
+  jpegli_calc_output_dimensions(cinfo);
+  EXPECT_LE(expected_output.xsize, cinfo->output_width);
+  if (!dparams.crop_output) {
+    EXPECT_EQ(expected_output.xsize, cinfo->output_width);
+  }
+  if (dparams.output_mode == COEFFICIENTS) {
+    jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(cinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    CopyCoefficients(cinfo, coef_arrays, output);
+  } else {
+    jpegli_start_decompress(cinfo);
+    VerifyScanHeader(jparams, cinfo);
+    ReadOutputImage(dparams, cinfo, output);
+  }
+  jpegli_finish_decompress(cinfo);
+}
+
+// Decodes the stream attached to `cinfo` in buffered image mode and appends
+// one TestImage per rendered scan to `output_progression`. The bookkeeping of
+// input_scan_number and output_scan_number is checked after every API call.
+// With dparams.skip_scans, scans with an even input scan number are consumed
+// without being rendered.
+void TestAPIBuffered(const CompressParams& jparams,
+                     const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     std::vector<TestImage>* output_progression) {
+  EXPECT_EQ(JPEG_REACHED_SOS,
+            jpegli_read_header(cinfo, /*require_image=*/TRUE));
+  cinfo->buffered_image = TRUE;
+  SetDecompressParams(dparams, cinfo, /*is_jpegli=*/true);
+  VerifyHeader(jparams, cinfo);
+  EXPECT_TRUE(jpegli_start_decompress(cinfo));
+  // start decompress should not read the whole input in buffered image mode
+  EXPECT_FALSE(jpegli_input_complete(cinfo));
+  bool has_multiple_scans = jpegli_has_multiple_scans(cinfo);
+  EXPECT_EQ(0, cinfo->output_scan_number);
+  int sos_marker_cnt = 1;  // read_header reads the first SOS marker
+  while (!jpegli_input_complete(cinfo)) {
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    if (dparams.skip_scans && (cinfo->input_scan_number % 2) != 1) {
+      // Consume input until the next SOS marker or the end of the image
+      // without producing any output for this scan.
+      int result = JPEG_SUSPENDED;
+      while (result != JPEG_REACHED_SOS && result != JPEG_REACHED_EOI) {
+        result = jpegli_consume_input(cinfo);
+      }
+      if (result == JPEG_REACHED_SOS) ++sos_marker_cnt;
+      continue;
+    }
+    SetScanDecompressParams(dparams, cinfo, cinfo->input_scan_number,
+                            /*is_jpegli=*/true);
+    EXPECT_TRUE(jpegli_start_output(cinfo, cinfo->input_scan_number));
+    // start output sets output_scan_number, but does not change
+    // input_scan_number
+    EXPECT_EQ(cinfo->output_scan_number, cinfo->input_scan_number);
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    VerifyScanHeader(jparams, cinfo);
+    TestImage output;
+    ReadOutputImage(dparams, cinfo, &output);
+    output_progression->emplace_back(std::move(output));
+    // read scanlines/read raw data does not change input/output scan number
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    EXPECT_EQ(cinfo->output_scan_number, cinfo->input_scan_number);
+    EXPECT_TRUE(jpegli_finish_output(cinfo));
+    ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+    if (dparams.output_mode == COEFFICIENTS) {
+      // Coefficients are stored in the image that was just appended.
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(cinfo, coef_arrays, &output_progression->back());
+    }
+  }
+  jpegli_finish_decompress(cinfo);
+  // The scan count is only checked when the input was not truncated.
+  if (dparams.size_factor == 1.0f) {
+    EXPECT_EQ(has_multiple_scans, cinfo->input_scan_number > 1);
+  }
+}
+
+// Decodes many differently encoded streams with one and the same
+// jpeg_decompress_struct, covering chroma subsampling, progressive level,
+// output mode, cropping and output scaling, and compares every result with
+// the corresponding DecodeWithLibjpeg / DecodeAllScansWithLibjpeg output.
+// NOTE(review): ERROR_HANDLER_SETUP is defined elsewhere; it presumably makes
+// the lambda return false when the library reports an error - confirm.
+TEST(DecodeAPITest, ReuseCinfo) {
+  TestImage input, output, expected;
+  std::vector<TestImage> output_progression, expected_output_progression;
+  CompressParams jparams;
+  DecompressParams dparams;
+  std::vector<uint8_t> compressed;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    input.xsize = 129;
+    input.ysize = 73;
+    GeneratePixels(&input);
+    for (int h_samp : {2, 1}) {
+      for (int v_samp : {2, 1}) {
+        for (int progr : {0, 2}) {
+          jparams.h_sampling = {h_samp, 1, 1};
+          jparams.v_sampling = {v_samp, 1, 1};
+          jparams.progressive_mode = progr;
+          printf(
+              "Generating input with %dx%d chroma subsampling "
+              "progressive level %d\n",
+              h_samp, v_samp, progr);
+          JXL_CHECK(EncodeWithJpegli(input, jparams, &compressed));
+          for (JpegIOMode output_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+            for (bool crop : {true, false}) {
+              // Cropping is only exercised for pixel output.
+              if (crop && output_mode != PIXELS) continue;
+              for (int scale_num : {1, 2, 3, 4, 7, 8, 13, 16}) {
+                // Scaling (anything but 8/8) is only exercised for pixel
+                // output.
+                if (scale_num != 8 && output_mode != PIXELS) continue;
+                // Reduce scale_num/8 by common factors of two.
+                int scale_denom = 8;
+                while (scale_num % 2 == 0 && scale_denom % 2 == 0) {
+                  scale_num /= 2;
+                  scale_denom /= 2;
+                }
+                printf("Decoding with output mode %d output scaling %d/%d %s\n",
+                       output_mode, scale_num, scale_denom,
+                       crop ? "with cropped output" : "");
+                dparams.output_mode = output_mode;
+                dparams.scale_num = scale_num;
+                dparams.scale_denom = scale_denom;
+                expected.Clear();
+                DecodeWithLibjpeg(jparams, dparams, compressed, &expected);
+                output.Clear();
+                // Reset the fields that the previous iteration may have
+                // changed on the reused cinfo.
+                cinfo.buffered_image = false;
+                cinfo.raw_data_out = false;
+                cinfo.scale_num = cinfo.scale_denom = 1;
+                SourceManager src(compressed.data(), compressed.size(),
+                                  1u << 12);
+                cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+                // Read the header and abort, then decode the same input from
+                // the start with the same cinfo.
+                jpegli_read_header(&cinfo, /*require_image=*/TRUE);
+                jpegli_abort_decompress(&cinfo);
+                src.Reset();
+                TestAPINonBuffered(jparams, dparams, expected, &cinfo, &output);
+                float max_rms = output_mode == COEFFICIENTS ? 0.0f : 1.0f;
+                if (scale_num == 1 && scale_denom == 8 && h_samp != v_samp) {
+                  max_rms = 5.0f;  // libjpeg does not do fancy upsampling
+                }
+                VerifyOutputImage(expected, output, max_rms);
+                printf("Decoding in buffered image mode\n");
+                expected_output_progression.clear();
+                DecodeAllScansWithLibjpeg(jparams, dparams, compressed,
+                                          &expected_output_progression);
+                output_progression.clear();
+                src.Reset();
+                TestAPIBuffered(jparams, dparams, &cinfo, &output_progression);
+                JXL_CHECK(output_progression.size() ==
+                          expected_output_progression.size());
+                for (size_t i = 0; i < output_progression.size(); ++i) {
+                  const TestImage& output = output_progression[i];
+                  const TestImage& expected = expected_output_progression[i];
+                  VerifyOutputImage(expected, output, max_rms);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+// Builds four test configurations, one for every combination of sampling
+// factor {1, 2} (applied to the first component in both directions) and
+// progressive mode {0, 2}, each with its own image size and generated pixels.
+std::vector<TestConfig> GenerateBasicConfigs() {
+  std::vector<TestConfig> configs;
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      configs.emplace_back();
+      TestConfig& cfg = configs.back();
+      cfg.input.xsize = 257 + samp * 37;
+      cfg.input.ysize = 265 + (progr / 2) * 17;
+      cfg.jparams.h_sampling = {samp, 1, 1};
+      cfg.jparams.v_sampling = {samp, 1, 1};
+      cfg.jparams.progressive_mode = progr;
+      // The dimensions have to be set before the pixels are generated.
+      GeneratePixels(&cfg.input);
+    }
+  }
+  return configs;
+}
+
+// Encodes all basic configurations with one compress object into a single
+// memory destination, then decodes them one after the other with one
+// decompress object reading from a single memory source set up on that buffer.
+TEST(DecodeAPITest, ReuseCinfoSameMemSource) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      // The destination is set up once and shared by all encodes below.
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      // The source is set up once; every decode continues where the previous
+      // one stopped.
+      jpegli_mem_src(&cinfo, buffer, buffer_size);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        TestAPINonBuffered(all_configs[i].jparams, DecompressParams(),
+                           all_configs[i].input, &cinfo, &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  // Each decoded image is compared with the image it was encoded from.
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i], 2.35f);
+  }
+  if (buffer) free(buffer);
+}
+
+// Same scenario as the memory-source variant of this test, but the streams
+// are written to and read back from one temporary file through the stdio
+// destination and source managers.
+TEST(DecodeAPITest, ReuseCinfoSameStdSource) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  FILE* tmpf = tmpfile();
+  JXL_CHECK(tmpf);
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_stdio_dest(&cinfo, tmpf);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  // Go back to the start of the file before reading the streams back.
+  rewind(tmpf);
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_stdio_src(&cinfo, tmpf);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        TestAPINonBuffered(all_configs[i].jparams, DecompressParams(),
+                           all_configs[i].input, &cinfo, &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i], 2.35f);
+  }
+  fclose(tmpf);
+}
+
+// Writes a tables-only stream and a separate image stream for a 1x1 RGB image
+// with one compress object, then reads both streams with one decompress
+// object and checks the decoded header fields and the single black pixel.
+TEST(DecodeAPITest, AbbreviatedStreams) {
+  uint8_t* table_stream = nullptr;
+  unsigned long table_stream_size = 0;
+  uint8_t* data_stream = nullptr;
+  unsigned long data_stream_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      // First stream: tables only.
+      jpegli_mem_dest(&cinfo, &table_stream, &table_stream_size);
+      cinfo.input_components = 3;
+      cinfo.in_color_space = JCS_RGB;
+      jpegli_set_defaults(&cinfo);
+      jpegli_write_tables(&cinfo);
+      // Second stream: the image data, started with FALSE as the second
+      // argument of jpegli_start_compress.
+      jpegli_mem_dest(&cinfo, &data_stream, &data_stream_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.optimize_coding = FALSE;
+      jpegli_set_progressive_level(&cinfo, 0);
+      jpegli_start_compress(&cinfo, FALSE);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    // The image stream is expected to stay below 50 bytes.
+    EXPECT_LT(data_stream_size, 50);
+    jpegli_destroy_compress(&cinfo);
+  }
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      // Read the tables-only stream first (an image is not required), then
+      // the image stream with the same decompress object.
+      jpegli_mem_src(&cinfo, table_stream, table_stream_size);
+      jpegli_read_header(&cinfo, FALSE);
+      jpegli_mem_src(&cinfo, data_stream, data_stream_size);
+      jpegli_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      EXPECT_EQ(3, cinfo.num_components);
+      jpegli_start_decompress(&cinfo);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_read_scanlines(&cinfo, row, 1);
+      EXPECT_EQ(0, image[0]);
+      EXPECT_EQ(0, image[1]);
+      EXPECT_EQ(0, image[2]);
+      jpegli_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  if (table_stream) free(table_stream);
+  if (data_stream) free(data_stream);
+}
+
+// Fixture for the non-buffered decoder tests, parameterized by TestConfig.
+class DecodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+// Decodes the configured stream with DecodeWithLibjpeg (output1) and with
+// jpegli in non-buffered mode (output0), then either compares both against
+// the original input or directly against each other. Configurations with
+// dparams.skip_scans are not run here.
+TEST_P(DecodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  if (dparams.skip_scans) return;
+  const std::vector<uint8_t> compressed = GetTestJpegData(config);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    TestAPINonBuffered(config.jparams, dparams, output1, &cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  if (config.compare_to_orig) {
+    // jpegli may be at most max_tolerance_factor times further away from the
+    // original than the reference decode.
+    double rms0 = DistanceRms(config.input, output0);
+    double rms1 = DistanceRms(config.input, output1);
+    printf("rms: %f  vs  %f\n", rms0, rms1);
+    EXPECT_LE(rms0, rms1 * config.max_tolerance_factor);
+  } else {
+    VerifyOutputImage(output0, output1, config.max_rms_dist, config.max_diff);
+  }
+}
+
+// Fixture for the buffered image mode decoder tests, parameterized by
+// TestConfig.
+class DecodeAPITestParamBuffered : public ::testing::TestWithParam<TestConfig> {
+};
+
+// Decodes every scan of the configured stream with DecodeAllScansWithLibjpeg
+// and with jpegli in buffered image mode, requires the same number of
+// rendered scans from both, and compares them scan by scan.
+TEST_P(DecodeAPITestParamBuffered, TestAPI) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  const std::vector<uint8_t> compressed = GetTestJpegData(config);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size);
+
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+
+  std::vector<TestImage> output_progression0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    TestAPIBuffered(config.jparams, dparams, &cinfo, &output_progression0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  ASSERT_EQ(output_progression0.size(), output_progression1.size());
+  for (size_t i = 0; i < output_progression0.size(); ++i) {
+    // `output` is the jpegli result, `expected` the reference decode.
+    const TestImage& output = output_progression0[i];
+    const TestImage& expected = output_progression1[i];
+    if (config.compare_to_orig) {
+      double rms0 = DistanceRms(config.input, output);
+      double rms1 = DistanceRms(config.input, expected);
+      printf("rms: %f  vs  %f\n", rms0, rms1);
+      EXPECT_LE(rms0, rms1 * config.max_tolerance_factor);
+    } else {
+      VerifyOutputImage(expected, output, config.max_rms_dist, config.max_diff);
+    }
+  }
+}
+
+std::vector<TestConfig> GenerateTests(bool buffered) {
+  std::vector<TestConfig> all_tests;
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
+        {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+        {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+    });
+    for (size_t i = 0; i < (buffered ? 1u : testfiles.size()); ++i) {
+      TestConfig config;
+      config.fn = testfiles[i].first;
+      config.fn_desc = testfiles[i].second;
+      for (size_t chunk_size : {0, 1, 64, 65536}) {
+        config.dparams.chunk_size = chunk_size;
+        for (size_t max_output_lines : {0, 1, 8, 16}) {
+          config.dparams.max_output_lines = max_output_lines;
+          config.dparams.output_mode = PIXELS;
+          all_tests.push_back(config);
+        }
+        {
+          config.dparams.max_output_lines = 16;
+          config.dparams.output_mode = RAW_DATA;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower_small.q85_444_non_interleaved.jpg",
+         "Q85YUV444NonInterleaved"},
+        {"jxl/flower/flower_small.q85_420_non_interleaved.jpg",
+         "Q85YUV420NonInterleaved"},
+        {"jxl/flower/flower_small.q85_444_partially_interleaved.jpg",
+         "Q85YUV444PartiallyInterleaved"},
+        {"jxl/flower/flower_small.q85_420_partially_interleaved.jpg",
+         "Q85YUV420PartiallyInterleaved"},
+        {"jxl/flower/flower.png.im_q85_422.jpg", "Q85YUV422"},
+        {"jxl/flower/flower.png.im_q85_440.jpg", "Q85YUV440"},
+        {"jxl/flower/flower.png.im_q85_444_1x2.jpg", "Q85YUV444_1x2"},
+        {"jxl/flower/flower.png.im_q85_asymmetric.jpg", "Q85Asymmetric"},
+        {"jxl/flower/flower.png.im_q85_gray.jpg", "Q85Gray"},
+        {"jxl/flower/flower.png.im_q85_luma_subsample.jpg", "Q85LumaSubsample"},
+        {"jxl/flower/flower.png.im_q85_rgb.jpg", "Q85RGB"},
+        {"jxl/flower/flower.png.im_q85_rgb_subsample_blue.jpg",
+         "Q85RGBSubsampleBlue"},
+        {"jxl/flower/flower_small.cmyk.jpg", "CMYK"},
+    });
+    for (size_t i = 0; i < (buffered ? 4u : testfiles.size()); ++i) {
+      for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+        TestConfig config;
+        config.fn = testfiles[i].first;
+        config.fn_desc = testfiles[i].second;
+        config.dparams.output_mode = output_mode;
+        all_tests.push_back(config);
+      }
+    }
+  }
+
+  // Tests for common chroma subsampling and output modes.
+  for (JpegIOMode output_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+    for (int h_samp : {1, 2}) {
+      for (int v_samp : {1, 2}) {
+        for (bool fancy : {true, false}) {
+          if (!fancy && (output_mode != PIXELS || h_samp * v_samp == 1)) {
+            continue;
+          }
+          TestConfig config;
+          config.dparams.output_mode = output_mode;
+          config.dparams.do_fancy_upsampling = fancy;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h_samp, 1, 1};
+          config.jparams.v_sampling = {v_samp, 1, 1};
+          if (output_mode == COEFFICIENTS) {
+            config.max_rms_dist = 0.0f;
+          }
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+
+  // Tests for partial input.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+    for (int progr : {0, 1, 3}) {
+      for (int samp : {1, 2}) {
+        for (bool skip_scans : {false, true}) {
+          if (skip_scans && (progr != 1 || size_factor < 0.5f)) continue;
+          for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+            TestConfig config;
+            config.input.xsize = 517;
+            config.input.ysize = 523;
+            config.jparams.h_sampling = {samp, 1, 1};
+            config.jparams.v_sampling = {samp, 1, 1};
+            config.jparams.progressive_mode = progr;
+            config.dparams.size_factor = size_factor;
+            config.dparams.output_mode = output_mode;
+            config.dparams.skip_scans = skip_scans;
+            // The last partially available block can behave differently.
+            // TODO(szabadka) Figure out if we can make the behaviour more
+            // similar.
+            config.max_rms_dist = samp == 1 ? 1.75f : 3.0f;
+            config.max_diff = 255.0f;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+
+  // Tests for block smoothing.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f, 1.0f}) {
+    for (int samp : {1, 2}) {
+      for (bool skip_scans : {false, true}) {
+        if (skip_scans && size_factor < 0.3f) continue;
+        TestConfig config;
+        config.input.xsize = 517;
+        config.input.ysize = 523;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = 2;
+        config.dparams.size_factor = size_factor;
+        config.dparams.do_block_smoothing = true;
+        config.dparams.skip_scans = skip_scans;
+        // libjpeg does smoothing for incomplete scans differently at
+        // the border between current and previous scans.
+        config.max_rms_dist = 8.0f;
+        config.max_diff = 255.0f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+
+  // Test for switching output color quantization modes between scans.
+  if (buffered) {
+    TestConfig config;
+    config.jparams.progressive_mode = 2;
+    config.dparams.quantize_colors = true;
+    config.dparams.scan_params = {
+        {3, JDITHER_NONE, CQUANT_1PASS},  {4, JDITHER_ORDERED, CQUANT_1PASS},
+        {5, JDITHER_FS, CQUANT_1PASS},    {6, JDITHER_NONE, CQUANT_EXTERNAL},
+        {8, JDITHER_NONE, CQUANT_REUSE},  {9, JDITHER_NONE, CQUANT_EXTERNAL},
+        {10, JDITHER_NONE, CQUANT_2PASS}, {11, JDITHER_NONE, CQUANT_REUSE},
+        {12, JDITHER_NONE, CQUANT_2PASS}, {13, JDITHER_FS, CQUANT_2PASS},
+    };
+    config.compare_to_orig = true;
+    config.max_tolerance_factor = 1.04f;
+    all_tests.push_back(config);
+  }
+
+  if (buffered) {
+    return all_tests;
+  }
+
+  // Tests for output color quantization.
+  for (int num_colors : {8, 64, 256}) {
+    for (ColorQuantMode mode : {CQUANT_1PASS, CQUANT_EXTERNAL, CQUANT_2PASS}) {
+      if (mode == CQUANT_EXTERNAL && num_colors != 256) continue;
+      for (J_DITHER_MODE dither : {JDITHER_NONE, JDITHER_ORDERED, JDITHER_FS}) {
+        if (mode == CQUANT_EXTERNAL && dither != JDITHER_NONE) continue;
+        if (mode != CQUANT_1PASS && dither == JDITHER_ORDERED) continue;
+        for (bool crop : {false, true}) {
+          for (bool scale : {false, true}) {
+            for (bool samp : {false, true}) {
+              if ((num_colors != 256) && (crop || scale || samp)) {
+                continue;
+              }
+              if (mode == CQUANT_2PASS && crop) continue;
+              TestConfig config;
+              config.input.xsize = 1024;
+              config.input.ysize = 768;
+              config.dparams.quantize_colors = true;
+              config.dparams.desired_number_of_colors = num_colors;
+              config.dparams.scan_params = {{kLastScan, dither, mode}};
+              config.dparams.crop_output = crop;
+              if (scale) {
+                config.dparams.scale_num = 7;
+                config.dparams.scale_denom = 8;
+              }
+              if (samp) {
+                config.jparams.h_sampling = {2, 1, 1};
+                config.jparams.v_sampling = {2, 1, 1};
+              }
+              if (!scale && !crop) {
+                config.compare_to_orig = true;
+                if (dither != JDITHER_NONE) {
+                  config.max_tolerance_factor = 1.05f;
+                }
+                if (mode == CQUANT_2PASS &&
+                    (num_colors == 8 || dither == JDITHER_FS)) {
+                  // TODO(szabadka) Lower this bound.
+                  config.max_tolerance_factor = 1.5f;
+                }
+              } else {
+                // We only test for buffer overflows, etc.
+                config.max_rms_dist = 100.0f;
+                config.max_diff = 255.0f;
+              }
+              all_tests.push_back(config);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Tests for output formats.
+  for (JpegliDataType type :
+       {JPEGLI_TYPE_UINT8, JPEGLI_TYPE_UINT16, JPEGLI_TYPE_FLOAT}) {
+    for (JpegliEndianness endianness :
+         {JPEGLI_NATIVE_ENDIAN, JPEGLI_LITTLE_ENDIAN, JPEGLI_BIG_ENDIAN}) {
+      if (type == JPEGLI_TYPE_UINT8 && endianness != JPEGLI_NATIVE_ENDIAN) {
+        continue;
+      }
+      for (int channels = 1; channels <= 4; ++channels) {
+        TestConfig config;
+        config.dparams.data_type = type;
+        config.dparams.endianness = endianness;
+        config.input.color_space = JCS_UNKNOWN;
+        config.input.components = channels;
+        config.dparams.set_out_color_space = true;
+        config.dparams.out_color_space = JCS_UNKNOWN;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  // Test for output cropping.
+  {
+    TestConfig config;
+    config.dparams.crop_output = true;
+    all_tests.push_back(config);
+  }
+  // Tests for color transforms.
+  for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_GRAYSCALE}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.input.color_space = JCS_GRAYSCALE;
+    config.dparams.set_out_color_space = true;
+    config.dparams.out_color_space = out_color_space;
+    all_tests.push_back(config);
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) {
+    for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+      if (jpeg_color_space == JCS_RGB && out_color_space == JCS_YCbCr) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.dparams.set_out_color_space = true;
+      config.dparams.out_color_space = out_color_space;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (J_COLOR_SPACE out_color_space : {JCS_CMYK, JCS_YCCK}) {
+      if (jpeg_color_space == JCS_CMYK && out_color_space == JCS_YCCK) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = JCS_CMYK;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.dparams.set_out_color_space = true;
+      config.dparams.out_color_space = out_color_space;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for progressive levels.
+  for (int p = 0; p < 3 + kNumTestScripts; ++p) {
+    TestConfig config;
+    config.jparams.progressive_mode = p;
+    all_tests.push_back(config);
+  }
+  // Tests for RST markers.
+  for (size_t r : {1, 17, 1024}) {
+    for (size_t chunk_size : {1, 65536}) {
+      for (int progr : {0, 2}) {
+        TestConfig config;
+        config.dparams.chunk_size = chunk_size;
+        config.jparams.progressive_mode = progr;
+        config.jparams.restart_interval = r;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (size_t rr : {1, 3, 8, 100}) {
+    TestConfig config;
+    config.jparams.restart_in_rows = rr;
+    all_tests.push_back(config);
+  }
+  // Tests for custom quantization tables.
+  for (int type : {0, 1, 10, 100, 10000}) {
+    for (int scale : {1, 50, 100, 200, 500}) {
+      for (bool add_raw : {false, true}) {
+        for (bool baseline : {true, false}) {
+          if (!baseline && (add_raw || type * scale < 25500)) continue;
+          TestConfig config;
+          config.input.xsize = 64;
+          config.input.ysize = 64;
+          CustomQuantTable table;
+          table.table_type = type;
+          table.scale_factor = scale;
+          table.force_baseline = baseline;
+          table.add_raw = add_raw;
+          table.Generate();
+          config.jparams.quant_tables.push_back(table);
+          config.jparams.quant_indexes = {0, 0, 0};
+          config.compare_to_orig = true;
+          config.max_tolerance_factor = 1.02;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    if (qidx == 3) continue;
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                    (qidx >> 0) & 1};
+    all_tests.push_back(config);
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (int slot_idx = 0; slot_idx < 2; ++slot_idx) {
+      if (qidx == 0 && slot_idx == 0) continue;
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      CustomQuantTable table;
+      table.slot_idx = slot_idx;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+      all_tests.push_back(config);
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (bool xyb : {false, true}) {
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.xyb_mode = xyb;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      {
+        CustomQuantTable table;
+        table.slot_idx = 0;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      {
+        CustomQuantTable table;
+        table.slot_idx = 1;
+        table.table_type = 20;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      config.compare_to_orig = true;
+      all_tests.push_back(config);
+    }
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.xyb_mode = xyb;
+    config.jparams.quant_indexes = {0, 1, 2};
+    {
+      CustomQuantTable table;
+      table.slot_idx = 0;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 1;
+      table.table_type = 20;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 2;
+      table.table_type = 30;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    config.compare_to_orig = true;
+    all_tests.push_back(config);
+  }
+  // Tests for fixed (and custom) prefix codes.
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) {
+    for (bool flat_dc_luma : {false, true}) {
+      TestConfig config;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.jparams.progressive_mode = 0;
+      config.jparams.optimize_coding = 0;
+      config.jparams.use_flat_dc_luma_code = flat_dc_luma;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (bool flat_dc_luma : {false, true}) {
+      TestConfig config;
+      config.input.color_space = JCS_CMYK;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.jparams.progressive_mode = 0;
+      config.jparams.optimize_coding = 0;
+      config.jparams.use_flat_dc_luma_code = flat_dc_luma;
+      all_tests.push_back(config);
+    }
+  }
+  // Test for jpeg without DHT marker.
+  {
+    TestConfig config;
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.jparams.omit_standard_tables = true;
+    all_tests.push_back(config);
+  }
+  // Test for custom component ids.
+  {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 128;
+    config.jparams.comp_ids = {7, 17, 177};
+    all_tests.push_back(config);
+  }
+  // Tests for JFIF/Adobe markers.
+  for (int override_JFIF : {-1, 0, 1}) {
+    for (int override_Adobe : {-1, 0, 1}) {
+      if (override_JFIF == -1 && override_Adobe == -1) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 128;
+      config.jparams.override_JFIF = override_JFIF;
+      config.jparams.override_Adobe = override_Adobe;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for small images.
+  for (int xsize : {1, 7, 8, 9, 15, 16, 17}) {
+    for (int ysize : {1, 7, 8, 9, 15, 16, 17}) {
+      TestConfig config;
+      config.input.xsize = xsize;
+      config.input.ysize = ysize;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for custom marker processor.
+  for (size_t chunk_size : {0, 1, 64, 65536}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.dparams.chunk_size = chunk_size;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  // Tests for icc profile decoding.
+  for (size_t icc_size : {728, 70000, 1000000}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.jparams.icc.resize(icc_size);
+    for (size_t i = 0; i < icc_size; ++i) {
+      config.jparams.icc[i] = (i * 17) & 0xff;
+    }
+    all_tests.push_back(config);
+  }
+  // Tests for unusual sampling factors.
+  for (int h0_samp : {1, 2, 3, 4}) {
+    for (int v0_samp : {1, 2, 3, 4}) {
+      for (int dxb = 0; dxb < h0_samp; ++dxb) {
+        for (int dyb = 0; dyb < v0_samp; ++dyb) {
+          for (int dx = 0; dx < 2; ++dx) {
+            for (int dy = 0; dy < 2; ++dy) {
+              TestConfig config;
+              config.input.xsize = 128 + dyb * 8 + dy;
+              config.input.ysize = 256 + dxb * 8 + dx;
+              config.jparams.progressive_mode = 2;
+              config.jparams.h_sampling = {h0_samp, 1, 1};
+              config.jparams.v_sampling = {v0_samp, 1, 1};
+              config.compare_to_orig = true;
+              all_tests.push_back(config);
+            }
+          }
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 4}) {
+    for (int v0_samp : {1, 2, 4}) {
+      for (int h2_samp : {1, 2, 4}) {
+        for (int v2_samp : {1, 2, 4}) {
+          TestConfig config;
+          config.input.xsize = 137;
+          config.input.ysize = 75;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.compare_to_orig = true;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 3}) {
+    for (int v0_samp : {1, 3}) {
+      for (int h2_samp : {1, 3}) {
+        for (int v2_samp : {1, 3}) {
+          TestConfig config;
+          config.input.xsize = 205;
+          config.input.ysize = 99;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests for output scaling.
+  for (int scale_num = 1; scale_num <= 16; ++scale_num) {
+    if (scale_num == 8) continue;
+    for (bool crop : {false, true}) {
+      for (int samp : {1, 2}) {
+        for (int progr : {0, 2}) {
+          TestConfig config;
+          config.jparams.h_sampling = {samp, 1, 1};
+          config.jparams.v_sampling = {samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          config.dparams.scale_num = scale_num;
+          config.dparams.scale_denom = 8;
+          config.dparams.crop_output = crop;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+// Returns the text that represents |mode| in a generated test name:
+//   CQUANT_1PASS    -> "1pass"
+//   CQUANT_EXTERNAL -> "External"
+//   CQUANT_2PASS    -> "2pass"
+//   CQUANT_REUSE    -> "Reuse"
+// Any other value matches none of the comparisons and yields "".
+std::string QuantMode(ColorQuantMode mode) {
+  if (mode == CQUANT_1PASS) return "1pass";
+  if (mode == CQUANT_EXTERNAL) return "External";
+  if (mode == CQUANT_2PASS) return "2pass";
+  if (mode == CQUANT_REUSE) return "Reuse";
+  return "";
+}
+
+// Returns the text that represents |mode| in a generated test name:
+//   JDITHER_NONE    -> "No"
+//   JDITHER_ORDERED -> "Ordered"
+//   JDITHER_FS      -> "FS"
+// Any other value matches none of the comparisons and yields "".
+std::string DitherMode(J_DITHER_MODE mode) {
+  if (mode == JDITHER_NONE) return "No";
+  if (mode == JDITHER_ORDERED) return "Ordered";
+  if (mode == JDITHER_FS) return "FS";
+  return "";
+}
+
+// Streams the test-name fragment for |dparams| into |os| and returns |os|.
+// The input chunking, the output line limit and the IOMethodName() text are
+// always written; each remaining piece is written only when the condition
+// guarding it below holds. Nothing separates the pieces, so the result is
+// one run of text.
+std::ostream& operator<<(std::ostream& os, const DecompressParams& dparams) {
+  if (dparams.chunk_size != 0) {
+    os << "InputChunks" << dparams.chunk_size;
+  } else {
+    os << "CompleteInput";
+  }
+  if (dparams.size_factor < 1.0f) {
+    // The factor is printed as an integer percentage, truncated toward zero.
+    const int percent = static_cast<int>(dparams.size_factor * 100);
+    os << "Partial" << percent << "p";
+  }
+  if (dparams.max_output_lines != 0) {
+    os << "OutputLines" << dparams.max_output_lines;
+  } else {
+    os << "CompleteOutput";
+  }
+  // Only RAW_DATA and COEFFICIENTS are named; other output modes add nothing.
+  if (dparams.output_mode == RAW_DATA) os << "RawDataOut";
+  if (dparams.output_mode == COEFFICIENTS) os << "CoeffsOut";
+  os << IOMethodName(dparams.data_type, dparams.endianness);
+  if (dparams.set_out_color_space) {
+    os << "OutColor" << ColorSpaceName(dparams.out_color_space);
+  }
+  if (dparams.crop_output) os << "Crop";
+  if (dparams.do_block_smoothing) os << "BlockSmoothing";
+  if (!dparams.do_fancy_upsampling) os << "NoFancyUpsampling";
+  // A scale of exactly 1/1 is left out of the name.
+  if (!(dparams.scale_num == 1 && dparams.scale_denom == 1)) {
+    os << "Scale" << dparams.scale_num << "_" << dparams.scale_denom;
+  }
+  if (dparams.quantize_colors) {
+    os << "Quant" << dparams.desired_number_of_colors << "colors";
+    bool first_scan = true;
+    for (const auto& sparam : dparams.scan_params) {
+      // The per-entry descriptions are joined with '_'.
+      if (!first_scan) os << "_";
+      first_scan = false;
+      os << QuantMode(sparam.color_quant_mode)
+         << DitherMode(sparam.dither_mode) << "Dither";
+    }
+  }
+  if (dparams.skip_scans) os << "SkipScans";
+  return os;
+}
+
+// Streams the full test name for |c|: c.fn_desc when a file name is set,
+// otherwise c.input, followed directly by c.jparams and c.dparams.
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  if (c.fn.empty()) {
+    os << c.input;
+  } else {
+    os << c.fn_desc;
+  }
+  return os << c.jparams << c.dparams;
+}
+
+std::string TestDescription(const testing::TestParamInfo<TestConfig>& info) {
+  std::ostringstream description;  // filled by operator<<(TestConfig)
+  description << info.param;
+  return description.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(DecodeAPITest, DecodeAPITestParam,
+                                testing::ValuesIn(GenerateTests(false)),
+                                TestDescription);  // supplies each test's name
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(DecodeAPITestBuffered,
+                                DecodeAPITestParamBuffered,
+                                testing::ValuesIn(GenerateTests(true)),  // NOTE(review): presumably the `buffered` flag, which makes the generator return a shorter list - confirm against its signature
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_internal.h b/third_party/jpeg-xl/lib/jpegli/decode_internal.h
new file mode 100644
index 0000000000..1c4f248d40
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_internal.h
@@ -0,0 +1,150 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_INTERNAL_H_
+#define LIB_JPEGLI_DECODE_INTERNAL_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/huffman.h"
+
+namespace jpegli {
+
+static constexpr int kNeedMoreInput = 100;  // NOTE(review): 100-103 look like internal status codes; their users are not visible in this header
+static constexpr int kHandleRestart = 101;
+static constexpr int kHandleMarkerProcessor = 102;
+static constexpr int kProcessNextMarker = 103;
+static constexpr size_t kAllHuffLutSize = NUM_HUFF_TBLS * kJpegHuffmanLutSize;  // element count of dc_huff_lut_ / ac_huff_lut_ below
+
+typedef int16_t coeff_t;  // element type of the coefficient and DC-predictor arrays below
+
+// State of the decoder that has to be saved before decoding one MCU in case
+// we run out of the bitstream.
+struct MCUCodingState {
+  coeff_t last_dc_coeff[kMaxComponents];  // one entry per component; presumably a copy of jpeg_decomp_master::last_dc_coeff_ - confirm
+  int eobrun;  // presumably a copy of jpeg_decomp_master::eobrun_ - confirm
+  coeff_t coeffs[D_MAX_BLOCKS_IN_MCU * DCTSIZE2];  // room for DCTSIZE2 coefficients for each of up to D_MAX_BLOCKS_IN_MCU blocks
+};
+
+}  // namespace jpegli
+
+// Use this forward-declared libjpeg struct to hold all our private variables.
+// TODO(szabadka) Remove variables that have a corresponding version in cinfo.
+struct jpeg_decomp_master {
+  //
+  // Input handling state.
+  //
+  std::vector<uint8_t> input_buffer_;
+  size_t input_buffer_pos_;
+  // Number of bits after codestream_pos_ that were already processed.
+  size_t codestream_bits_ahead_;  // NOTE(review): no codestream_pos_ member is declared in this struct; the name above may be stale
+  bool streaming_mode_;
+
+  // Coefficient buffers
+  jvirt_barray_ptr* coef_arrays;
+  JBLOCKARRAY coeff_rows[jpegli::kMaxComponents];
+
+  //
+  // Marker data processing state.
+  //
+  bool found_soi_;
+  bool found_dri_;
+  bool found_sof_;
+  bool found_eoi_;
+  size_t icc_index_;
+  size_t icc_total_;
+  std::vector<uint8_t> icc_profile_;
+  jpegli::HuffmanTableEntry dc_huff_lut_[jpegli::kAllHuffLutSize];
+  jpegli::HuffmanTableEntry ac_huff_lut_[jpegli::kAllHuffLutSize];
+  uint8_t markers_to_save_[32];
+  jpeg_marker_parser_method app_marker_parsers[16];  // presumably one slot per APPn marker (n = 0..15) - confirm
+  jpeg_marker_parser_method com_marker_parser;
+  // Whether this jpeg has multiple scans (progressive or non-interleaved
+  // sequential).
+  bool is_multiscan_;
+
+  // Fields defined by SOF marker.
+  size_t iMCU_cols_;
+  int h_factor[jpegli::kMaxComponents];
+  int v_factor[jpegli::kMaxComponents];
+
+  // Initialized at start of frame.
+  uint16_t scan_progression_[jpegli::kMaxComponents][DCTSIZE2];
+
+  //
+  // Per scan state.
+  //
+  size_t scan_mcu_row_;
+  size_t scan_mcu_col_;
+  size_t mcu_rows_per_iMCU_row_;
+  jpegli::coeff_t last_dc_coeff_[jpegli::kMaxComponents];
+  int eobrun_;
+  int restarts_to_go_;
+  int next_restart_marker_;
+
+  jpegli::MCUCodingState mcu_;
+
+  //
+  // Rendering state.
+  //
+  int output_passes_done_;
+  JpegliDataType output_data_type_ = JPEGLI_TYPE_UINT8;
+  bool swap_endianness_ = false;  // this and output_data_type_ are the only members with a default initializer
+  size_t xoffset_;
+
+  int min_scaled_dct_size;
+  int scaled_dct_size[jpegli::kMaxComponents];
+
+  size_t raw_height_[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> raw_output_[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> render_output_[jpegli::kMaxComponents];
+
+  void (*inverse_transform[jpegli::kMaxComponents])(
+      const int16_t* JXL_RESTRICT qblock, const float* JXL_RESTRICT dequant,
+      const float* JXL_RESTRICT biases, float* JXL_RESTRICT scratch_space,
+      float* JXL_RESTRICT output, size_t output_stride, size_t dctsize);
+
+  void (*color_transform)(float* row[jpegli::kMaxComponents], size_t len);
+
+  float* idct_scratch_;
+  float* upsample_scratch_;
+  uint8_t* output_scratch_;
+  int16_t* smoothing_scratch_;
+  float* dequant_;
+  // 1 = 1pass, 2 = 2pass, 3 = external
+  int quant_mode_;
+  int quant_pass_;
+  int num_colors_[jpegli::kMaxComponents];
+  uint8_t* colormap_lut_;
+  uint8_t* pixels_;
+  JSAMPARRAY scanlines_;
+  std::vector<std::vector<uint8_t>> candidate_lists_;
+  bool regenerate_inverse_colormap_;
+  float* dither_[jpegli::kMaxComponents];
+  float* error_row_[2 * jpegli::kMaxComponents];
+  size_t dither_size_;
+  size_t dither_mask_;
+
+  // Per channel and per frequency statistics about the number of nonzeros and
+  // the sum of coefficient absolute values, used in dequantization bias
+  // computation.
+  int* nonzeros_;
+  int* sumabs_;
+  size_t num_processed_blocks_[jpegli::kMaxComponents];
+  float* biases_;
+#define SAVED_COEFS 10
+  // This holds the coef_bits of the scan before the current scan,
+  // i.e. the bottom half when rendering incomplete scans.
+  int (*coef_bits_latch)[SAVED_COEFS];
+  int (*prev_coef_bits_latch)[SAVED_COEFS];
+  bool apply_smoothing;
+};
+
+#endif  // LIB_JPEGLI_DECODE_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_marker.cc b/third_party/jpeg-xl/lib/jpegli/decode_marker.cc
new file mode 100644
index 0000000000..c5c5790cdf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_marker.cc
@@ -0,0 +1,588 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode_marker.h"
+
+#include <string.h>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/printf_macros.h"
+
+namespace jpegli {
+namespace {
+
+constexpr int kMaxDimPixels = 65535;  // upper bound for SOF width/height
+constexpr uint8_t kIccProfileTag[12] = "ICC_PROFILE";  // 11 chars + NUL
+
+// Error-check macros: on failure each one returns JPEGLI_ERROR(...) from the
+// enclosing function. JPEG_VERIFY_LEN needs `n` more bytes before `len`.
+#define JPEG_VERIFY_LEN(n)                                      \
+  if (pos + (n) > len) {                                        \
+    return JPEGLI_ERROR("Unexpected end of marker: pos=%" PRIuS \
+                        " need=%d len=%" PRIuS,                 \
+                        pos, static_cast<int>(n), len);         \
+  }
+// JPEG_VERIFY_INPUT requires low <= var <= high (both bounds inclusive).
+#define JPEG_VERIFY_INPUT(var, low, high)                               \
+  if ((var) < (low) || (var) > (high)) {                                \
+    return JPEGLI_ERROR("Invalid " #var ": %d", static_cast<int>(var)); \
+  }
+// JPEG_VERIFY_MARKER_END requires the caller's `pos` to equal `len` exactly.
+#define JPEG_VERIFY_MARKER_END()                                  \
+  if (pos != len) {                                               \
+    return JPEGLI_ERROR("Invalid marker length: declared=%" PRIuS \
+                        " actual=%" PRIuS,                        \
+                        len, pos);                                \
+  }
+
+inline int ReadUint8(const uint8_t* data, size_t* pos) {
+  return static_cast<int>(data[(*pos)++]);  // read one byte, then advance
+}
+
+inline int ReadUint16(const uint8_t* data, size_t* pos) {
+  const uint8_t* bytes = data + *pos;  // big-endian: high byte comes first
+  *pos += 2;
+  return (bytes[0] << 8) | bytes[1];
+}
+
+void ProcessSOF(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;  // parses a Start Of Frame segment
+  if (!m->found_soi_) {
+    JPEGLI_ERROR("Unexpected SOF marker.");
+  }
+  if (m->found_sof_) {
+    JPEGLI_ERROR("Duplicate SOF marker.");
+  }
+  m->found_sof_ = true;
+  cinfo->progressive_mode = (cinfo->unread_marker == 0xc2);
+  cinfo->arith_code = 0;
+  size_t pos = 2;  // data[0..1] hold the segment length
+  JPEG_VERIFY_LEN(6);  // precision(1) + height(2) + width(2) + count(1)
+  cinfo->data_precision = ReadUint8(data, &pos);
+  cinfo->image_height = ReadUint16(data, &pos);
+  cinfo->image_width = ReadUint16(data, &pos);
+  cinfo->num_components = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->data_precision, kJpegPrecision, kJpegPrecision);
+  JPEG_VERIFY_INPUT(cinfo->image_height, 1, kMaxDimPixels);
+  JPEG_VERIFY_INPUT(cinfo->image_width, 1, kMaxDimPixels);
+  JPEG_VERIFY_INPUT(cinfo->num_components, 1, kMaxComponents);
+  JPEG_VERIFY_LEN(3 * cinfo->num_components);  // id, sampling, quant index
+  cinfo->comp_info = jpegli::Allocate<jpeg_component_info>(
+      cinfo, cinfo->num_components, JPOOL_IMAGE);
+
+  // Read sampling factors and quant table index for each component.
+  uint8_t ids_seen[256] = {0};
+  cinfo->max_h_samp_factor = 1;
+  cinfo->max_v_samp_factor = 1;
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    comp->component_index = i;
+    const int id = ReadUint8(data, &pos);
+    if (ids_seen[id]) {  // (cf. section B.2.2, syntax of Ci)
+      JPEGLI_ERROR("Duplicate ID %d in SOF.", id);
+    }
+    ids_seen[id] = 1;
+    comp->component_id = id;
+    int factor = ReadUint8(data, &pos);
+    int h_samp_factor = factor >> 4;  // high nibble
+    int v_samp_factor = factor & 0xf;  // low nibble
+    JPEG_VERIFY_INPUT(h_samp_factor, 1, MAX_SAMP_FACTOR);
+    JPEG_VERIFY_INPUT(v_samp_factor, 1, MAX_SAMP_FACTOR);
+    comp->h_samp_factor = h_samp_factor;
+    comp->v_samp_factor = v_samp_factor;
+    cinfo->max_h_samp_factor =
+        std::max(cinfo->max_h_samp_factor, h_samp_factor);
+    cinfo->max_v_samp_factor =
+        std::max(cinfo->max_v_samp_factor, v_samp_factor);
+    int quant_tbl_idx = ReadUint8(data, &pos);
+    JPEG_VERIFY_INPUT(quant_tbl_idx, 0, NUM_QUANT_TBLS - 1);
+    comp->quant_tbl_no = quant_tbl_idx;
+    if (cinfo->quant_tbl_ptrs[quant_tbl_idx] == nullptr) {
+      JPEGLI_ERROR("Quantization table with index %u not found", quant_tbl_idx);
+    }
+    comp->quant_table = nullptr;  // will be allocated after SOS marker
+  }
+  JPEG_VERIFY_MARKER_END();
+
+  // Set the input colorspace based on the markers we have seen and set
+  // default output colorspace.
+  if (cinfo->num_components == 1) {
+    cinfo->jpeg_color_space = JCS_GRAYSCALE;
+    cinfo->out_color_space = JCS_GRAYSCALE;
+  } else if (cinfo->num_components == 3) {
+    if (cinfo->saw_JFIF_marker) {
+      cinfo->jpeg_color_space = JCS_YCbCr;
+    } else if (cinfo->saw_Adobe_marker) {
+      cinfo->jpeg_color_space =
+          cinfo->Adobe_transform == 0 ? JCS_RGB : JCS_YCbCr;
+    } else {
+      cinfo->jpeg_color_space = JCS_YCbCr;  // default unless ids spell "RGB"
+      if (cinfo->comp_info[0].component_id == 'R' &&  //
+          cinfo->comp_info[1].component_id == 'G' &&  //
+          cinfo->comp_info[2].component_id == 'B') {
+        cinfo->jpeg_color_space = JCS_RGB;
+      }
+    }
+    cinfo->out_color_space = JCS_RGB;
+  } else if (cinfo->num_components == 4) {
+    if (cinfo->saw_Adobe_marker) {
+      cinfo->jpeg_color_space =
+          cinfo->Adobe_transform == 0 ? JCS_CMYK : JCS_YCCK;
+    } else {
+      cinfo->jpeg_color_space = JCS_CMYK;
+    }
+    cinfo->out_color_space = JCS_CMYK;
+  }
+
+  // We have checked above that none of the sampling factors are 0, so the max
+  // sampling factors can not be 0.
+  cinfo->total_iMCU_rows =
+      DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE);
+  m->iMCU_cols_ =
+      DivCeil(cinfo->image_width, cinfo->max_h_samp_factor * DCTSIZE);
+  // Compute the block dimensions for each component.
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    if (cinfo->max_h_samp_factor % comp->h_samp_factor != 0 ||
+        cinfo->max_v_samp_factor % comp->v_samp_factor != 0) {
+      JPEGLI_ERROR("Non-integral subsampling ratios.");
+    }
+    m->h_factor[i] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[i] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    comp->downsampled_width = DivCeil(cinfo->image_width, m->h_factor[i]);
+    comp->downsampled_height = DivCeil(cinfo->image_height, m->v_factor[i]);
+    comp->width_in_blocks = DivCeil(comp->downsampled_width, DCTSIZE);
+    comp->height_in_blocks = DivCeil(comp->downsampled_height, DCTSIZE);
+  }
+  memset(m->scan_progression_, 0, sizeof(m->scan_progression_));
+}
+
+void ProcessSOS(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;  // parses a Start Of Scan segment
+  if (!m->found_sof_) {
+    JPEGLI_ERROR("Unexpected SOS marker.");
+  }
+  size_t pos = 2;  // data[0..1] hold the segment length
+  JPEG_VERIFY_LEN(1);
+  cinfo->comps_in_scan = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->comps_in_scan, 1, cinfo->num_components);
+  JPEG_VERIFY_INPUT(cinfo->comps_in_scan, 1, MAX_COMPS_IN_SCAN);
+
+  JPEG_VERIFY_LEN(2 * cinfo->comps_in_scan);  // id + table selectors each
+  bool is_interleaved = (cinfo->comps_in_scan > 1);
+  uint8_t ids_seen[256] = {0};
+  cinfo->blocks_in_MCU = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int id = ReadUint8(data, &pos);
+    if (ids_seen[id]) {  // (cf. section B.2.3, regarding CSj)
+      return JPEGLI_ERROR("Duplicate ID %d in SOS.", id);
+    }
+    ids_seen[id] = 1;
+    jpeg_component_info* comp = nullptr;
+    for (int j = 0; j < cinfo->num_components; ++j) {
+      if (cinfo->comp_info[j].component_id == id) {
+        comp = &cinfo->comp_info[j];
+        cinfo->cur_comp_info[i] = comp;
+      }
+    }
+    if (!comp) {
+      return JPEGLI_ERROR("SOS marker: Could not find component with id %d",
+                          id);
+    }
+    int c = ReadUint8(data, &pos);
+    comp->dc_tbl_no = c >> 4;  // high nibble: DC table selector
+    comp->ac_tbl_no = c & 0xf;  // low nibble: AC table selector
+    JPEG_VERIFY_INPUT(comp->dc_tbl_no, 0, 3);
+    JPEG_VERIFY_INPUT(comp->ac_tbl_no, 0, 3);
+    comp->MCU_width = is_interleaved ? comp->h_samp_factor : 1;
+    comp->MCU_height = is_interleaved ? comp->v_samp_factor : 1;
+    comp->MCU_blocks = comp->MCU_width * comp->MCU_height;
+    if (cinfo->blocks_in_MCU + comp->MCU_blocks > D_MAX_BLOCKS_IN_MCU) {
+      JPEGLI_ERROR("Too many blocks in MCU.");
+    }
+    for (int j = 0; j < comp->MCU_blocks; ++j) {
+      cinfo->MCU_membership[cinfo->blocks_in_MCU++] = i;
+    }
+  }
+  JPEG_VERIFY_LEN(3);  // Ss, Se and the packed Ah/Al byte
+  cinfo->Ss = ReadUint8(data, &pos);
+  cinfo->Se = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->Ss, 0, 63);
+  JPEG_VERIFY_INPUT(cinfo->Se, cinfo->Ss, 63);
+  int c = ReadUint8(data, &pos);
+  cinfo->Ah = c >> 4;
+  cinfo->Al = c & 0xf;
+  JPEG_VERIFY_MARKER_END();
+
+  if (cinfo->input_scan_number == 0) {
+    m->is_multiscan_ = (cinfo->comps_in_scan < cinfo->num_components ||
+                        cinfo->progressive_mode);
+  }
+  if (cinfo->Ah != 0 && cinfo->Al != cinfo->Ah - 1) {
+    // section G.1.1.1.2 : Successive approximation control only improves
+    // by one bit at a time.
+    JPEGLI_ERROR("Invalid progressive parameters: Al=%d Ah=%d", cinfo->Al,
+                 cinfo->Ah);
+  }
+  if (!cinfo->progressive_mode) {  // non-progressive: force full-range scan
+    cinfo->Ss = 0;
+    cinfo->Se = 63;
+    cinfo->Ah = 0;
+    cinfo->Al = 0;
+  }
+  const uint16_t scan_bitmask =  // bits of each coefficient this scan sets
+      cinfo->Ah == 0 ? (0xffff << cinfo->Al) : (1u << cinfo->Al);
+  const uint16_t refinement_bitmask = (1 << cinfo->Al) - 1;  // bits below Al
+  if (!cinfo->coef_bits) {
+    cinfo->coef_bits =
+        Allocate<int[DCTSIZE2]>(cinfo, cinfo->num_components * 2, JPOOL_IMAGE);
+    m->coef_bits_latch =
+        Allocate<int[SAVED_COEFS]>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+    m->prev_coef_bits_latch =
+        Allocate<int[SAVED_COEFS]>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      for (int i = 0; i < DCTSIZE2; ++i) {
+        cinfo->coef_bits[c][i] = -1;
+        if (i < SAVED_COEFS) {
+          m->coef_bits_latch[c][i] = -1;
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int comp_idx = cinfo->cur_comp_info[i]->component_index;
+    for (int k = cinfo->Ss; k <= cinfo->Se; ++k) {
+      if (m->scan_progression_[comp_idx][k] & scan_bitmask) {
+        return JPEGLI_ERROR(  // report the mask of the component that failed
+            "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u",
+            comp_idx, k, m->scan_progression_[comp_idx][k], scan_bitmask);
+      }
+      if (m->scan_progression_[comp_idx][k] & refinement_bitmask) {
+        return JPEGLI_ERROR(
+            "Invalid scan order, a more refined scan was already done: "
+            "component=%d k=%d prev_mask=%u cur_mask=%u",
+            comp_idx, k, m->scan_progression_[comp_idx][k], scan_bitmask);
+      }
+      m->scan_progression_[comp_idx][k] |= scan_bitmask;
+    }
+  }
+  if (cinfo->Al > 10) {
+    return JPEGLI_ERROR("Scan parameter Al=%d is not supported.", cinfo->Al);
+  }
+}
+
+// Reads the Define Huffman Table (DHT) marker segment and stores each table's
+// code-length counts and symbol values in cinfo->dc_huff_tbl_ptrs or
+// cinfo->ac_huff_tbl_ptrs, depending on the class bit and index in slot_id.
+void ProcessDHT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  size_t pos = 2;  // data[0..1] hold the segment length
+  if (pos == len) {
+    return JPEGLI_ERROR("DHT marker: no Huffman table found");
+  }
+  while (pos < len) {  // one segment may define several tables
+    JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);
+    // The index of the Huffman code in the current set of Huffman codes. For AC
+    // component Huffman codes, 0x10 is added to the index.
+    int slot_id = ReadUint8(data, &pos);
+    int huffman_index = slot_id;
+    int is_ac_table = (slot_id & 0x10) != 0;
+    JHUFF_TBL** table;
+    if (is_ac_table) {
+      huffman_index -= 0x10;
+      JPEG_VERIFY_INPUT(huffman_index, 0, NUM_HUFF_TBLS - 1);
+      table = &cinfo->ac_huff_tbl_ptrs[huffman_index];
+    } else {
+      JPEG_VERIFY_INPUT(huffman_index, 0, NUM_HUFF_TBLS - 1);
+      table = &cinfo->dc_huff_tbl_ptrs[huffman_index];
+    }
+    if (*table == nullptr) {  // allocate the slot on first use
+      *table = jpegli_alloc_huff_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    int total_count = 0;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      int count = ReadUint8(data, &pos);
+      (*table)->bits[i] = count;  // bits[0] is not written here
+      total_count += count;
+    }
+    if (is_ac_table) {
+      JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize);
+    } else {
+      // Allow symbols up to 15 here, we check later whether any invalid symbols
+      // are actually decoded.
+      // TODO(szabadka) Make sure decoder works (does not crash) with up to
+      // 15-nbits DC symbols and then increase kJpegDCAlphabetSize.
+      JPEG_VERIFY_INPUT(total_count, 0, 16);
+    }
+    JPEG_VERIFY_LEN(total_count);
+    for (int i = 0; i < total_count; ++i) {
+      int value = ReadUint8(data, &pos);
+      if (!is_ac_table) {
+        JPEG_VERIFY_INPUT(value, 0, 15);
+      }
+      (*table)->huffval[i] = value;
+    }
+    for (int i = total_count; i < kJpegHuffmanAlphabetSize; ++i) {
+      (*table)->huffval[i] = 0;  // zero the unused tail of huffval
+    }
+  }
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessDQT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;  // parses a quant-table segment
+  if (m->found_sof_) {
+    JPEGLI_ERROR("Updating quant tables between scans is not supported.");
+  }
+  size_t pos = 2;  // data[0..1] hold the segment length
+  if (pos == len) {
+    return JPEGLI_ERROR("DQT marker: no quantization table found");
+  }
+  while (pos < len) {  // one segment may define several tables
+    JPEG_VERIFY_LEN(1);
+    int quant_table_index = ReadUint8(data, &pos);
+    int precision = quant_table_index >> 4;  // 0: 1-byte values, 1: 2-byte
+    JPEG_VERIFY_INPUT(precision, 0, 1);
+    quant_table_index &= 0xf;  // low nibble selects the table slot
+    JPEG_VERIFY_INPUT(quant_table_index, 0, NUM_QUANT_TBLS - 1);
+    JPEG_VERIFY_LEN((precision + 1) * DCTSIZE2);
+
+    if (cinfo->quant_tbl_ptrs[quant_table_index] == nullptr) {
+      cinfo->quant_tbl_ptrs[quant_table_index] =
+          jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_table_index];
+
+    for (size_t i = 0; i < DCTSIZE2; ++i) {
+      int quant_val =
+          precision ? ReadUint16(data, &pos) : ReadUint8(data, &pos);
+      JPEG_VERIFY_INPUT(quant_val, 1, 65535);  // zero is rejected
+      quant_table->quantval[kJPEGNaturalOrder[i]] = quant_val;
+    }
+  }
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessDNL(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // The DNL segment is accepted, but its contents are deliberately ignored.
+}
+
+void ProcessDRI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;  // parses a restart-interval segment
+  if (m->found_dri_) {
+    return JPEGLI_ERROR("Duplicate DRI marker.");
+  }
+  m->found_dri_ = true;
+  size_t pos = 2;  // data[0..1] hold the segment length
+  JPEG_VERIFY_LEN(2);
+  cinfo->restart_interval = ReadUint16(data, &pos);  // 16-bit big-endian
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessAPP(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  const uint8_t marker = cinfo->unread_marker;
+  const uint8_t* payload = data + 2;  // skip the 2-byte segment length
+  size_t payload_size = len - 2;
+  if (marker == 0xE0) {  // APP0: look for a JFIF header
+    if (payload_size >= 14 && memcmp(payload, "JFIF", 4) == 0) {
+      cinfo->saw_JFIF_marker = TRUE;
+      cinfo->JFIF_major_version = payload[5];
+      cinfo->JFIF_minor_version = payload[6];
+      cinfo->density_unit = payload[7];
+      cinfo->X_density = (payload[8] << 8) + payload[9];
+      cinfo->Y_density = (payload[10] << 8) + payload[11];
+    }
+  } else if (marker == 0xEE) {  // APP14: look for an Adobe header
+    if (payload_size >= 12 && memcmp(payload, "Adobe", 5) == 0) {
+      cinfo->saw_Adobe_marker = TRUE;
+      cinfo->Adobe_transform = payload[11];
+    }
+  } else if (marker == 0xE2) {  // APP2: look for an ICC profile chunk
+    if (payload_size >= sizeof(kIccProfileTag) &&
+        memcmp(payload, kIccProfileTag, sizeof(kIccProfileTag)) == 0) {
+      payload += sizeof(kIccProfileTag);
+      payload_size -= sizeof(kIccProfileTag);
+      if (payload_size < 2) {
+        return JPEGLI_ERROR("ICC chunk is too small.");
+      }
+      uint8_t index = payload[0];
+      uint8_t total = payload[1];
+      ++m->icc_index_;  // chunks must arrive numbered consecutively
+      if (m->icc_index_ != index) {
+        return JPEGLI_ERROR("Invalid ICC chunk order.");
+      }
+      if (total == 0) {
+        return JPEGLI_ERROR("Invalid ICC chunk total.");
+      }
+      if (m->icc_total_ == 0) {  // first chunk fixes the expected total
+        m->icc_total_ = total;
+      } else if (m->icc_total_ != total) {
+        return JPEGLI_ERROR("Invalid ICC chunk total.");
+      }
+      if (m->icc_index_ > m->icc_total_) {
+        return JPEGLI_ERROR("Invalid ICC chunk index.");
+      }
+      m->icc_profile_.insert(m->icc_profile_.end(), payload + 2,
+                             payload + payload_size);
+    }
+  }
+}
+
+void ProcessCOM(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // The COM segment is accepted, but its contents are deliberately ignored.
+}
+
+void ProcessSOI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;  // data and len are not used here
+  if (m->found_soi_) {
+    JPEGLI_ERROR("Duplicate SOI marker");
+  }
+  m->found_soi_ = true;  // other markers are rejected until this is set
+}
+
+void ProcessEOI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  cinfo->master->found_eoi_ = true;  // only records that EOI was reached
+}
+
+void SaveMarker(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // Bytes 0..1 of the segment are its length field; everything after them
+  // is the payload that gets copied into the saved-marker record.
+  const uint8_t code = cinfo->unread_marker;
+  const size_t body_len = len - 2;
+
+  // Allocate the record and make it the new head of the saved-marker list.
+  jpeg_saved_marker_ptr node =
+      jpegli::Allocate<jpeg_marker_struct>(cinfo, 1, JPOOL_IMAGE);
+  node->next = cinfo->marker_list;
+  cinfo->marker_list = node;
+  node->marker = code;
+  node->original_length = body_len;
+  node->data_length = body_len;
+  node->data = jpegli::Allocate<uint8_t>(cinfo, body_len, JPOOL_IMAGE);
+  memcpy(node->data, data + 2, body_len);
+}
+
+uint8_t ProcessNextMarker(j_decompress_ptr cinfo, const uint8_t* const data,
+                          const size_t len, size_t* pos) {
+  jpeg_decomp_master* m = cinfo->master;
+  size_t num_skipped = 0;
+  uint8_t marker = cinfo->unread_marker;
+  if (marker == 0) {  // no marker pending: scan the input for the next one
+    // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker.
+    static const uint8_t kIsValidMarker[] = {
+        1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+    };
+    // Skip bytes between markers.
+    while (*pos + 1 < len && (data[*pos] != 0xff || data[*pos + 1] < 0xc0 ||
+                              !kIsValidMarker[data[*pos + 1] - 0xc0])) {
+      ++(*pos);
+      ++num_skipped;
+    }
+    if (*pos + 2 > len) {  // fewer than two bytes left for 0xff + code
+      return kNeedMoreInput;
+    }
+    marker = data[*pos + 1];
+    if (num_skipped > 0) {
+      if (m->found_soi_) {
+        JPEGLI_WARN("Skipped %d bytes before marker 0x%02x", (int)num_skipped,
+                    marker);
+      } else {
+        JPEGLI_ERROR("Did not find SOI marker.");
+      }
+    }
+    *pos += 2;  // step past 0xff and the marker code
+    cinfo->unread_marker = marker;  // stays set until the segment is consumed
+  }
+  if (!m->found_soi_ && marker != 0xd8) {
+    JPEGLI_ERROR("Did not find SOI marker.");
+  }
+  if (GetMarkerProcessor(cinfo)) {  // a callback exists for this APPn/COM
+    return kHandleMarkerProcessor;
+  }
+  const uint8_t* marker_data = &data[*pos];
+  size_t marker_len = 0;
+  if (marker != 0xd8 && marker != 0xd9) {  // no length is read for SOI/EOI
+    if (*pos + 2 > len) {
+      return kNeedMoreInput;
+    }
+    marker_len += (data[*pos] << 8) + data[*pos + 1];
+    if (marker_len < 2) {  // the length counts its own two bytes
+      JPEGLI_ERROR("Invalid marker length");
+    }
+    if (*pos + marker_len > len) {
+      // TODO(szabadka) Limit our memory usage by using the skip_input_data
+      // source manager callback on APP markers that are not saved.
+      return kNeedMoreInput;
+    }
+    if (marker >= 0xe0 && m->markers_to_save_[marker - 0xe0]) {
+      SaveMarker(cinfo, marker_data, marker_len);
+    }
+  }
+  if (marker == 0xc0 || marker == 0xc1 || marker == 0xc2) {
+    ProcessSOF(cinfo, marker_data, marker_len);
+  } else if (marker == 0xc4) {
+    ProcessDHT(cinfo, marker_data, marker_len);
+  } else if (marker == 0xda) {
+    ProcessSOS(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdb) {
+    ProcessDQT(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdc) {
+    ProcessDNL(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdd) {
+    ProcessDRI(cinfo, marker_data, marker_len);
+  } else if (marker >= 0xe0 && marker <= 0xef) {
+    ProcessAPP(cinfo, marker_data, marker_len);
+  } else if (marker == 0xfe) {
+    ProcessCOM(cinfo, marker_data, marker_len);
+  } else if (marker == 0xd8) {
+    ProcessSOI(cinfo, marker_data, marker_len);
+  } else if (marker == 0xd9) {
+    ProcessEOI(cinfo, marker_data, marker_len);
+  } else {
+    JPEGLI_ERROR("Unexpected marker 0x%x", marker);
+  }
+  *pos += marker_len;  // advance past the whole segment
+  cinfo->unread_marker = 0;  // marker fully handled
+  if (marker == 0xda) {
+    return JPEG_REACHED_SOS;
+  } else if (marker == 0xd9) {
+    return JPEG_REACHED_EOI;
+  }
+  return kProcessNextMarker;
+}
+
+}  // namespace
+
+jpeg_marker_parser_method GetMarkerProcessor(j_decompress_ptr cinfo) {
+  // Returns the parser stored in the master struct for the pending marker:
+  // the COM parser for 0xfe, the APPn parser for 0xe0..0xef, else nullptr.
+  const jpeg_decomp_master* master = cinfo->master;
+  const uint8_t code = cinfo->unread_marker;
+  if (code == 0xfe) {
+    return master->com_marker_parser;
+  }
+  const bool is_app = (code >= 0xe0 && code <= 0xef);
+  return is_app ? master->app_marker_parsers[code - 0xe0] : nullptr;
+}
+
+int ProcessMarkers(j_decompress_ptr cinfo, const uint8_t* const data,
+                   const size_t len, size_t* pos) {
+  // Keep consuming markers until ProcessNextMarker asks us to stop.
+  int result;
+  do {
+    result = ProcessNextMarker(cinfo, data, len, pos);
+  } while (result == kProcessNextMarker);
+  return result;
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_marker.h b/third_party/jpeg-xl/lib/jpegli/decode_marker.h
new file mode 100644
index 0000000000..d52c335341
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_marker.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_MARKER_H_
+#define LIB_JPEGLI_DECODE_MARKER_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+// Reads the available input in the source manager's input buffer until either
+// the end of the next SOS marker or the end of the input.
+// The corresponding fields of cinfo are updated with the processed input data.
+// Upon return, the input buffer will be at the start or at the end of a marker
+// data segment (inter-marker data is allowed).
+// Return value is one of:
+//   * JPEG_SUSPENDED, if the current input buffer ends before the next SOS or
+//       EOI marker. Input buffer refill is handled by the caller;
+//   * JPEG_REACHED_SOS, if the next SOS marker is found;
+//   * JPEG_REACHED_EOI, if the EOI marker is found.
+int ProcessMarkers(j_decompress_ptr cinfo, const uint8_t* const data,
+                   const size_t len, size_t* pos);
+// Returns the parser stored for cinfo->unread_marker (APPn/COM), else nullptr.
+jpeg_marker_parser_method GetMarkerProcessor(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DECODE_MARKER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_scan.cc b/third_party/jpeg-xl/lib/jpegli/decode_scan.cc
new file mode 100644
index 0000000000..29c0172950
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_scan.cc
@@ -0,0 +1,566 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode_scan.h"
+
+#include <string.h>
+
+#include <hwy/base.h>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+// Max 14 blocks per MCU (when 1 channel is subsampled)
+// Max 64 nonzero coefficients per block
+// Max 16 symbol bits plus 11 extra bits per nonzero symbol
+// Max 2 bytes per 8 bits (worst case is all bytes are escaped 0xff)
+constexpr int kMaxMCUByteSize = 6048;  // 14 * 64 * (16 + 11) / 8 * 2
+
+// Helper structure to read bits from the entropy coded data segment.
+struct BitReaderState {
+  BitReaderState(const uint8_t* data, const size_t len, size_t pos)
+      : data_(data), len_(len), start_pos_(pos) {
+    Reset(pos);
+  }
+
+  void Reset(size_t pos) {  // restart reading at byte offset pos
+    pos_ = pos;
+    val_ = 0;
+    bits_left_ = 0;
+    next_marker_pos_ = len_;  // no marker seen yet
+    FillBitWindow();
+  }
+
+  // Returns the next byte and skips the 0xff/0x00 escape sequences.
+  uint8_t GetNextByte() {
+    if (pos_ >= next_marker_pos_) {  // at/after a marker: supply zero bytes
+      ++pos_;
+      return 0;
+    }
+    uint8_t c = data_[pos_++];
+    if (c == 0xff) {
+      uint8_t escape = pos_ < len_ ? data_[pos_] : 0;
+      if (escape == 0) {
+        ++pos_;
+      } else {
+        // 0xff was followed by a non-zero byte, which means that we found the
+        // start of the next marker segment.
+        next_marker_pos_ = pos_ - 1;
+      }
+    }
+    return c;
+  }
+
+  void FillBitWindow() {
+    if (bits_left_ <= 16) {  // refill only once the window runs low
+      while (bits_left_ <= 56) {  // append bytes while 8 more bits still fit
+        val_ <<= 8;
+        val_ |= (uint64_t)GetNextByte();
+        bits_left_ += 8;
+      }
+    }
+  }
+
+  int ReadBits(int nbits) {  // consumes and returns the next nbits bits
+    FillBitWindow();
+    uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1);
+    bits_left_ -= nbits;
+    return val;
+  }
+
+  // Sets *pos to the next stream position, and *bit_pos to the bit position
+  // within the next byte where parsing should continue.
+  // Returns false if the stream ended too early.
+  bool FinishStream(size_t* pos, size_t* bit_pos) {
+    *bit_pos = (8 - (bits_left_ & 7)) & 7;
+    // Give back some bytes that we did not use.
+    int unused_bytes_left = DivCeil(bits_left_, 8);
+    while (unused_bytes_left-- > 0) {
+      --pos_;
+      // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape
+      // sequence, and if yes, we need to give back one more byte.
+      if (((pos_ == len_ && pos_ == next_marker_pos_) ||
+           (pos_ > 0 && pos_ < next_marker_pos_ && data_[pos_] == 0)) &&
+          (data_[pos_ - 1] == 0xff)) {
+        --pos_;
+      }
+    }
+    if (pos_ >= next_marker_pos_) {
+      *pos = next_marker_pos_;
+      if (pos_ > next_marker_pos_ || *bit_pos > 0) {
+        // Data ran out before the scan was complete.
+        return false;
+      }
+    }
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* data_;  // input bytes
+  const size_t len_;  // number of bytes in data_
+  size_t pos_;  // index of the next byte to fetch
+  uint64_t val_;  // bit window; newest bytes enter at the low end
+  int bits_left_;  // count of not-yet-consumed bits in val_
+  size_t next_marker_pos_;  // offset of the 0xff starting a marker, or len_
+  size_t start_pos_;  // position given to the constructor
+};
+
+// Returns the next Huffman-coded symbol.
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) {
+  int nbits;
+  br->FillBitWindow();
+  int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff;  // peek next 8 bits
+  table += val;
+  nbits = table->bits - 8;
+  if (nbits > 0) {  // entry says more than 8 bits: do a second lookup
+    br->bits_left_ -= 8;
+    table += table->value;  // here value is an offset to the sub-table
+    val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1);
+    table += val;
+  }
+  br->bits_left_ -= table->bits;  // consume the bits of the matched entry
+  return table->value;
+}
+
+/**
+ * Returns the DC diff or AC value for extra bits value x and prefix code s.
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.1 – Difference magnitude categories for DC coding
+ *  SSSS | DIFF values
+ * ------+--------------------------
+ *     0 | 0
+ *     1 | –1, 1
+ *     2 | –3, –2, 2, 3
+ *     3 | –7..–4, 4..7
+ * ......|..........................
+ *    11 | –2047..–1024, 1024..2047
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.2 – Categories assigned to coefficient values
+ * [ Same as Table F.1, but does not include SSSS equal to 0 and 11]
+ *
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * F.1.2.1.1 Structure of DC code table
+ * For each category,... additional bits... appended... to uniquely identify
+ * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF
+ * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are
+ * appended... Most significant bit... is 0 for negative differences and 1 for
+ * positive differences.
+ *
+ * In other words the upper half of extra bits range represents DIFF as is.
+ * The lower half represents the negative DIFFs with an offset.
+ */
+int HuffExtend(int x, int s) {
+  JXL_DASSERT(s >= 1);
+  const int range = 1 << s;  // number of distinct s-bit patterns
+  if (x < (range >> 1)) {
+    // Lower half of the range encodes negative values, offset by 2^s - 1.
+    return x - range + 1;
+  }
+  JXL_DASSERT(x < range);
+  return x;
+}
+
+// Decodes one 8x8 block of DCT coefficients from the bit stream.
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
+                    const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, BitReaderState* br, coeff_t* last_dc_coeff,
+                    coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;  // multi-block EOB runs need Ss > 0
+  if (Ss == 0) {  // this scan includes the DC coefficient
+    int s = ReadSymbol(dc_huff, br);
+    if (s >= kJpegDCAlphabetSize) {
+      return false;
+    }
+    int diff = 0;
+    if (s > 0) {
+      int bits = br->ReadBits(s);
+      diff = HuffExtend(bits, s);
+    }
+    int coeff = diff + *last_dc_coeff;  // DC is coded as a difference
+    const int dc_coeff = coeff * Am;
+    coeffs[0] = dc_coeff;
+    // TODO(eustas): is there a more elegant / explicit way to check this?
+    if (dc_coeff != coeffs[0]) {  // value did not survive storing as coeff_t
+      return false;
+    }
+    *last_dc_coeff = coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  if (*eobrun > 0) {  // inside an EOB run: this block has no new AC data
+    --(*eobrun);
+    return true;
+  }
+  for (int k = Ss; k <= Se; k++) {
+    int sr = ReadSymbol(ac_huff, br);
+    if (sr >= kJpegHuffmanAlphabetSize) {
+      return false;
+    }
+    int r = sr >> 4;  // high nibble
+    int s = sr & 15;  // low nibble: number of extra bits
+    if (s > 0) {
+      k += r;  // skip r coefficients before the one being set
+      if (k > Se) {
+        return false;
+      }
+      if (s + Al >= kJpegDCAlphabetSize) {
+        return false;
+      }
+      int bits = br->ReadBits(s);
+      int coeff = HuffExtend(bits, s);
+      coeffs[kJPEGNaturalOrder[k]] = coeff * Am;
+    } else if (r == 15) {
+      k += 15;  // together with the loop's k++ this skips 16 positions
+    } else {
+      *eobrun = 1 << r;
+      if (r > 0) {
+        if (!eobrun_allowed) {
+          return false;
+        }
+        *eobrun += br->ReadBits(r);
+      }
+      break;
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, BitReaderState* br, coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;  // multi-block EOB runs need Ss > 0
+  if (Ss == 0) {  // DC refinement: one extra bit, OR-ed in at position Al
+    int s = br->ReadBits(1);
+    coeff_t dc_coeff = coeffs[0];
+    dc_coeff |= s * Am;
+    coeffs[0] = dc_coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int p1 = Am;  // +1 at the bit position being refined
+  int m1 = -Am;  // -1 at the bit position being refined
+  int k = Ss;
+  int r;
+  int s;
+  bool in_zero_run = false;
+  if (*eobrun <= 0) {
+    for (; k <= Se; k++) {
+      s = ReadSymbol(ac_huff, br);
+      if (s >= kJpegHuffmanAlphabetSize) {
+        return false;
+      }
+      r = s >> 4;
+      s &= 15;
+      if (s) {
+        if (s != 1) {  // only size 1 is accepted in a refinement scan
+          return false;
+        }
+        s = br->ReadBits(1) ? p1 : m1;  // sign bit picks +Am or -Am
+        in_zero_run = false;
+      } else {
+        if (r != 15) {
+          *eobrun = 1 << r;
+          if (r > 0) {
+            if (!eobrun_allowed) {
+              return false;
+            }
+            *eobrun += br->ReadBits(r);
+          }
+          break;
+        }
+        in_zero_run = true;
+      }
+      do {
+        coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+        if (thiscoef != 0) {  // already nonzero: read one correction bit
+          if (br->ReadBits(1)) {
+            if ((thiscoef & p1) == 0) {
+              if (thiscoef >= 0) {
+                thiscoef += p1;
+              } else {
+                thiscoef += m1;
+              }
+            }
+          }
+          coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+        } else {
+          if (--r < 0) {  // zero run used up: stop at this position
+            break;
+          }
+        }
+        k++;
+      } while (k <= Se);
+      if (s) {
+        if (k > Se) {
+          return false;
+        }
+        coeffs[kJPEGNaturalOrder[k]] = s;
+      }
+    }
+  }
+  if (in_zero_run) {  // block ended on a 16-zero run symbol
+    return false;
+  }
+  if (*eobrun > 0) {  // only correction bits remain for this block
+    for (; k <= Se; k++) {
+      coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+      if (thiscoef != 0) {
+        if (br->ReadBits(1)) {
+          if ((thiscoef & p1) == 0) {
+            if (thiscoef >= 0) {
+              thiscoef += p1;
+            } else {
+              thiscoef += m1;
+            }
+          }
+        }
+        coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+      }
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+void SaveMCUCodingState(j_decompress_ptr cinfo) {
+  // Copy the entropy decoder state and the current MCU's blocks into m->mcu_.
+  jpeg_decomp_master* m = cinfo->master;
+  m->mcu_.eobrun = m->eobrun_;
+  memcpy(m->mcu_.last_dc_coeff, m->last_dc_coeff_, sizeof(m->last_dc_coeff_));
+  size_t saved = 0;  // Coefficients copied into m->mcu_.coeffs so far.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    const int c = comp->component_index;
+    const size_t bx0 = m->scan_mcu_col_ * comp->MCU_width;
+    // Blocks per row of this MCU, clipped at the component's right edge.
+    const size_t nblocks =
+        std::min<size_t>(comp->MCU_width, comp->width_in_blocks - bx0);
+    const size_t ncoeffs = nblocks * DCTSIZE2;
+    for (int iy = 0; iy < comp->MCU_height; ++iy) {
+      const size_t by = m->scan_mcu_row_ * comp->MCU_height + iy;
+      if (by >= comp->height_in_blocks) continue;  // Row past the bottom edge.
+      const size_t biy = by % comp->v_samp_factor;
+      const coeff_t* src = &m->coeff_rows[c][biy][bx0][0];
+      memcpy(&m->mcu_.coeffs[saved], src, ncoeffs * sizeof(src[0]));
+      saved += ncoeffs;
+    }
+  }
+}
+
+void RestoreMCUCodingState(j_decompress_ptr cinfo) {
+  // Copy the entropy decoder state and the current MCU's blocks from m->mcu_.
+  jpeg_decomp_master* m = cinfo->master;
+  m->eobrun_ = m->mcu_.eobrun;
+  memcpy(m->last_dc_coeff_, m->mcu_.last_dc_coeff, sizeof(m->last_dc_coeff_));
+  size_t restored = 0;  // Coefficients copied out of m->mcu_.coeffs so far.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    const int c = comp->component_index;
+    const size_t bx0 = m->scan_mcu_col_ * comp->MCU_width;
+    // Blocks per row of this MCU, clipped at the component's right edge.
+    const size_t nblocks =
+        std::min<size_t>(comp->MCU_width, comp->width_in_blocks - bx0);
+    const size_t ncoeffs = nblocks * DCTSIZE2;
+    for (int iy = 0; iy < comp->MCU_height; ++iy) {
+      const size_t by = m->scan_mcu_row_ * comp->MCU_height + iy;
+      if (by >= comp->height_in_blocks) continue;  // Row past the bottom edge.
+      const size_t biy = by % comp->v_samp_factor;
+      coeff_t* dst = &m->coeff_rows[c][biy][bx0][0];
+      memcpy(dst, &m->mcu_.coeffs[restored], ncoeffs * sizeof(dst[0]));
+      restored += ncoeffs;
+    }
+  }
+}
+
+bool FinishScan(j_decompress_ptr cinfo, const uint8_t* data, const size_t len,
+                size_t* pos, size_t* bit_pos) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->eobrun_ > 0) {
+    JPEGLI_ERROR("End-of-block run too long.");
+  }
+  memset(m->last_dc_coeff_, 0, sizeof(m->last_dc_coeff_));
+  m->eobrun_ = -1;
+  if (*bit_pos != 0) {
+    // Drop the partially consumed byte so that *pos is byte-aligned again.
+    size_t consumed = 1;
+    if (data[*pos] == 0xff) {
+      // A 0xff byte is followed by a zero byte here; skip both. The last
+      // br.FinishStream checked that at least 2 bytes are in the buffer.
+      JXL_DASSERT(*pos + 1 < len);
+      // br.FinishStream would have detected an early marker.
+      JXL_DASSERT(data[*pos + 1] == 0);
+      consumed = 2;
+    }
+    *pos += consumed;
+    *bit_pos = 0;
+  }
+  return true;
+}
+
+}  // namespace
+
+void PrepareForiMCURow(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  j_common_ptr common = reinterpret_cast<j_common_ptr>(cinfo);
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    const int c = comp->component_index;
+    const int first_row = cinfo->input_iMCU_row * comp->v_samp_factor;
+    const int rows_left = comp->height_in_blocks - first_row;  // To the bottom.
+    const int num_rows = std::min(comp->v_samp_factor, rows_left);
+    const int start_row = m->streaming_mode_ ? 0 : first_row;  // 0 if streaming.
+    m->coeff_rows[c] = (*cinfo->mem->access_virt_barray)(
+        common, m->coef_arrays[c], start_row, num_rows, true);
+  }
+}
+
+int ProcessScan(j_decompress_ptr cinfo, const uint8_t* const data,
+                const size_t len, size_t* pos, size_t* bit_pos) {
+  if (len == 0) {
+    return kNeedMoreInput;
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  for (;;) {  // One MCU per iteration, until the iMCU row or the scan ends.
+    // Handle the restart intervals.
+    if (cinfo->restart_interval > 0 && m->restarts_to_go_ == 0) {
+      if (!FinishScan(cinfo, data, len, pos, bit_pos)) {
+        return kNeedMoreInput;
+      }
+      // Go to the next marker, warn if we had to skip any data.
+      size_t num_skipped = 0;
+      while (*pos + 1 < len && (data[*pos] != 0xff || data[*pos + 1] == 0 ||
+                                data[*pos + 1] == 0xff)) {
+        ++(*pos);
+        ++num_skipped;
+      }
+      if (num_skipped > 0) {
+        JPEGLI_WARN("Skipped %d bytes before restart marker", (int)num_skipped);
+      }
+      if (*pos + 2 > len) {
+        return kNeedMoreInput;
+      }
+      cinfo->unread_marker = data[*pos + 1];  // Byte following the 0xff.
+      *pos += 2;
+      return kHandleRestart;
+    }
+
+    size_t start_pos = *pos;
+    BitReaderState br(data, len, start_pos);
+    if (*bit_pos > 0) {
+      br.ReadBits(*bit_pos);  // Skip the bits already consumed in this byte.
+    }
+    if (start_pos + kMaxMCUByteSize > len) {  // Input may run out: snapshot.
+      SaveMCUCodingState(cinfo);
+    }
+
+    // Decode one MCU.
+    HWY_ALIGN_MAX coeff_t dummy_block[DCTSIZE2];
+    bool scan_ok = true;
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+      int c = comp->component_index;
+      const HuffmanTableEntry* dc_lut =
+          &m->dc_huff_lut_[comp->dc_tbl_no * kJpegHuffmanLutSize];
+      const HuffmanTableEntry* ac_lut =
+          &m->ac_huff_lut_[comp->ac_tbl_no * kJpegHuffmanLutSize];
+      for (int iy = 0; iy < comp->MCU_height; ++iy) {
+        size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+        int biy = block_y % comp->v_samp_factor;
+        for (int ix = 0; ix < comp->MCU_width; ++ix) {
+          size_t block_x = m->scan_mcu_col_ * comp->MCU_width + ix;
+          coeff_t* coeffs;
+          if (block_x >= comp->width_in_blocks ||
+              block_y >= comp->height_in_blocks) {
+            // Note that it is OK that dummy_block is uninitialized because
+            // it will never be used in any branches, even in the RefineDCTBlock
+            // case, because only DC scans can be interleaved and we don't use
+            // the zero-ness of the DC coeff in the DC refinement code-path.
+            coeffs = dummy_block;
+          } else {
+            coeffs = &m->coeff_rows[c][biy][block_x][0];
+          }
+          if (cinfo->Ah == 0) {  // First pass over this band.
+            if (!DecodeDCTBlock(dc_lut, ac_lut, cinfo->Ss, cinfo->Se, cinfo->Al,
+                                &m->eobrun_, &br,
+                                &m->last_dc_coeff_[comp->component_index],
+                                coeffs)) {
+              scan_ok = false;  // Reported after the stream checks below.
+            }
+          } else {  // Ah != 0: refinement pass over existing coefficients.
+            if (!RefineDCTBlock(ac_lut, cinfo->Ss, cinfo->Se, cinfo->Al,
+                                &m->eobrun_, &br, coeffs)) {
+              scan_ok = false;
+            }
+          }
+        }
+      }
+    }
+    size_t new_pos;
+    size_t new_bit_pos;
+    bool stream_ok = br.FinishStream(&new_pos, &new_bit_pos);
+    if (new_pos + 2 > len) {
+      // If reading stopped within the last two bytes, we have to request more
+      // input even if FinishStream() returned true, since the Huffman code
+      // reader could have peeked ahead some bits past the current input chunk
+      // and thus the last prefix code length could have been wrong. We can do
+      // this because a valid JPEG bit stream has two extra bytes at the end.
+      RestoreMCUCodingState(cinfo);
+      return kNeedMoreInput;  // *pos and *bit_pos are left unchanged.
+    }
+    *pos = new_pos;
+    *bit_pos = new_bit_pos;
+    if (!stream_ok) {
+      // We hit a marker during parsing.
+      JXL_DASSERT(data[*pos] == 0xff);
+      JXL_DASSERT(data[*pos + 1] != 0);
+      RestoreMCUCodingState(cinfo);  // NOTE(review): snapshot may be stale.
+      JPEGLI_WARN("Incomplete scan detected.");
+      return JPEG_SCAN_COMPLETED;
+    }
+    if (!scan_ok) {
+      JPEGLI_ERROR("Failed to decode DCT block");
+    }
+    if (m->restarts_to_go_ > 0) {
+      --m->restarts_to_go_;
+    }
+    ++m->scan_mcu_col_;
+    if (m->scan_mcu_col_ == cinfo->MCUs_per_row) {
+      ++m->scan_mcu_row_;
+      m->scan_mcu_col_ = 0;
+      if (m->scan_mcu_row_ == cinfo->MCU_rows_in_scan) {  // Last MCU of scan.
+        if (!FinishScan(cinfo, data, len, pos, bit_pos)) {
+          return kNeedMoreInput;
+        }
+        break;
+      } else if ((m->scan_mcu_row_ % m->mcu_rows_per_iMCU_row_) == 0) {
+        // Current iMCU row is done.
+        break;
+      }
+    }
+  }
+  ++cinfo->input_iMCU_row;
+  if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows) {
+    PrepareForiMCURow(cinfo);  // Fetch the coefficient rows of the next row.
+    return JPEG_ROW_COMPLETED;
+  }
+  return JPEG_SCAN_COMPLETED;
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_scan.h b/third_party/jpeg-xl/lib/jpegli/decode_scan.h
new file mode 100644
index 0000000000..61d05c67d6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_scan.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_SCAN_H_
+#define LIB_JPEGLI_DECODE_SCAN_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+// Reads the available input in the source manager's input buffer until the end
+// of the next iMCU row.
+// The corresponding fields of cinfo are updated with the processed input data.
+// Upon return, the input buffer will be at the start of an MCU, or at the end
+// of the scan.
+// Return value is one of:
+//   * JPEG_SUSPENDED, if the input buffer ends before the end of an iMCU row;
+//   * JPEG_ROW_COMPLETED, if the next iMCU row (but not the scan) is reached;
+//   * JPEG_SCAN_COMPLETED, if the end of the scan is reached.
+int ProcessScan(j_decompress_ptr cinfo, const uint8_t* const data,
+                const size_t len, size_t* pos, size_t* bit_pos);
+
+void PrepareForiMCURow(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DECODE_SCAN_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/destination_manager.cc b/third_party/jpeg-xl/lib/jpegli/destination_manager.cc
new file mode 100644
index 0000000000..9bc269f0c9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/destination_manager.cc
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+constexpr size_t kDestBufferSize = 64 << 10;  // 64 KiB output buffer size.
+
+struct StdioDestinationManager {
+  jpeg_destination_mgr pub;  // First member: cinfo->dest is cast to this type.
+  FILE* f;                   // Stream that receives the output bytes.
+  uint8_t* buffer;           // Staging buffer, written out kDestBufferSize at a time.
+
+  static void init_destination(j_compress_ptr cinfo) {
+    auto* self = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    self->pub.free_in_buffer = kDestBufferSize;
+    self->pub.next_output_byte = self->buffer;
+  }
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto* self = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    if (fwrite(self->buffer, 1, kDestBufferSize, self->f) != kDestBufferSize) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+    self->pub.free_in_buffer = kDestBufferSize;  // The whole buffer was written.
+    self->pub.next_output_byte = self->buffer;
+    return TRUE;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    auto* self = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    const size_t pending = kDestBufferSize - self->pub.free_in_buffer;
+    // Write out what is still staged; nothing is written for an empty buffer.
+    if (pending != 0 && fwrite(self->buffer, 1, pending, self->f) != pending) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+    fflush(self->f);
+    if (ferror(self->f) != 0) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+  }
+};
+
+struct MemoryDestinationManager {
+  jpeg_destination_mgr pub;
+  // Application-owned variables that receive the buffer pointer and its size.
+  uint8_t** output;
+  unsigned long* output_size;
+  // Output buffer allocated by us.
+  uint8_t* temp_buffer;
+  // Current output buffer (either application supplied or allocated by us).
+  uint8_t* current_buffer;
+  size_t buffer_size;
+
+  static void init_destination(j_compress_ptr cinfo) {}
+
+  // Moves the output into a newly malloc'ed buffer of twice the current size.
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<MemoryDestinationManager*>(cinfo->dest);
+    auto next_buffer = static_cast<uint8_t*>(malloc(dest->buffer_size * 2));
+    if (next_buffer == nullptr) {  // Report instead of copying into nullptr.
+      JPEGLI_ERROR("Failed to allocate output buffer.");
+    }
+    memcpy(next_buffer, dest->current_buffer, dest->buffer_size);
+    free(dest->temp_buffer);  // No-op for nullptr; app buffers are not freed.
+    dest->temp_buffer = dest->current_buffer = next_buffer;
+    *dest->output = next_buffer;
+    *dest->output_size = dest->buffer_size;
+    dest->pub.next_output_byte = next_buffer + dest->buffer_size;
+    dest->pub.free_in_buffer = dest->buffer_size;  // The newly added half.
+    dest->buffer_size *= 2;
+    return TRUE;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<MemoryDestinationManager*>(cinfo->dest);
+    *dest->output_size = dest->buffer_size - dest->pub.free_in_buffer;
+  }
+};
+
+}  // namespace jpegli
+
+void jpegli_stdio_dest(j_compress_ptr cinfo, FILE* outfile) {
+  if (outfile == nullptr) {
+    JPEGLI_ERROR("jpegli_stdio_dest: Invalid destination.");
+  }
+  using Mgr = jpegli::StdioDestinationManager;
+  if (cinfo->dest && cinfo->dest->init_destination != Mgr::init_destination) {
+    JPEGLI_ERROR("jpegli_stdio_dest: a different dest manager was already set");
+  }
+  if (cinfo->dest == nullptr) {
+    cinfo->dest = reinterpret_cast<jpeg_destination_mgr*>(
+        jpegli::Allocate<Mgr>(cinfo, 1));
+  }
+  auto* mgr = reinterpret_cast<Mgr*>(cinfo->dest);
+  // Bind the stream and give the manager an empty staging buffer. Note that a
+  // buffer is allocated on every call, even when the manager is reused.
+  mgr->f = outfile;
+  mgr->buffer = jpegli::Allocate<uint8_t>(cinfo, jpegli::kDestBufferSize);
+  mgr->pub.next_output_byte = mgr->buffer;
+  mgr->pub.free_in_buffer = jpegli::kDestBufferSize;
+  // Install the stdio callbacks.
+  mgr->pub.init_destination = Mgr::init_destination;
+  mgr->pub.empty_output_buffer = Mgr::empty_output_buffer;
+  mgr->pub.term_destination = Mgr::term_destination;
+}
+
+void jpegli_mem_dest(j_compress_ptr cinfo, unsigned char** outbuffer,
+                     unsigned long* outsize) {
+  if (outbuffer == nullptr || outsize == nullptr) {
+    JPEGLI_ERROR("jpegli_mem_dest: Invalid destination.");
+  }
+  if (cinfo->dest && cinfo->dest->init_destination !=
+                         jpegli::MemoryDestinationManager::init_destination) {
+    JPEGLI_ERROR("jpegli_mem_dest: a different dest manager was already set");
+  }
+  if (!cinfo->dest) {  // First use on this cinfo: create the manager.
+    auto dest = jpegli::Allocate<jpegli::MemoryDestinationManager>(cinfo, 1);
+    dest->temp_buffer = nullptr;
+    cinfo->dest = reinterpret_cast<jpeg_destination_mgr*>(dest);
+  }
+  auto dest = reinterpret_cast<jpegli::MemoryDestinationManager*>(cinfo->dest);
+  using Mgr = jpegli::MemoryDestinationManager;
+  dest->pub.init_destination = Mgr::init_destination;
+  dest->pub.empty_output_buffer = Mgr::empty_output_buffer;
+  dest->pub.term_destination = Mgr::term_destination;
+  dest->output = outbuffer;
+  dest->output_size = outsize;
+  if (*outbuffer == nullptr || *outsize == 0) {  // No usable caller buffer.
+    dest->temp_buffer = static_cast<uint8_t*>(malloc(jpegli::kDestBufferSize));
+    if (dest->temp_buffer == nullptr) {  // Fail here, not on the first write.
+      JPEGLI_ERROR("jpegli_mem_dest: Failed to allocate output buffer.");
+    }
+    *outbuffer = dest->temp_buffer;
+    *outsize = jpegli::kDestBufferSize;
+  }
+  dest->current_buffer = *outbuffer;
+  dest->buffer_size = *outsize;
+  dest->pub.next_output_byte = dest->current_buffer;
+  dest->pub.free_in_buffer = dest->buffer_size;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/downsample.cc b/third_party/jpeg-xl/lib/jpegli/downsample.cc
new file mode 100644
index 0000000000..df2c156972
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/downsample.cc
@@ -0,0 +1,356 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/downsample.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_CAPPED(float, 8);
+constexpr D d;
+
+void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) {
+  // Each output sample is the mean of two adjacent input samples.
+  const auto half = Set(d, 0.5f);
+  const size_t out_len = len / 2;
+  Vec<D> even, odd;
+  for (size_t ox = 0; ox < out_len; ox += Lanes(d)) {
+    LoadInterleaved2(d, row_in + 2 * ox, even, odd);
+    Store(Mul(half, Add(even, odd)), d, row_out + ox);
+  }
+}
+
+void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) {
+  // Each output sample is the mean of three adjacent input samples.
+  const auto third = Set(d, 1.0f / 3);
+  const size_t out_len = len / 3;
+  Vec<D> s0, s1, s2;
+  for (size_t ox = 0; ox < out_len; ox += Lanes(d)) {
+    LoadInterleaved3(d, row_in + 3 * ox, s0, s1, s2);
+    Store(Mul(third, Add(Add(s0, s1), s2)), d, row_out + ox);
+  }
+}
+
+void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) {
+  // Each output sample is the mean of four adjacent input samples.
+  const auto quarter = Set(d, 0.25f);
+  const size_t out_len = len / 4;
+  Vec<D> s0, s1, s2, s3;
+  for (size_t ox = 0; ox < out_len; ox += Lanes(d)) {
+    LoadInterleaved4(d, row_in + 4 * ox, s0, s1, s2, s3);
+    Store(Mul(quarter, Add(Add(s0, s1), Add(s2, s3))), d, row_out + ox);
+  }
+}
+
+void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, row_out);  // Only rows_in[0] is read.
+}
+
+void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, row_out);  // Only rows_in[0] is read.
+}
+
+void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, row_out);  // Only rows_in[0] is read.
+}
+
+void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Each output sample is the mean of two vertically adjacent input samples.
+  const float* top = rows_in[0];
+  const float* bot = rows_in[1];
+  const auto half = Set(d, 0.5f);
+  for (size_t x = 0; x < len; x += Lanes(d)) {
+    Store(Mul(half, Add(Load(d, top + x), Load(d, bot + x))), d, row_out + x);
+  }
+}
+
+void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Each output sample is the mean of a 2x2 square of input samples.
+  const float* top = rows_in[0];
+  const float* bot = rows_in[1];
+  const auto quarter = Set(d, 0.25f);
+  const size_t out_len = len / 2;
+  Vec<D> t0, t1, b0, b1;
+  for (size_t ox = 0; ox < out_len; ox += Lanes(d)) {
+    LoadInterleaved2(d, top + 2 * ox, t0, t1);
+    LoadInterleaved2(d, bot + 2 * ox, b0, b1);
+    Store(Mul(quarter, Add(Add(t0, t1), Add(b0, b1))), d, row_out + ox);
+  }
+}
+
+void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Shrink both input rows horizontally in place, then average them.
+  for (int r = 0; r < 2; ++r) DownsampleRow3x1(rows_in[r], len, rows_in[r]);
+  Downsample1x2(rows_in, len / 3, row_out);
+}
+
+void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Shrink both input rows horizontally in place, then average them.
+  for (int r = 0; r < 2; ++r) DownsampleRow4x1(rows_in[r], len, rows_in[r]);
+  Downsample1x2(rows_in, len / 4, row_out);
+}
+
+void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Each output sample is the mean of three vertically adjacent input samples.
+  // Whole vectors are processed, so len is effectively rounded up to Lanes(d).
+  const auto third = Set(d, 1.0f / 3);
+  const float* top = rows_in[0];
+  const float* mid = rows_in[1];
+  const float* bot = rows_in[2];
+  for (size_t x = 0; x < len; x += Lanes(d)) {
+    const auto upper_sum = Add(Load(d, top + x), Load(d, mid + x));
+    Store(Mul(third, Add(upper_sum, Load(d, bot + x))), d, row_out + x);
+  }
+}
+
+void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  for (int r = 0; r < 3; ++r) {  // Shrink each input row in place first.
+    DownsampleRow2x1(rows_in[r], len, rows_in[r]);
+  }
+  Downsample1x3(rows_in, len / 2, row_out);
+}
+
+void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  for (int r = 0; r < 3; ++r) {  // Shrink each input row in place first.
+    DownsampleRow3x1(rows_in[r], len, rows_in[r]);
+  }
+  Downsample1x3(rows_in, len / 3, row_out);
+}
+
+void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  for (int r = 0; r < 3; ++r) {  // Shrink each input row in place first.
+    DownsampleRow4x1(rows_in[r], len, rows_in[r]);
+  }
+  Downsample1x3(rows_in, len / 4, row_out);
+}
+
+void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Each output sample is the mean of four vertically adjacent input samples.
+  // Whole vectors are processed, so len is effectively rounded up to Lanes(d).
+  const size_t step = Lanes(d);
+  const auto quarter = Set(d, 0.25f);
+  const float* r0 = rows_in[0];
+  const float* r1 = rows_in[1];
+  const float* r2 = rows_in[2];
+  const float* r3 = rows_in[3];
+  for (size_t x = 0; x < len; x += step) {
+    const auto upper = Add(Load(d, r0 + x), Load(d, r1 + x));
+    const auto lower = Add(Load(d, r2 + x), Load(d, r3 + x));
+    Store(Mul(quarter, Add(upper, lower)), d, row_out + x);
+  }
+}
+
+void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Shrink each input row horizontally in place, then average the four rows.
+  for (int r = 0; r < 4; ++r) {
+    DownsampleRow2x1(rows_in[r], len, rows_in[r]);
+  }
+  Downsample1x4(rows_in, len / 2, row_out);
+}
+
+void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Shrink each input row horizontally in place, then average the four rows.
+  for (int r = 0; r < 4; ++r) {
+    DownsampleRow3x1(rows_in[r], len, rows_in[r]);
+  }
+  Downsample1x4(rows_in, len / 3, row_out);
+}
+
+void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  // Shrink each input row horizontally in place, then average the four rows.
+  for (int r = 0; r < 4; ++r) {
+    DownsampleRow4x1(rows_in[r], len, rows_in[r]);
+  }
+  Downsample1x4(rows_in, len / 4, row_out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(Downsample1x2);
+HWY_EXPORT(Downsample1x3);
+HWY_EXPORT(Downsample1x4);
+HWY_EXPORT(Downsample2x1);
+HWY_EXPORT(Downsample2x2);
+HWY_EXPORT(Downsample2x3);
+HWY_EXPORT(Downsample2x4);
+HWY_EXPORT(Downsample3x1);
+HWY_EXPORT(Downsample3x2);
+HWY_EXPORT(Downsample3x3);
+HWY_EXPORT(Downsample3x4);
+HWY_EXPORT(Downsample4x1);
+HWY_EXPORT(Downsample4x2);
+HWY_EXPORT(Downsample4x3);
+HWY_EXPORT(Downsample4x4);
+
+void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                    float* row_out) {}  // Does nothing; chosen for 1x1 ratios.
+
+void ChooseDownsampleMethods(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    m->downsample_method[c] = nullptr;  // Stays null for unsupported ratios.
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    if (v_factor == 1) {  // DownsampleHxV: H = h_factor, V = v_factor.
+      if (h_factor == 1) {
+        m->downsample_method[c] = NullDownsample;
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1);
+      }
+    } else if (v_factor == 2) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
+      }
+    } else if (v_factor == 3) {  // Must use the kernels reading three rows.
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x3);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x3);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x3);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x3);
+      }
+    } else if (v_factor == 4) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4);
+      }
+    }
+    if (m->downsample_method[c] == nullptr) {
+      JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor);
+    }
+  }
+}
+
+void DownsampleInputBuffer(j_compress_ptr cinfo) {
+  // With 1x1 maximum sampling factors no component is downsampled.
+  if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) return;
+  jpeg_comp_master* m = cinfo->master;
+  const size_t rows_per_imcu = DCTSIZE * cinfo->max_v_samp_factor;
+  const size_t row_begin = m->next_iMCU_row * rows_per_imcu;
+  const size_t row_end = row_begin + rows_per_imcu;
+  const size_t padded_width = m->xsize_blocks * DCTSIZE;
+  float* rows_in[MAX_SAMP_FACTOR];
+  for (int c = 0; c < cinfo->num_components; c++) {
+    const jpeg_component_info& comp = cinfo->comp_info[c];
+    const int h_factor = cinfo->max_h_samp_factor / comp.h_samp_factor;
+    const int v_factor = cinfo->max_v_samp_factor / comp.v_samp_factor;
+    const bool full_resolution = (h_factor == 1 && v_factor == 1);
+    if (full_resolution) continue;
+    auto& input = *m->smooth_input[c];
+    auto& output = *m->raw_data[c];
+    // Each group of v_factor input rows produces one output row.
+    size_t yout = row_begin / v_factor;
+    for (size_t yin = row_begin; yin < row_end; yin += v_factor) {
+      for (int iy = 0; iy < v_factor; ++iy) {
+        rows_in[iy] = input.Row(yin + iy);
+      }
+      float* row_out = output.Row(yout);
+      (*m->downsample_method[c])(rows_in, padded_width, row_out);
+      ++yout;
+    }
+  }
+}
+
+void ApplyInputSmoothing(j_compress_ptr cinfo) {
+  if (cinfo->smoothing_factor == 0) {
+    return;  // Smoothing is disabled.
+  }
+  jpeg_comp_master* m = cinfo->master;
+  // 3x3 kernel: each of the eight neighbors gets smoothing_factor / 1024 and
+  // the center gets the rest, so the weights sum to one.
+  const float w_side = cinfo->smoothing_factor / 1024.0;
+  const float w_center = 1.0f - 8.0f * w_side;
+  // Row range of the current iMCU row and the padded row width.
+  const size_t rows_per_imcu = DCTSIZE * cinfo->max_v_samp_factor;
+  const ssize_t y_begin = m->next_iMCU_row * rows_per_imcu;
+  const ssize_t y_end = y_begin + rows_per_imcu;
+  const ssize_t padded_width = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    auto& input = m->input_buffer[c];
+    auto& output = *m->smooth_input[c];
+    // The kernel reads one row above and below the iMCU row; at the image
+    // borders those rows are filled with CopyRow (presumably duplicating the
+    // nearest real row -- confirm against the row buffer implementation).
+    if (m->next_iMCU_row == 0) {
+      input.CopyRow(-1, 0, 1);
+    }
+    if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+      size_t last_row = m->ysize_blocks * DCTSIZE - 1;
+      input.CopyRow(last_row + 1, last_row, 1);
+    }
+    // TODO(szabadka) SIMDify this.
+    for (ssize_t y = y_begin; y < y_end; ++y) {
+      const float* above = input.Row(y - 1);
+      const float* here = input.Row(y);
+      const float* below = input.Row(y + 1);
+      float* smoothed = output.Row(y);
+      for (ssize_t x = 0; x < padded_width; ++x) {
+        // Neighbors are added top row first, left to right; columns x - 1 and
+        // x + 1 are read too, so rows need a sample of padding on each side.
+        const float ring = above[x - 1] + above[x] + above[x + 1] +
+                           here[x - 1] + here[x + 1] +
+                           below[x - 1] + below[x] + below[x + 1];
+        smoothed[x] = here[x] * w_center + ring * w_side;
+      }
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/downsample.h b/third_party/jpeg-xl/lib/jpegli/downsample.h
new file mode 100644
index 0000000000..9d87047758
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/downsample.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DOWNSAMPLE_H_
+#define LIB_JPEGLI_DOWNSAMPLE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ChooseDownsampleMethods(j_compress_ptr cinfo);
+
+void DownsampleInputBuffer(j_compress_ptr cinfo);
+
+void ApplyInputSmoothing(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DOWNSAMPLE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/encode.cc b/third_party/jpeg-xl/lib/jpegli/encode.cc
new file mode 100644
index 0000000000..6015d7d9bb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode.cc
@@ -0,0 +1,1153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode.h"
+
+#include <cmath>
+#include <initializer_list>
+#include <vector>
+
+#include "lib/jpegli/adaptive_quantization.h"
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/bitstream.h"
+#include "lib/jpegli/color_transform.h"
+#include "lib/jpegli/dct.h"
+#include "lib/jpegli/downsample.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/entropy_coding.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jpegli/input.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/quant.h"
+
+namespace jpegli {
+
+constexpr size_t kMaxBytesInMarker = 65533;  // NOTE(review): presumably the 65535 marker segment limit minus the 2-byte length field - confirm.
+
+void CheckState(j_compress_ptr cinfo, int state) {
+  if (cinfo->global_state != state) {
+    JPEGLI_ERROR("Unexpected global state %d [expected %d]",
+                 cinfo->global_state, state);
+  }
+}
+
+void CheckState(j_compress_ptr cinfo, int state1, int state2) {
+  if (cinfo->global_state != state1 && cinfo->global_state != state2) {
+    JPEGLI_ERROR("Unexpected global state %d [expected %d or %d]",
+                 cinfo->global_state, state1, state2);
+  }
+}
+
+// Resets the cinfo fields that do not depend on the input image to their
+// defaults. Shared between jpegli_CreateCompress() and jpegli_set_defaults().
+void InitializeCompressParams(j_compress_ptr cinfo) {
+  // Sample precision and scan script (none set).
+  cinfo->data_precision = 8;
+  cinfo->scan_info = nullptr;
+  cinfo->num_scans = 0;
+  // Optional coding features all start out disabled.
+  cinfo->raw_data_in = cinfo->arith_code = cinfo->optimize_coding = FALSE;
+  cinfo->CCIR601_sampling = cinfo->write_JFIF_header = FALSE;
+  cinfo->smoothing_factor = 0;
+  cinfo->dct_method = JDCT_FLOAT;
+  cinfo->restart_interval = 0;
+  cinfo->restart_in_rows = 0;
+  // JFIF header contents.
+  cinfo->JFIF_major_version = 1;
+  cinfo->JFIF_minor_version = 1;
+  cinfo->density_unit = 0;
+  cinfo->X_density = 1;
+  cinfo->Y_density = 1;
+#if JPEG_LIB_VERSION >= 70
+  // These fields are only set when JPEG_LIB_VERSION >= 70.
+  cinfo->scale_num = 1;
+  cinfo->scale_denom = 1;
+  cinfo->do_fancy_downsampling = FALSE;
+  cinfo->min_DCT_h_scaled_size = cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#endif
+}
+
+// Clamps scale_factor to [0, 5000], converts it to the matching quality value
+// and returns the jpegli distance for that quality.
+float LinearQualityToDistance(int scale_factor) {
+  const int s = std::max(0, std::min(5000, scale_factor));
+  return jpegli_quality_to_distance(s < 100 ? 100 - s / 2 : 5000 / s);
+}
+
+template <typename T>  // Sets sent_table on every non-null table_ptrs entry.
+void SetSentTableFlag(T** table_ptrs, size_t num, boolean val) {
+  for (T** it = table_ptrs; it != table_ptrs + num; ++it) {
+    if (*it != nullptr) (*it)->sent_table = val;
+  }
+}
+
+struct ProgressiveScan {  // One template entry of the default scan script built by SetDefaultScanScript().
+  int Ss, Se, Ah, Al;  // Copied unchanged into the Ss/Se/Ah/Al fields of each generated jpeg_scan_info.
+  bool interleaved;  // true: up to MAX_COMPS_IN_SCAN components share a scan; false: one scan per component.
+};
+
+// Fills cinfo->script_space with the default scan script selected by
+// cinfo->master->progressive_level and makes cinfo->scan_info point to it.
+void SetDefaultScanScript(j_compress_ptr cinfo) {
+  const bool interleave_dc =
+      (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1);
+  std::vector<ProgressiveScan> templates;
+  switch (cinfo->master->progressive_level) {
+    case 0:  // One scan covering the whole spectrum.
+      templates = {{0, 63, 0, 0, true}};
+      break;
+    case 1:  // DC scan, then all AC coefficients in two refinement steps.
+      templates = {{0, 0, 0, 0, interleave_dc}, {1, 63, 0, 1, false},
+                   {1, 63, 1, 0, false}};
+      break;
+    default:  // DC scan, AC 1..2, then AC 3..63 in three refinement steps.
+      templates = {{0, 0, 0, 0, interleave_dc}, {1, 2, 0, 0, false},
+                   {3, 63, 0, 2, false},        {3, 63, 2, 1, false},
+                   {3, 63, 1, 0, false}};
+      break;
+  }
+  // Count the scans first so that the script is allocated in one piece.
+  cinfo->script_space_size = 0;
+  for (const ProgressiveScan& tmpl : templates) {
+    const int group = tmpl.interleaved ? MAX_COMPS_IN_SCAN : 1;
+    cinfo->script_space_size += DivCeil(cinfo->num_components, group);
+  }
+  cinfo->script_space =
+      Allocate<jpeg_scan_info>(cinfo, cinfo->script_space_size);
+  jpeg_scan_info* out = cinfo->script_space;
+  for (const ProgressiveScan& tmpl : templates) {
+    const int group = tmpl.interleaved ? MAX_COMPS_IN_SCAN : 1;
+    for (int c = 0; c < cinfo->num_components; c += group, ++out) {
+      out->Ss = tmpl.Ss;
+      out->Se = tmpl.Se;
+      out->Ah = tmpl.Ah;
+      out->Al = tmpl.Al;
+      out->comps_in_scan = std::min(group, cinfo->num_components - c);
+      for (int j = 0; j < out->comps_in_scan; ++j) {
+        out->component_index[j] = c + j;
+      }
+    }
+  }
+  JXL_ASSERT(out - cinfo->script_space == cinfo->script_space_size);
+  cinfo->scan_info = cinfo->script_space;
+  cinfo->num_scans = cinfo->script_space_size;
+}
+
+// Checks the scan script in cinfo->scan_info against the component setup and
+// raises a jpegli error on the first violation or if coverage is incomplete.
+void ValidateScanScript(j_compress_ptr cinfo) {
+  // Mask of coefficient bits defined by the scan script, for each component
+  // and coefficient index.
+  uint16_t comp_mask[kMaxComponents][DCTSIZE2] = {};
+  static constexpr int kMaxRefinementBit = 10;
+
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info& si = cinfo->scan_info[i];
+    if (si.comps_in_scan < 1 || si.comps_in_scan > MAX_COMPS_IN_SCAN) {
+      JPEGLI_ERROR("Invalid number of components in scan %d", si.comps_in_scan);
+    }
+    int last_ci = -1;  // Component indices must be strictly increasing.
+    for (int j = 0; j < si.comps_in_scan; ++j) {
+      int ci = si.component_index[j];
+      if (ci < 0 || ci >= cinfo->num_components) {
+        JPEGLI_ERROR("Invalid component index %d in scan", ci);
+      } else if (ci == last_ci) {
+        JPEGLI_ERROR("Duplicate component index %d in scan", ci);
+      } else if (ci < last_ci) {
+        JPEGLI_ERROR("Out of order component index %d in scan", ci);
+      }
+      last_ci = ci;
+    }
+    if (si.Ss < 0 || si.Se < si.Ss || si.Se >= DCTSIZE2) {
+      JPEGLI_ERROR("Invalid spectral range %d .. %d in scan", si.Ss, si.Se);
+    }
+    if (si.Ah < 0 || si.Al < 0 || si.Al > kMaxRefinementBit) {
+      JPEGLI_ERROR("Invalid refinement bits %d/%d", si.Ah, si.Al);
+    }
+    if (!cinfo->progressive_mode) {
+      if (si.Ss != 0 || si.Se != DCTSIZE2 - 1 || si.Ah != 0 || si.Al != 0) {
+        JPEGLI_ERROR("Invalid scan for sequential mode");
+      }
+    } else if (si.Ss == 0 && si.Se != 0) {
+      JPEGLI_ERROR("DC and AC together in progressive scan");
+    }
+    if (si.Ss != 0 && si.comps_in_scan != 1) {
+      JPEGLI_ERROR("Interleaved AC only scan.");
+    }
+    for (int j = 0; j < si.comps_in_scan; ++j) {
+      int ci = si.component_index[j];
+      if (si.Ss != 0 && comp_mask[ci][0] == 0) {
+        JPEGLI_ERROR("AC before DC in component %d of scan", ci);
+      }
+      for (int k = si.Ss; k <= si.Se; ++k) {
+        if (comp_mask[ci][k] == 0) {
+          if (si.Ah != 0) {
+            JPEGLI_ERROR("Invalid first scan refinement bit");
+          }
+          comp_mask[ci][k] = ((0xffff << si.Al) & 0xffff);
+        } else {
+          // Al == Ah - 1 is tested first so that Ah is bounded before the shift.
+          if (si.Al != si.Ah - 1 ||
+              comp_mask[ci][k] != ((0xffff << si.Ah) & 0xffff)) {
+            JPEGLI_ERROR("Invalid refinement bit progression.");
+          }
+          comp_mask[ci][k] |= 1 << si.Al;
+        }
+      }
+    }
+    if (si.comps_in_scan > 1) {
+      size_t mcu_size = 0;  // Sum of h_samp_factor * v_samp_factor in this scan.
+      for (int j = 0; j < si.comps_in_scan; ++j) {
+        const auto& comp = cinfo->comp_info[si.component_index[j]];
+        mcu_size += comp.h_samp_factor * comp.v_samp_factor;
+      }
+      if (mcu_size > C_MAX_BLOCKS_IN_MCU) {
+        JPEGLI_ERROR("MCU size too big");
+      }
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      if (comp_mask[c][k] != 0xffff) {
+        JPEGLI_ERROR("Incomplete scan of component %d and frequency %d", c, k);
+      }
+    }
+  }
+}
+
+// Validates the user-supplied compression parameters and derives the
+// per-image geometry (sampling maxima, block counts, per-component sizes)
+// and the scan script. Raises a jpegli error on the first invalid parameter.
+void ProcessCompressionParams(j_compress_ptr cinfo) {
+  if (cinfo->dest == nullptr) JPEGLI_ERROR("Missing destination.");
+  if (cinfo->image_width < 1 || cinfo->image_height < 1 ||
+      cinfo->input_components < 1) {
+    JPEGLI_ERROR("Empty input image.");
+  }
+  if (cinfo->image_width > static_cast<int>(JPEG_MAX_DIMENSION) ||
+      cinfo->image_height > static_cast<int>(JPEG_MAX_DIMENSION) ||
+      cinfo->input_components > static_cast<int>(kMaxComponents)) {
+    JPEGLI_ERROR("Input image too big.");
+  }
+  if (cinfo->num_components < 1 ||
+      cinfo->num_components > static_cast<int>(kMaxComponents)) {
+    JPEGLI_ERROR("Invalid number of components.");
+  }
+  if (cinfo->data_precision != kJpegPrecision) {
+    JPEGLI_ERROR("Invalid data precision");
+  }
+  if (cinfo->arith_code) JPEGLI_ERROR("Arithmetic coding is not implemented.");
+  if (cinfo->CCIR601_sampling) {
+    JPEGLI_ERROR("CCIR601 sampling is not implemented.");
+  }
+  if (cinfo->restart_interval > 65535u) {
+    JPEGLI_ERROR("Restart interval too big");
+  }
+  if (cinfo->smoothing_factor < 0 || cinfo->smoothing_factor > 100) {
+    JPEGLI_ERROR("Invalid smoothing factor %d", cinfo->smoothing_factor);
+  }
+  jpeg_comp_master* m = cinfo->master;
+  cinfo->max_h_samp_factor = cinfo->max_v_samp_factor = 1;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (comp->component_index != c) JPEGLI_ERROR("Invalid component index");
+    for (int j = 0; j < c; ++j) {
+      if (cinfo->comp_info[j].component_id == comp->component_id) {
+        JPEGLI_ERROR("Duplicate component id %d", comp->component_id);
+      }
+    }
+    if (comp->h_samp_factor <= 0 || comp->v_samp_factor <= 0 ||
+        comp->h_samp_factor > MAX_SAMP_FACTOR ||
+        comp->v_samp_factor > MAX_SAMP_FACTOR) {
+      JPEGLI_ERROR("Invalid sampling factor %d x %d", comp->h_samp_factor,
+                   comp->v_samp_factor);
+    }
+    cinfo->max_h_samp_factor =
+        std::max(comp->h_samp_factor, cinfo->max_h_samp_factor);
+    cinfo->max_v_samp_factor =
+        std::max(comp->v_samp_factor, cinfo->max_v_samp_factor);
+  }
+  if (cinfo->num_components == 1 &&
+      (cinfo->max_h_samp_factor != 1 || cinfo->max_v_samp_factor != 1)) {
+    JPEGLI_ERROR("Sampling is not supported for single component image.");
+  }
+  size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width);
+  cinfo->total_iMCU_rows = DivCeil(cinfo->image_height, iMCU_height);
+  m->xsize_blocks = total_iMCU_cols * cinfo->max_h_samp_factor;
+  m->ysize_blocks = cinfo->total_iMCU_rows * cinfo->max_v_samp_factor;
+
+  size_t blocks_per_iMCU = 0;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (cinfo->max_h_samp_factor % comp->h_samp_factor != 0 ||
+        cinfo->max_v_samp_factor % comp->v_samp_factor != 0) {
+      JPEGLI_ERROR("Non-integral sampling ratios are not supported.");
+    }
+    m->h_factor[c] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[c] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    comp->downsampled_width = DivCeil(cinfo->image_width, m->h_factor[c]);
+    comp->downsampled_height = DivCeil(cinfo->image_height, m->v_factor[c]);
+    comp->width_in_blocks = DivCeil(comp->downsampled_width, DCTSIZE);
+    comp->height_in_blocks = DivCeil(comp->downsampled_height, DCTSIZE);
+    blocks_per_iMCU += comp->h_samp_factor * comp->v_samp_factor;
+  }
+  m->blocks_per_iMCU_row = total_iMCU_cols * blocks_per_iMCU;
+  // Disable adaptive quantization for subsampled luma channel.
+  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+  if (y_comp->h_samp_factor != cinfo->max_h_samp_factor ||
+      y_comp->v_samp_factor != cinfo->max_v_samp_factor) {
+    m->use_adaptive_quantization = false;
+  }
+  if (cinfo->scan_info == nullptr) {
+    SetDefaultScanScript(cinfo);
+  } else if (cinfo->num_scans < 1) {
+    // scan_info[0] is read below, so an empty user script must be rejected.
+    JPEGLI_ERROR("Invalid number of scans %d", cinfo->num_scans);
+  }
+  cinfo->progressive_mode =
+      cinfo->scan_info->Ss != 0 || cinfo->scan_info->Se != DCTSIZE2 - 1;
+  ValidateScanScript(cinfo);
+}
+
+void ResetForImage(j_compress_ptr cinfo) {
+  (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+  (*cinfo->dest->init_destination)(cinfo);
+  jpeg_comp_master* m = cinfo->master;
+  m->next_iMCU_row = 0;
+  m->last_restart_interval = 0;
+  m->last_dht_index = 0;
+  m->num_huffman_codes = 0;
+  if (cinfo->num_scans > 0) {
+    m->scan_coding_info =
+        Allocate<ScanCodingInfo>(cinfo, cinfo->num_scans, JPOOL_IMAGE_ALIGNED);
+  }
+}
+
+// Returns false if coefficients are being written, if restart markers are
+// requested, if Huffman table optimization is on or if there is more than
+// one scan; returns true otherwise.
+bool IsStreamingSupported(j_compress_ptr cinfo) {
+  const bool writing_coeffs = (cinfo->global_state == kEncWriteCoeffs);
+  // TODO(szabadka) Remove this restriction.
+  const bool has_restarts =
+      (cinfo->restart_interval > 0 || cinfo->restart_in_rows > 0);
+  const bool multi_pass =
+      (cinfo->optimize_coding || cinfo->num_scans > 1);
+  if (writing_coeffs || has_restarts || multi_pass) return false;
+  return true;
+}
+
+bool IsSinglePassOptimizerSupported(j_compress_ptr cinfo) {  // One optimized scan, no restarts.
+  if (cinfo->num_scans != 1 || !cinfo->optimize_coding) return false;
+  return !(cinfo->restart_interval != 0 || cinfo->restart_in_rows != 0);
+}
+
+void AllocateBuffers(j_compress_ptr cinfo) {  // Allocates the per-image working buffers that hang off cinfo->master.
+  jpeg_comp_master* m = cinfo->master;
+  size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width);
+  size_t xsize_full = total_iMCU_cols * iMCU_width;  // Image width rounded up to whole iMCUs.
+  size_t ysize_full = 3 * iMCU_height;  // Three iMCU rows; NOTE(review): presumably previous/current/next for context rows - confirm.
+  if (!cinfo->raw_data_in) {
+    int num_all_components =
+        std::max(cinfo->input_components, cinfo->num_components);
+    for (int c = 0; c < num_all_components; ++c) {
+      m->input_buffer[c].Allocate(cinfo, ysize_full, xsize_full);  // Full-resolution buffer for each input or output component.
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t xsize = total_iMCU_cols * comp->h_samp_factor * DCTSIZE;  // Component width at its own sampling factor.
+    size_t ysize = 3 * comp->v_samp_factor * DCTSIZE;  // Three iMCU rows at the component's own sampling factor.
+    if (cinfo->raw_data_in) {
+      m->input_buffer[c].Allocate(cinfo, ysize, xsize);  // Raw input gets a buffer sized for the component's own sampling.
+    }
+    m->smooth_input[c] = &m->input_buffer[c];  // Aliases the input buffer unless smoothing is enabled below.
+    if (!cinfo->raw_data_in && cinfo->smoothing_factor) {
+      m->smooth_input[c] = Allocate<RowBuffer<float>>(cinfo, 1, JPOOL_IMAGE);
+      m->smooth_input[c]->Allocate(cinfo, ysize_full, xsize_full);
+    }
+    m->raw_data[c] = m->smooth_input[c];  // Aliases smooth_input unless the component is subsampled below.
+    if (!cinfo->raw_data_in && (m->h_factor[c] > 1 || m->v_factor[c] > 1)) {
+      m->raw_data[c] = Allocate<RowBuffer<float>>(cinfo, 1, JPOOL_IMAGE);
+      m->raw_data[c]->Allocate(cinfo, ysize, xsize);
+    }
+    m->quant_mul[c] = Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);  // One float per coefficient of a block.
+  }
+  m->dct_buffer = Allocate<float>(cinfo, 2 * DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  m->block_tmp = Allocate<int32_t>(cinfo, DCTSIZE2 * 4, JPOOL_IMAGE_ALIGNED);
+  if (!IsStreamingSupported(cinfo)) {  // Coefficient arrays are only requested when streaming is not supported.
+    m->coeff_buffers =
+        Allocate<jvirt_barray_ptr>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      const size_t xsize_blocks = comp->width_in_blocks;
+      const size_t ysize_blocks = comp->height_in_blocks;
+      m->coeff_buffers[c] = (*cinfo->mem->request_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE,
+          /*pre_zero=*/false, xsize_blocks, ysize_blocks, comp->v_samp_factor);
+    }
+  }
+  if (m->use_adaptive_quantization) {  // Buffers used only by adaptive quantization.
+    int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;  // Same luma-channel choice as in ProcessCompressionParams().
+    jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+    const size_t xsize_blocks = y_comp->width_in_blocks;
+    const size_t vecsize = VectorSize();
+    const size_t xsize_padded = DivCeil(2 * xsize_blocks, vecsize) * vecsize;  // 2 * xsize_blocks rounded up to a multiple of vecsize.
+    m->diff_buffer =
+        Allocate<float>(cinfo, xsize_blocks * DCTSIZE + 8, JPOOL_IMAGE_ALIGNED);
+    m->fuzzy_erosion_tmp.Allocate(cinfo, 2, xsize_padded);
+    m->pre_erosion.Allocate(cinfo, 6 * cinfo->max_v_samp_factor, xsize_padded);
+    m->quant_field.Allocate(cinfo, cinfo->max_v_samp_factor, xsize_blocks);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->zero_bias_offset[c] =
+          Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+      m->zero_bias_mul[c] =
+          Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+    }
+  }
+}
+
+// Points row[c] at the next row of each input buffer, advances
+// m->next_input_row and fills the rows: with zeros if `scanline` is null,
+// otherwise by running the configured input method on `scanline`.
+void ReadInputRow(j_compress_ptr cinfo, const uint8_t* scanline,
+                  float* row[kMaxComponents]) {
+  jpeg_comp_master* m = cinfo->master;
+  const int num_buffers =
+      std::max(cinfo->input_components, cinfo->num_components);
+  const auto y = m->next_input_row++;
+  for (int c = 0; c < num_buffers; ++c) row[c] = m->input_buffer[c].Row(y);
+  if (scanline != nullptr) {
+    (*m->input_method)(scanline, cinfo->image_width, row);
+    return;
+  }
+  const size_t nbytes = cinfo->image_width * sizeof(row[0][0]);
+  for (int c = 0; c < cinfo->input_components; ++c) memset(row[c], 0, nbytes);
+}
+
+// Pads each component row on the right up to index xsize_blocks * DCTSIZE
+// (inclusive) with its last pixel, copies the first pixel into index -1 and,
+// once the last image row has been read, replicates it into the rows below.
+void PadInputBuffer(j_compress_ptr cinfo, float* row[kMaxComponents]) {
+  jpeg_comp_master* m = cinfo->master;
+  const size_t width = cinfo->image_width;
+  const size_t padded_width = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    float* r = row[c];
+    // The extra sample at padded_width and the one at -1 form the border of
+    // 1 repeated pixel used by the adaptive quant field calculation.
+    const float edge = r[width - 1];
+    for (size_t x = width; x <= padded_width; ++x) r[x] = edge;
+    r[-1] = r[0];
+  }
+  if (m->next_input_row != cinfo->image_height) return;
+  const size_t extra_rows = m->ysize_blocks * DCTSIZE - cinfo->image_height;
+  for (size_t i = 0; i < extra_rows; ++i, ++m->next_input_row) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      float* dest = m->input_buffer[c].Row(m->next_input_row) - 1;
+      memcpy(dest, row[c] - 1, (padded_width + 2) * sizeof(dest[0]));
+    }
+  }
+}
+
+// Runs all encoder stages for iMCU row next_iMCU_row, then advances it.
+void ProcessiMCURow(j_compress_ptr cinfo) {
+  JXL_ASSERT(cinfo->master->next_iMCU_row < cinfo->total_iMCU_rows);
+  const bool raw_input = cinfo->raw_data_in;
+  if (!raw_input) ApplyInputSmoothing(cinfo);
+  if (!raw_input) DownsampleInputBuffer(cinfo);
+  ComputeAdaptiveQuantField(cinfo);
+  if (!IsStreamingSupported(cinfo)) {
+    ComputeDCTCoefficients(cinfo);
+  } else {
+    WriteiMCURow(cinfo);
+  }
+  ++cinfo->master->next_iMCU_row;
+}
+
+// Processes the iMCU rows that are ready given m->next_input_row. To have
+// context rows both above and below the current iMCU row, the first iMCU row
+// is delayed and two iMCU rows are processed after the last input row.
+void ProcessiMCURows(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  const size_t rows_per_iMCU = DCTSIZE * cinfo->max_v_samp_factor;
+  const bool at_iMCU_boundary = (m->next_input_row % rows_per_iMCU == 0);
+  if (at_iMCU_boundary && m->next_input_row > rows_per_iMCU) {
+    ProcessiMCURow(cinfo);
+  }
+  const bool input_complete = (m->next_input_row >= cinfo->image_height);
+  if (input_complete) ProcessiMCURow(cinfo);
+}
+
+// Sets progress->total_passes for the encoding path that will be taken. Does
+// nothing when no progress monitor is installed.
+void InitProgressMonitor(j_compress_ptr cinfo) {
+  if (cinfo->progress == nullptr) return;
+  // There is always exactly one input pass.
+  int total_passes = 1;
+  if (!IsStreamingSupported(cinfo)) {
+    // Every scan needs an encode pass ...
+    total_passes += cinfo->num_scans;
+    if (!IsSinglePassOptimizerSupported(cinfo)) {
+      // ... and, without the single-pass optimizer, a histogram pass too.
+      total_passes += cinfo->num_scans;
+    }
+  }
+  cinfo->progress->total_passes = total_passes;
+}
+
+// Reports input pass progress (next_scanline out of image_height), if monitored.
+void ProgressMonitorInputPass(j_compress_ptr cinfo) {
+  auto* progress = cinfo->progress;
+  if (progress == nullptr) return;
+  progress->completed_passes = 0;
+  progress->pass_counter = cinfo->next_scanline;
+  progress->pass_limit = cinfo->image_height;
+  (*progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+// Writes the SOI marker, then the APP0 marker if write_JFIF_header is set and
+// the APP14 marker if write_Adobe_marker is set.
+void WriteFileHeader(j_compress_ptr cinfo) {
+  WriteOutput(cinfo, {0xFF, 0xD8});  // SOI
+  // JFIF header, when requested.
+  if (cinfo->write_JFIF_header) EncodeAPP0(cinfo);
+  // Adobe marker, when requested.
+  if (cinfo->write_Adobe_marker) EncodeAPP14(cinfo);
+}
+
+// Writes the DRI (if changed), DHT (if any) and SOS markers of scan `scan_idx`.
+void WriteScanHeader(j_compress_ptr cinfo, size_t scan_idx) {
+  jpeg_comp_master* m = cinfo->master;
+  cinfo->restart_interval = RestartIntervalForScan(cinfo, scan_idx);
+  if (m->last_restart_interval != cinfo->restart_interval) {
+    EncodeDRI(cinfo);
+    m->last_restart_interval = cinfo->restart_interval;
+  }
+  const size_t num_dht = m->scan_coding_info[scan_idx].num_huffman_codes;
+  if (num_dht != 0) {
+    EncodeDHT(cinfo, m->huffman_codes + m->last_dht_index, num_dht,
+              /*pre_shifted=*/IsStreamingSupported(cinfo));
+    m->last_dht_index += num_dht;
+  }
+  EncodeSOS(cinfo, scan_idx);
+}
+
+void WriteHeaderMarkers(j_compress_ptr cinfo) {  // Emits DQT, SOF and the header of scan 0 using copied Huffman codes.
+  bool is_baseline = true;  // Passed by pointer to the two helpers below, then given to EncodeSOF().
+  CopyHuffmanCodes(cinfo, &is_baseline);
+  EncodeDQT(cinfo, /*write_all_tables=*/false, &is_baseline);
+  EncodeSOF(cinfo, is_baseline);
+  WriteScanHeader(cinfo, 0);
+  memset(cinfo->master->last_dc_coeff, 0, sizeof(cinfo->master->last_dc_coeff));  // Zeroes the whole last_dc_coeff array.
+}
+
+// Writes the quantization tables, the frame header and every scan, using
+// either copied or optimized Huffman codes.
+void EncodeScans(j_compress_ptr cinfo) {
+  if (IsSinglePassOptimizerSupported(cinfo)) {
+    EncodeSingleScan(cinfo);
+    return;
+  }
+  bool is_baseline = false;
+  if (!cinfo->optimize_coding && !cinfo->progressive_mode) {
+    CopyHuffmanCodes(cinfo, &is_baseline);
+  } else {
+    OptimizeHuffmanCodes(cinfo, &is_baseline);
+  }
+  EncodeDQT(cinfo, /*write_all_tables=*/false, &is_baseline);
+  EncodeSOF(cinfo, is_baseline);
+  for (int scan = 0; scan < cinfo->num_scans; ++scan) {
+    WriteScanHeader(cinfo, scan);
+    if (!EncodeScan(cinfo, scan)) JPEGLI_ERROR("Failed to encode scan.");
+  }
+}
+
+}  // namespace jpegli
+
+// Initializes a compression object: installs the memory manager, resets all
+// public fields, applies the image-independent defaults and allocates the
+// jpegli master struct with its default settings. `version` is not examined.
+void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
+                           size_t structsize) {
+  cinfo->mem = nullptr;
+  if (structsize != sizeof(*cinfo)) {
+    JPEGLI_ERROR("jpegli_compress_struct has wrong size.");
+  }
+  jpegli::InitMemoryManager(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->is_decompressor = FALSE;
+  cinfo->global_state = jpegli::kEncStart;
+  cinfo->progress = nullptr;
+  cinfo->dest = nullptr;
+  // No image, colorspace or components are known yet.
+  cinfo->comp_info = nullptr;
+  cinfo->image_width = cinfo->image_height = 0;
+  cinfo->input_components = cinfo->num_components = 0;
+  cinfo->in_color_space = cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->input_gamma = 1.0f;
+  for (auto*& tbl : cinfo->quant_tbl_ptrs) tbl = nullptr;
+  for (int i = 0; i < NUM_HUFF_TBLS; ++i) {
+    cinfo->dc_huff_tbl_ptrs[i] = cinfo->ac_huff_tbl_ptrs[i] = nullptr;
+  }
+  memset(cinfo->arith_dc_L, 0, sizeof(cinfo->arith_dc_L));
+  memset(cinfo->arith_dc_U, 0, sizeof(cinfo->arith_dc_U));
+  memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
+  cinfo->write_Adobe_marker = false;
+  jpegli::InitializeCompressParams(cinfo);
+  // jpegli-specific settings live in the master struct.
+  jpeg_comp_master* master = jpegli::Allocate<jpeg_comp_master>(cinfo, 1);
+  cinfo->master = master;
+  master->force_baseline = true;
+  master->xyb_mode = false;
+  master->cicp_transfer_function = 2;  // unknown transfer function code
+  master->use_std_tables = false;
+  master->use_adaptive_quantization = true;
+  master->progressive_level = jpegli::kDefaultProgressiveLevel;
+  master->data_type = JPEGLI_TYPE_UINT8;
+  master->endianness = JPEGLI_NATIVE_ENDIAN;
+  master->coeff_buffers = nullptr;
+}
+
+void jpegli_set_xyb_mode(j_compress_ptr cinfo) {  // Turns on the master's xyb_mode flag.
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->xyb_mode = true;
+}
+
+void jpegli_set_cicp_transfer_function(j_compress_ptr cinfo, int code) {  // Stores `code` as the master's CICP transfer function code.
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->cicp_transfer_function = code;  // Stored as given; no range check is done here.
+}
+
+// Resets parameters: colorspace, quality 90, progression, standard Huffman.
+void jpegli_set_defaults(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::InitializeCompressParams(cinfo);
+  jpegli_default_colorspace(cinfo);
+  jpegli_set_quality(cinfo, 90, TRUE);
+  jpegli_set_progressive_level(cinfo, jpegli::kDefaultProgressiveLevel);
+  j_common_ptr common = reinterpret_cast<j_common_ptr>(cinfo);
+  jpegli::AddStandardHuffmanTables(common, /*is_dc=*/false);
+  jpegli::AddStandardHuffmanTables(common, /*is_dc=*/true);
+}
+
+// Picks the default JPEG colorspace for cinfo->in_color_space and applies it
+// through jpegli_set_colorspace(). Input colorspaces that are not listed
+// below raise a jpegli error.
+void jpegli_default_colorspace(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+
+  // Unless a case below says otherwise, the input colorspace is kept.
+  J_COLOR_SPACE jpeg_color_space = cinfo->in_color_space;
+  switch (cinfo->in_color_space) {
+    case JCS_RGB:
+      // RGB input is encoded as YCbCr, except in XYB mode where the JPEG
+      // colorspace stays RGB.
+      if (!cinfo->master->xyb_mode) {
+        jpeg_color_space = JCS_YCbCr;
+      }
+      break;
+    case JCS_GRAYSCALE:
+    case JCS_YCbCr:
+    case JCS_CMYK:
+    case JCS_YCCK:
+    case JCS_UNKNOWN:
+      // Encoded in the same colorspace as the input.
+      break;
+    default:
+      // Nothing is applied for an unsupported input colorspace.
+      JPEGLI_ERROR("Unsupported input colorspace %d", cinfo->in_color_space);
+      return;
+  }
+  jpegli_set_colorspace(cinfo, jpeg_color_space);
+}
+
+// Sets the JPEG colorspace and resets cinfo->comp_info to its defaults for
+// that colorspace: component count, ids, sampling and table assignments.
+void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->jpeg_color_space = colorspace;
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      cinfo->num_components = 1;
+      break;
+    case JCS_RGB:
+    case JCS_YCbCr:
+      cinfo->num_components = 3;
+      break;
+    case JCS_CMYK:
+    case JCS_YCCK:
+      cinfo->num_components = 4;
+      break;
+    case JCS_UNKNOWN:
+      cinfo->num_components =
+          std::min<int>(jpegli::kMaxComponents, cinfo->input_components);
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported jpeg colorspace %d", colorspace);
+  }
+  // Adobe marker is only needed to distinguish CMYK and YCCK JPEGs.
+  cinfo->write_Adobe_marker = (cinfo->jpeg_color_space == JCS_YCCK);
+  if (cinfo->comp_info == nullptr) {
+    cinfo->comp_info =
+        jpegli::Allocate<jpeg_component_info>(cinfo, MAX_COMPONENTS);
+  }
+  jpeg_component_info* comps = cinfo->comp_info;
+  memset(comps, 0, jpegli::kMaxComponents * sizeof(jpeg_component_info));
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info& comp = comps[c];
+    comp.component_index = c;
+    comp.component_id = c + 1;
+    comp.h_samp_factor = comp.v_samp_factor = 1;
+    comp.quant_tbl_no = comp.dc_tbl_no = comp.ac_tbl_no = 0;
+  }
+  switch (colorspace) {
+    case JCS_RGB:
+      for (int c = 0; c < 3; ++c) comps[c].component_id = "RGB"[c];
+      if (cinfo->master->xyb_mode) {
+        // Subsample blue channel.
+        comps[0].h_samp_factor = comps[0].v_samp_factor = 2;
+        comps[1].h_samp_factor = comps[1].v_samp_factor = 2;
+        comps[2].h_samp_factor = comps[2].v_samp_factor = 1;
+        // Use separate quantization tables for each component
+        comps[1].quant_tbl_no = 1;
+        comps[2].quant_tbl_no = 2;
+      }
+      break;
+    case JCS_CMYK:
+      for (int c = 0; c < 4; ++c) comps[c].component_id = "CMYK"[c];
+      break;
+    case JCS_YCbCr:
+    case JCS_YCCK:
+      // Use separate quantization and Huffman tables for luma and chroma
+      comps[1].quant_tbl_no = comps[2].quant_tbl_no = 1;
+      comps[1].dc_tbl_no = comps[1].ac_tbl_no = 1;
+      comps[2].dc_tbl_no = comps[2].ac_tbl_no = 1;
+      break;
+    default:  // No colorspace-specific adjustments.
+      break;
+  }
+}
+
+void jpegli_set_distance(j_compress_ptr cinfo, float distance,  // Sets the quant matrices from one distance value.
+                         boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->force_baseline = force_baseline;
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};  // Only three entries are listed; any further ones are zero-initialized.
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/true);  // Unlike the quality-based setters, this passes add_two_chroma_tables = true.
+}
+
+// Piecewise map from quality to distance: constant, linear, then quadratic.
+float jpegli_quality_to_distance(int quality) {
+  if (quality >= 100) return 0.01f;
+  if (quality >= 30) return 0.1f + (100 - quality) * 0.09f;
+  return 53.0f / 3000.0f * quality * quality - 23.0f / 20.0f * quality + 25.0f;
+}
+
+void jpegli_set_quality(j_compress_ptr cinfo, int quality,  // Sets the quant matrices from a quality value.
+                        boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->force_baseline = force_baseline;
+  float distance = jpegli_quality_to_distance(quality);
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};  // Only three entries are listed; any further ones are zero-initialized.
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+
+void jpegli_set_linear_quality(j_compress_ptr cinfo, int scale_factor,  // Sets the quant matrices from a linear scale factor.
+                               boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->force_baseline = force_baseline;
+  float distance = jpegli::LinearQualityToDistance(scale_factor);  // Clamps scale_factor to [0, 5000] before converting.
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};  // Only three entries are listed; any further ones are zero-initialized.
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_default_qtables(j_compress_ptr cinfo, boolean force_baseline) {  // Sets the quant matrices from cinfo->q_scale_factor.
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->force_baseline = force_baseline;
+  float distances[NUM_QUANT_TBLS];
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    distances[i] = jpegli::LinearQualityToDistance(cinfo->q_scale_factor[i]);  // One distance per table slot, each from its own scale factor.
+  }
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+#endif
+
+int jpegli_quality_scaling(int quality) {  // Quality (clamped to 1..100) to a percentage scale factor.
+  const int q = quality < 1 ? 1 : (quality > 100 ? 100 : quality);
+  return q >= 50 ? 2 * (100 - q) : 5000 / q;
+}
+
+void jpegli_use_standard_quant_tables(j_compress_ptr cinfo) {  // Turns on the master's use_std_tables flag.
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->use_std_tables = true;
+}
+
+// Sets quant table `which_tbl` to basic_table scaled by scale_factor percent.
+void jpegli_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                            const unsigned int* basic_table, int scale_factor,
+                            boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS) {  // Valid slots: 0..N-1.
+    JPEGLI_ERROR("Invalid quant table index %d", which_tbl);
+  }
+  JQUANT_TBL*& quant_table = cinfo->quant_tbl_ptrs[which_tbl];
+  if (quant_table == nullptr) {
+    quant_table =
+        jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+  }
+  const int max_qval = force_baseline ? 255 : 32767;  // Upper clamp per entry.
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    int qval = (basic_table[k] * scale_factor + 50) / 100;  // Rounded percent.
+    quant_table->quantval[k] = std::max(1, std::min(qval, max_qval));
+  }
+  quant_table->sent_table = FALSE;  // Mark the table as not yet written.
+}
+
+void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value) {  // Stores `value` in the master's use_adaptive_quantization flag.
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  cinfo->master->use_adaptive_quantization = value;
+}
+
+void jpegli_simple_progression(j_compress_ptr cinfo) {  // Shorthand for selecting progressive level 2.
+  CheckState(cinfo, jpegli::kEncStart);  // Only allowed in the kEncStart state.
+  jpegli_set_progressive_level(cinfo, 2);
+}
+
+// Stores `level` as the progressive level that selects the default scan
+// script. Negative levels raise a jpegli error.
+void jpegli_set_progressive_level(j_compress_ptr cinfo, int level) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (level < 0) JPEGLI_ERROR("Invalid progressive level %d", level);
+  cinfo->master->progressive_level = level;
+}
+
+// Stores the sample data type and the endianness of the input in the master
+// struct. Values outside the supported sets raise a jpegli error.
+void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
+                             JpegliEndianness endianness) {
+  CheckState(cinfo, jpegli::kEncStart);
+  const bool type_ok = data_type == JPEGLI_TYPE_UINT8 ||
+                       data_type == JPEGLI_TYPE_UINT16 ||
+                       data_type == JPEGLI_TYPE_FLOAT;
+  if (type_ok) {
+    cinfo->master->data_type = data_type;
+  } else {
+    JPEGLI_ERROR("Unsupported data type %d", data_type);
+  }
+  const bool endianness_ok = endianness == JPEGLI_NATIVE_ENDIAN ||
+                             endianness == JPEGLI_LITTLE_ENDIAN ||
+                             endianness == JPEGLI_BIG_ENDIAN;
+  if (endianness_ok) {
+    cinfo->master->endianness = endianness;
+  } else {
+    JPEGLI_ERROR("Unsupported endianness %d", endianness);
+  }
+}
+
+#if JPEG_LIB_VERSION >= 70
+// Derives the dimensions of the encoded JPEG from the input image. There is
+// no input scaling, so the output size always equals the input size.
+void jpegli_calc_jpeg_dimensions(j_compress_ptr cinfo) {
+  const JDIMENSION input_width = cinfo->image_width;
+  const JDIMENSION input_height = cinfo->image_height;
+  cinfo->jpeg_width = input_width;
+  cinfo->jpeg_height = input_height;
+}
+#endif
+
+// Configures the compressor dstinfo from the decompressor srcinfo: image
+// size, component count, colorspace, precision, JFIF/density fields, the
+// per-component ids, sampling factors and quant table indexes, and a copy of
+// every quantization table that the source has. Requires dstinfo to be in the
+// kEncStart state.
+void jpegli_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                     j_compress_ptr dstinfo) {
+  CheckState(dstinfo, jpegli::kEncStart);
+
+  // Describe the input image using the JPEG-side properties of the source.
+  dstinfo->image_width = srcinfo->image_width;
+  dstinfo->image_height = srcinfo->image_height;
+  dstinfo->input_components = srcinfo->num_components;
+  dstinfo->in_color_space = srcinfo->jpeg_color_space;
+  dstinfo->input_gamma = srcinfo->output_gamma;
+
+  // Start from the defaults for the source colorspace, then overwrite the
+  // fields that have to match the source.
+  jpegli_set_defaults(dstinfo);
+  jpegli_set_colorspace(dstinfo, srcinfo->jpeg_color_space);
+  if (dstinfo->num_components != srcinfo->num_components) {
+    // Presumably JPEGLI_ERROR expects a variable named cinfo in scope, hence
+    // the local alias.
+    const auto& cinfo = dstinfo;
+    return JPEGLI_ERROR("Mismatch between src colorspace and components");
+  }
+  dstinfo->data_precision = srcinfo->data_precision;
+  dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling;
+  dstinfo->JFIF_major_version = srcinfo->JFIF_major_version;
+  dstinfo->JFIF_minor_version = srcinfo->JFIF_minor_version;
+  dstinfo->density_unit = srcinfo->density_unit;
+  dstinfo->X_density = srcinfo->X_density;
+  dstinfo->Y_density = srcinfo->Y_density;
+
+  // Per-component settings.
+  for (int comp_idx = 0; comp_idx < dstinfo->num_components; ++comp_idx) {
+    const jpeg_component_info& from = srcinfo->comp_info[comp_idx];
+    jpeg_component_info& to = dstinfo->comp_info[comp_idx];
+    to.component_id = from.component_id;
+    to.h_samp_factor = from.h_samp_factor;
+    to.v_samp_factor = from.v_samp_factor;
+    to.quant_tbl_no = from.quant_tbl_no;
+  }
+
+  // Duplicate every quantization table the source defines and mark the copy
+  // as not yet sent.
+  for (int tbl = 0; tbl < NUM_QUANT_TBLS; ++tbl) {
+    const JQUANT_TBL* src_table = srcinfo->quant_tbl_ptrs[tbl];
+    if (src_table != nullptr) {
+      JQUANT_TBL*& dst_table = dstinfo->quant_tbl_ptrs[tbl];
+      if (dst_table == nullptr) {
+        dst_table = jpegli::Allocate<JQUANT_TBL>(dstinfo, 1);
+      }
+      memcpy(dst_table, src_table, sizeof(JQUANT_TBL));
+      dst_table->sent_table = FALSE;
+    }
+  }
+}
+
+// Passes `suppress` to SetSentTableFlag() for the quantization tables and
+// for the DC and AC Huffman tables of the compressor.
+void jpegli_suppress_tables(j_compress_ptr cinfo, boolean suppress) {
+  const boolean flag = suppress;
+  jpegli::SetSentTableFlag(cinfo->quant_tbl_ptrs, NUM_QUANT_TBLS, flag);
+  jpegli::SetSentTableFlag(cinfo->dc_huff_tbl_ptrs, NUM_HUFF_TBLS, flag);
+  jpegli::SetSentTableFlag(cinfo->ac_huff_tbl_ptrs, NUM_HUFF_TBLS, flag);
+}
+
+// Starts a compression cycle for pixel or raw-data input. Processes the
+// compression parameters, sets up buffers, the input method and the
+// quantizer, writes the file header and moves the encoder from the kEncStart
+// state to kEncHeader with both row counters reset to zero. When
+// write_all_tables is set, jpegli_suppress_tables(cinfo, FALSE) is called
+// first.
+void jpegli_start_compress(j_compress_ptr cinfo, boolean write_all_tables) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::ProcessCompressionParams(cinfo);
+  jpegli::InitProgressMonitor(cinfo);
+  jpegli::AllocateBuffers(cinfo);
+  jpegli::ChooseInputMethod(cinfo);
+  // The color transform and downsampling methods are only chosen when the
+  // caller does not supply raw data.
+  if (!cinfo->raw_data_in) {
+    jpegli::ChooseColorTransform(cinfo);
+    jpegli::ChooseDownsampleMethods(cinfo);
+  }
+  jpegli::InitQuantizer(cinfo);
+  if (write_all_tables) {
+    jpegli_suppress_tables(cinfo, FALSE);
+  }
+  (*cinfo->mem->realize_virt_arrays)(reinterpret_cast<j_common_ptr>(cinfo));
+  jpegli::ResetForImage(cinfo);
+  jpegli::WriteFileHeader(cinfo);
+  jpegli::JpegBitWriterInit(cinfo);
+  cinfo->next_scanline = 0;
+  cinfo->master->next_input_row = 0;
+  cinfo->global_state = jpegli::kEncHeader;
+}
+
+// Starts a compression cycle whose input is the caller-provided coefficient
+// arrays instead of image rows. All tables are marked as not yet sent, the
+// file header is written and the encoder moves from kEncStart to
+// kEncWriteCoeffs.
+void jpegli_write_coefficients(j_compress_ptr cinfo,
+                               jvirt_barray_ptr* coef_arrays) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::ProcessCompressionParams(cinfo);
+  jpegli::InitProgressMonitor(cinfo);
+  (*cinfo->mem->realize_virt_arrays)(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->master->coeff_buffers = coef_arrays;
+  jpegli_suppress_tables(cinfo, FALSE);
+  jpegli::ResetForImage(cinfo);
+  jpegli::WriteFileHeader(cinfo);
+  jpegli::JpegBitWriterInit(cinfo);
+  // Both row counters are set to the image height, so the incomplete-image
+  // check in jpegli_finish_compress() passes without any rows being written.
+  cinfo->next_scanline = cinfo->image_height;
+  cinfo->master->next_input_row = cinfo->image_height;
+  cinfo->global_state = jpegli::kEncWriteCoeffs;
+}
+
+// Writes a stream that consists of SOI, the quantization tables (DQT), the
+// Huffman tables (DHT) and EOI, then terminates the destination. Afterwards
+// jpegli_suppress_tables(cinfo, TRUE) is called. Requires the encoder to be in
+// the kEncStart state and a destination to be set.
+void jpegli_write_tables(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (cinfo->dest == nullptr) {
+    JPEGLI_ERROR("Missing destination.");
+  }
+  jpegli::ResetForImage(cinfo);
+  bool is_baseline = true;
+  jpeg_comp_master* m = cinfo->master;
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD8});  // SOI
+  jpegli::EncodeDQT(cinfo, /*write_all_tables=*/true, &is_baseline);
+  jpegli::CopyHuffmanCodes(cinfo, &is_baseline);
+  jpegli::EncodeDHT(cinfo, m->huffman_codes, m->num_huffman_codes);
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD9});  // EOI
+  (*cinfo->dest->term_destination)(cinfo);
+  jpegli_suppress_tables(cinfo, TRUE);
+}
+
+// Writes the four-byte header of an APPn (0xe0..0xef) or COM (0xfe) marker
+// segment: 0xff, the marker code, and the big-endian 16-bit segment length
+// (datalen plus the two length bytes). The datalen payload bytes are not
+// written here. Requires the encoder to be in the kEncHeader or
+// kEncWriteCoeffs state.
+void jpegli_write_m_header(j_compress_ptr cinfo, int marker,
+                           unsigned int datalen) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncWriteCoeffs);
+  if (datalen > jpegli::kMaxBytesInMarker) {
+    JPEGLI_ERROR("Invalid marker length %u", datalen);
+  }
+  if (marker != 0xfe && (marker < 0xe0 || marker > 0xef)) {
+    JPEGLI_ERROR(
+        "jpegli_write_m_header: Only APP and COM markers are supported.");
+  }
+  // Only the header is emitted, so a fixed four-byte buffer is sufficient;
+  // there is no need to allocate room for the payload.
+  const unsigned int segment_len = datalen + 2;
+  const uint8_t marker_data[4] = {
+      0xff,
+      static_cast<uint8_t>(marker),
+      static_cast<uint8_t>(segment_len >> 8),
+      static_cast<uint8_t>(segment_len & 0xff),
+  };
+  jpegli::WriteOutput(cinfo, marker_data, 4);
+}
+
+// Writes the low eight bits of val to the output as a single byte.
+void jpegli_write_m_byte(j_compress_ptr cinfo, int val) {
+  const uint8_t byte = static_cast<uint8_t>(val);
+  jpegli::WriteOutput(cinfo, &byte, 1);
+}
+
+// Writes a complete marker segment: the header produced by
+// jpegli_write_m_header() followed by datalen payload bytes from dataptr.
+void jpegli_write_marker(j_compress_ptr cinfo, int marker,
+                         const JOCTET* dataptr, unsigned int datalen) {
+  // Header first (this also validates the state, marker code and length).
+  jpegli_write_m_header(cinfo, marker, datalen);
+  // Then the payload.
+  const JOCTET* payload = dataptr;
+  jpegli::WriteOutput(cinfo, payload, datalen);
+}
+
+// Writes an ICC profile as a sequence of marker segments. Each segment
+// consists of the kICCSignature bytes, a one-based chunk index, the total
+// chunk count, and up to kMaxIccBytesInMarker bytes of profile data. Nothing
+// is written when icc_data_len is zero.
+void jpegli_write_icc_profile(j_compress_ptr cinfo, const JOCTET* icc_data_ptr,
+                              unsigned int icc_data_len) {
+  constexpr size_t kMaxIccBytesInMarker =
+      jpegli::kMaxBytesInMarker - sizeof jpegli::kICCSignature - 2;
+  const int num_markers =
+      static_cast<int>(jpegli::DivCeil(icc_data_len, kMaxIccBytesInMarker));
+  // The chunk index and the chunk count are stored in one byte each, so a
+  // profile that needs more than 255 chunks cannot be represented; writing it
+  // anyway would silently wrap both fields.
+  if (num_markers > 255) {
+    JPEGLI_ERROR("ICC profile is too large: %u bytes", icc_data_len);
+  }
+  size_t begin = 0;
+  for (int current_marker = 0; current_marker < num_markers; ++current_marker) {
+    const size_t length = std::min(kMaxIccBytesInMarker, icc_data_len - begin);
+    jpegli_write_m_header(
+        cinfo, jpegli::kICCMarker,
+        static_cast<unsigned int>(length + sizeof jpegli::kICCSignature + 2));
+    for (const unsigned char c : jpegli::kICCSignature) {
+      jpegli_write_m_byte(cinfo, c);
+    }
+    jpegli_write_m_byte(cinfo, current_marker + 1);
+    jpegli_write_m_byte(cinfo, num_markers);
+    // Emit the payload of this chunk in one call rather than byte by byte.
+    jpegli::WriteOutput(cinfo, icc_data_ptr + begin, length);
+    begin += length;
+  }
+}
+
+// Feeds up to num_lines pixel rows from scanlines to the encoder and returns
+// the number of rows by which cinfo->next_scanline advanced. Rows past the
+// bottom of the image are ignored. Not allowed in raw data mode. Requires the
+// encoder to be in the kEncHeader or kEncReadImage state; the state is
+// kEncReadImage afterwards.
+JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                  JDIMENSION num_lines) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncReadImage);
+  if (cinfo->raw_data_in) {
+    JPEGLI_ERROR("jpegli_write_raw_data() must be called for raw data mode.");
+  }
+  jpegli::ProgressMonitorInputPass(cinfo);
+  // On the first call the header markers are written right away when
+  // streaming is supported.
+  if (cinfo->global_state == jpegli::kEncHeader &&
+      jpegli::IsStreamingSupported(cinfo)) {
+    jpegli::WriteHeaderMarkers(cinfo);
+  }
+  cinfo->global_state = jpegli::kEncReadImage;
+  jpeg_comp_master* m = cinfo->master;
+  if (num_lines + cinfo->next_scanline > cinfo->image_height) {
+    num_lines = cinfo->image_height - cinfo->next_scanline;
+  }
+  JDIMENSION prev_scanline = cinfo->next_scanline;
+  // Number of rows that the encoder has already read (next_input_row) but
+  // that were not yet reported to the caller (next_scanline); presumably this
+  // happens when an earlier call stopped because the output could not be
+  // emptied.
+  size_t input_lag = (std::min<size_t>(cinfo->image_height, m->next_input_row) -
+                      cinfo->next_scanline);
+  if (input_lag > num_lines) {
+    // input_lag is a size_t, so it is converted to match the %u specifier.
+    JPEGLI_ERROR("Need at least %u lines to continue",
+                 static_cast<unsigned int>(input_lag));
+  }
+  if (input_lag > 0) {
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      return 0;
+    }
+    cinfo->next_scanline += input_lag;
+  }
+  float* rows[jpegli::kMaxComponents];
+  for (size_t i = input_lag; i < num_lines; ++i) {
+    jpegli::ReadInputRow(cinfo, scanlines[i], rows);
+    (*m->color_transform)(rows, cinfo->image_width);
+    jpegli::PadInputBuffer(cinfo, rows);
+    jpegli::ProcessiMCURows(cinfo);
+    // Stop without counting this row when the output buffer cannot be
+    // emptied.
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      break;
+    }
+    ++cinfo->next_scanline;
+  }
+  return cinfo->next_scanline - prev_scanline;
+}
+
+// Feeds one iMCU row (DCTSIZE * max_v_samp_factor lines) of raw component
+// data to the encoder. data[c] holds the rows of component c; a null row
+// pointer is treated as a row of zeros. Returns the number of lines consumed,
+// which is the iMCU height on success and 0 when the image is already
+// complete or the output buffer could not be emptied. Only allowed in raw
+// data mode. Requires the encoder to be in the kEncHeader or kEncReadImage
+// state; the state is kEncReadImage afterwards.
+JDIMENSION jpegli_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                 JDIMENSION num_lines) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncReadImage);
+  if (!cinfo->raw_data_in) {
+    JPEGLI_ERROR("jpegli_write_raw_data(): raw data mode was not set");
+  }
+  jpegli::ProgressMonitorInputPass(cinfo);
+  if (cinfo->global_state == jpegli::kEncHeader &&
+      jpegli::IsStreamingSupported(cinfo)) {
+    jpegli::WriteHeaderMarkers(cinfo);
+  }
+  cinfo->global_state = jpegli::kEncReadImage;
+  jpeg_comp_master* m = cinfo->master;
+  if (cinfo->next_scanline >= cinfo->image_height) {
+    return 0;
+  }
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  if (num_lines < iMCU_height) {
+    // iMCU_height is a size_t, so it is converted to match the %u specifier.
+    JPEGLI_ERROR("Missing input lines, minimum is %u",
+                 static_cast<unsigned int>(iMCU_height));
+  }
+  // An iMCU row was already read by an earlier call but not yet reported to
+  // the caller: only retry emptying the output buffer.
+  if (cinfo->next_scanline < m->next_input_row) {
+    JXL_ASSERT(m->next_input_row - cinfo->next_scanline == iMCU_height);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      return 0;
+    }
+    cinfo->next_scanline = m->next_input_row;
+    return iMCU_height;
+  }
+  size_t iMCU_y = m->next_input_row / iMCU_height;
+  float* rows[jpegli::kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    JSAMPARRAY plane = data[c];
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t xsize = comp->width_in_blocks * DCTSIZE;
+    size_t ysize = comp->v_samp_factor * DCTSIZE;
+    size_t y0 = iMCU_y * ysize;
+    auto& buffer = m->input_buffer[c];
+    for (size_t i = 0; i < ysize; ++i) {
+      rows[0] = buffer.Row(y0 + i);
+      if (plane[i] == nullptr) {
+        memset(rows[0], 0, xsize * sizeof(rows[0][0]));
+      } else {
+        (*m->input_method)(plane[i], xsize, rows);
+      }
+      // We need a border of 1 repeated pixel for adaptive quant field.
+      buffer.PadRow(y0 + i, xsize, /*border=*/1);
+    }
+  }
+  m->next_input_row += iMCU_height;
+  jpegli::ProcessiMCURows(cinfo);
+  if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+    return 0;
+  }
+  cinfo->next_scanline += iMCU_height;
+  return iMCU_height;
+}
+
+// Finishes the compression cycle: verifies that all rows were supplied,
+// completes the entropy-coded data (flushing the bit writer when streaming is
+// supported, encoding all scans otherwise), writes EOI, terminates the
+// destination and finally releases memory via jpegli_abort_compress().
+// Requires the encoder to be in the kEncReadImage or kEncWriteCoeffs state.
+void jpegli_finish_compress(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncReadImage, jpegli::kEncWriteCoeffs);
+  jpeg_comp_master* m = cinfo->master;
+  if (cinfo->next_scanline < cinfo->image_height) {
+    // Both values are unsigned JDIMENSIONs, hence %u.
+    JPEGLI_ERROR("Incomplete image, expected %u rows, got %u",
+                 cinfo->image_height, cinfo->next_scanline);
+  }
+
+  if (jpegli::IsStreamingSupported(cinfo)) {
+    jpegli::JumpToByteBoundary(&m->bw);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+    }
+    if (!m->bw.healthy) {
+      JPEGLI_ERROR("Failed to encode scan.");
+    }
+  } else {
+    jpegli::EncodeScans(cinfo);
+  }
+
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD9});  // EOI
+  (*cinfo->dest->term_destination)(cinfo);
+
+  // Release memory and reset global state.
+  jpegli_abort_compress(cinfo);
+}
+
+// Compressor-typed wrapper around jpegli_abort().
+void jpegli_abort_compress(j_compress_ptr cinfo) {
+  j_common_ptr common = reinterpret_cast<j_common_ptr>(cinfo);
+  jpegli_abort(common);
+}
+
+// Compressor-typed wrapper around jpegli_destroy().
+void jpegli_destroy_compress(j_compress_ptr cinfo) {
+  j_common_ptr common = reinterpret_cast<j_common_ptr>(cinfo);
+  jpegli_destroy(common);
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/encode.h b/third_party/jpeg-xl/lib/jpegli/encode.h
new file mode 100644
index 0000000000..075b6b855f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode.h
@@ -0,0 +1,159 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains the C API of the encoder part of the libjpegli library,
+// which is based on the C API of libjpeg, with the function names changed from
+// jpeg_* to jpegli_*, while compressor object definitions are included directly
+// from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_ENCODE_H_
+#define LIB_JPEGLI_ENCODE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Convenience wrapper around jpegli_CreateCompress() that passes the
+// JPEG_LIB_VERSION and the size of struct jpeg_compress_struct that the
+// calling code was compiled with.
+#define jpegli_create_compress(cinfo)              \
+  jpegli_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t)sizeof(struct jpeg_compress_struct))
+void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
+                           size_t structsize);
+
+void jpegli_stdio_dest(j_compress_ptr cinfo, FILE* outfile);
+
+void jpegli_mem_dest(j_compress_ptr cinfo, unsigned char** outbuffer,
+                     unsigned long* outsize);
+
+void jpegli_set_defaults(j_compress_ptr cinfo);
+
+void jpegli_default_colorspace(j_compress_ptr cinfo);
+
+void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace);
+
+void jpegli_set_quality(j_compress_ptr cinfo, int quality,
+                        boolean force_baseline);
+
+void jpegli_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                               boolean force_baseline);
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_default_qtables(j_compress_ptr cinfo, boolean force_baseline);
+#endif
+
+int jpegli_quality_scaling(int quality);
+
+void jpegli_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                            const unsigned int* basic_table, int scale_factor,
+                            boolean force_baseline);
+
+void jpegli_simple_progression(j_compress_ptr cinfo);
+
+void jpegli_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_calc_jpeg_dimensions(j_compress_ptr cinfo);
+#endif
+
+void jpegli_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                     j_compress_ptr dstinfo);
+
+void jpegli_write_m_header(j_compress_ptr cinfo, int marker,
+                           unsigned int datalen);
+
+void jpegli_write_m_byte(j_compress_ptr cinfo, int val);
+
+void jpegli_write_marker(j_compress_ptr cinfo, int marker,
+                         const JOCTET* dataptr, unsigned int datalen);
+
+void jpegli_write_icc_profile(j_compress_ptr cinfo, const JOCTET* icc_data_ptr,
+                              unsigned int icc_data_len);
+
+void jpegli_start_compress(j_compress_ptr cinfo, boolean write_all_tables);
+
+void jpegli_write_tables(j_compress_ptr cinfo);
+
+JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                  JDIMENSION num_lines);
+
+JDIMENSION jpegli_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                 JDIMENSION num_lines);
+
+void jpegli_write_coefficients(j_compress_ptr cinfo,
+                               jvirt_barray_ptr* coef_arrays);
+
+void jpegli_finish_compress(j_compress_ptr cinfo);
+
+void jpegli_abort_compress(j_compress_ptr cinfo);
+
+void jpegli_destroy_compress(j_compress_ptr cinfo);
+
+//
+// New API functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+// Sets the butteraugli target distance for the compressor. This may override
+// the default quantization table indexes based on jpeg colorspace, therefore
+// it must be called after jpegli_set_defaults() or after the last
+// jpegli_set_colorspace() or jpegli_default_colorspace() calls.
+void jpegli_set_distance(j_compress_ptr cinfo, float distance,
+                         boolean force_baseline);
+
+// Returns the butteraugli target distance for the given quality parameter.
+float jpegli_quality_to_distance(int quality);
+
+// Changes the default behaviour of the encoder in the selection of quantization
+// matrices and chroma subsampling. Must be called before jpegli_set_defaults()
+// because some default settings depend on the XYB mode.
+void jpegli_set_xyb_mode(j_compress_ptr cinfo);
+
+// Signals to the encoder that the pixel data that will be provided later
+// through jpegli_write_scanlines() has this transfer function. This must be
+// called before jpegli_set_defaults() because it changes the default
+// quantization tables.
+void jpegli_set_cicp_transfer_function(j_compress_ptr cinfo, int code);
+
+void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
+                             JpegliEndianness endianness);
+
+// Sets whether or not the encoder uses adaptive quantization for creating more
+// zero coefficients based on the local properties of the image.
+// Enabled by default.
+void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value);
+
+// Sets the default progression parameters, where level 0 is sequential, and
+// greater level value means more progression steps. Default is 2.
+void jpegli_set_progressive_level(j_compress_ptr cinfo, int level);
+
+// If this function is called before starting compression, the quality and
+// linear quality parameters will be used to scale the standard quantization
+// tables from Annex K of the JPEG standard. By default jpegli uses a different
+// set of quantization tables and uses different scaling parameters for DC and
+// AC coefficients. Must be called before jpegli_set_defaults().
+void jpegli_use_standard_quant_tables(j_compress_ptr cinfo);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_ENCODE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc
new file mode 100644
index 0000000000..4358b2b6e0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc
@@ -0,0 +1,856 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+// Parameters of one encoder API test case.
+struct TestConfig {
+  // Input image description and pixel data.
+  TestImage input;
+  // Compression settings passed to the encoder.
+  CompressParams jparams;
+  // How the input is handed to the encoder.
+  JpegIOMode input_mode = PIXELS;
+  // Upper bound on the compressed size in bits per pixel. Zero-initialized so
+  // that configs which never set it do not carry an indeterminate value.
+  double max_bpp = 0.0;
+  // Threshold passed to VerifyOutputImage(). Zero-initialized for the same
+  // reason.
+  double max_dist = 0.0;
+};
+
+// Fixture for tests that are parameterized by a TestConfig.
+class EncodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+// Fills `input` with generated pixels and, depending on `input_mode`,
+// additionally generates the raw data or the coefficients from them.
+void GenerateInput(JpegIOMode input_mode, const CompressParams& jparams,
+                   TestImage* input) {
+  GeneratePixels(input);
+  switch (input_mode) {
+    case RAW_DATA:
+      GenerateRawData(jparams, input);
+      break;
+    case COEFFICIENTS:
+      GenerateCoeffs(jparams, input);
+      break;
+    default:
+      // Pixel input needs nothing beyond the generated pixels.
+      break;
+  }
+}
+
+// Encodes the configured input with jpegli, checks the size of the result
+// (unless an ICC profile is embedded), decodes it with libjpeg and compares
+// the decoded image with the input.
+TEST_P(EncodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  const CompressParams& jparams = config.jparams;
+  GenerateInput(config.input_mode, jparams, &config.input);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithJpegli(config.input, jparams, &compressed));
+  if (jparams.icc.empty()) {
+    const auto num_pixels = config.input.xsize * config.input.ysize;
+    double bpp = compressed.size() * 8.0 / num_pixels;
+    printf("bpp: %f\n", bpp);
+    EXPECT_LT(bpp, config.max_bpp);
+  }
+
+  DecompressParams dparams;
+  if (config.input_mode == COEFFICIENTS) {
+    dparams.output_mode = COEFFICIENTS;
+  } else {
+    dparams.output_mode = PIXELS;
+  }
+  const bool encoded_as_gray =
+      jparams.set_jpeg_colorspace && jparams.jpeg_color_space == JCS_GRAYSCALE;
+  if (!encoded_as_gray) {
+    dparams.set_out_color_space = true;
+    dparams.out_color_space = config.input.color_space;
+  } else {
+    // The reference image has to be grayscale as well for the comparison.
+    ConvertToGrayscale(&config.input);
+  }
+
+  TestImage output;
+  DecodeWithLibjpeg(jparams, dparams, compressed, &output);
+  VerifyOutputImage(config.input, output, config.max_dist);
+}
+
+// Encodes the same image twice with one compressor object and checks that
+// both runs produce byte-identical output.
+TEST(EncodeAPITest, ReuseCinfoSameImageTwice) {
+  TestImage input;
+  input.xsize = 129;
+  input.ysize = 73;
+  CompressParams jparams;
+  GenerateInput(PIXELS, jparams, &input);
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  std::vector<uint8_t> compressed0;
+  std::vector<uint8_t> compressed1;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    // Save the first result before the destination is set up again for the
+    // second run.
+    compressed0.assign(buffer, buffer + buffer_size);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    compressed1.assign(buffer, buffer + buffer_size);
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+  ASSERT_EQ(compressed0.size(), compressed1.size());
+  EXPECT_EQ(0,
+            memcmp(compressed0.data(), compressed1.data(), compressed0.size()));
+}
+
+// Builds the configs shared by the ReuseCinfo* tests: every combination of
+// sampling factor {1, 2}, progressive mode {0, 2} and optimize_coding {0, 1},
+// except progressive together with optimize_coding. Each config gets its own
+// image size and generated pixels.
+std::vector<TestConfig> GenerateBasicConfigs() {
+  std::vector<TestConfig> configs;
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      for (int optimize : {0, 1}) {
+        const bool skip_combination = progr != 0 && optimize != 0;
+        if (!skip_combination) {
+          TestConfig cfg;
+          cfg.input.xsize = 257 + samp * 37;
+          cfg.input.ysize = 265 + optimize * 17;
+          cfg.jparams.h_sampling = {samp, 1, 1};
+          cfg.jparams.v_sampling = {samp, 1, 1};
+          cfg.jparams.progressive_mode = progr;
+          cfg.jparams.optimize_coding = optimize;
+          cfg.max_dist = 2.4f;
+          GeneratePixels(&cfg.input);
+          configs.push_back(cfg);
+        }
+      }
+    }
+  }
+  return configs;
+}
+
+// Encodes all basic configs back to back with one compressor object into the
+// same memory destination, then decodes the concatenated streams with one
+// libjpeg decompressor object and verifies every decoded image.
+TEST(EncodeAPITest, ReuseCinfoSameMemOutput) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      // The decoder below is libjpeg (jpeg_* functions), so the error handler
+      // uses the matching `jpeg` flavor, as the AbbreviatedStreams test does.
+      ERROR_HANDLER_SETUP(jpeg);
+      jpeg_create_decompress(&cinfo);
+      jpeg_mem_src(&cinfo, buffer, buffer_size);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        DecodeWithLibjpeg(all_configs[i].jparams, DecompressParams(), &cinfo,
+                          &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i],
+                      all_configs[i].max_dist);
+  }
+  if (buffer) free(buffer);
+}
+
+// Same as ReuseCinfoSameMemOutput, but the streams are written to and read
+// back from a temporary file through the stdio destination and source.
+TEST(EncodeAPITest, ReuseCinfoSameStdOutput) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  FILE* tmpf = tmpfile();
+  JXL_CHECK(tmpf);
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_stdio_dest(&cinfo, tmpf);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  // Read the file back from its beginning.
+  rewind(tmpf);
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      // The decoder below is libjpeg (jpeg_* functions), so the error handler
+      // uses the matching `jpeg` flavor, as the AbbreviatedStreams test does.
+      ERROR_HANDLER_SETUP(jpeg);
+      jpeg_create_decompress(&cinfo);
+      jpeg_stdio_src(&cinfo, tmpf);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        DecodeWithLibjpeg(all_configs[i].jparams, DecompressParams(), &cinfo,
+                          &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i],
+                      all_configs[i].max_dist);
+  }
+  fclose(tmpf);
+}
+
+// Reuses one compressor object across many combinations of input mode,
+// chroma subsampling, progressive mode and quality, and verifies the decoded
+// output of every combination.
+TEST(EncodeAPITest, ReuseCinfoChangeParams) {
+  TestImage input, output;
+  CompressParams jparams;
+  DecompressParams dparams;
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  std::vector<uint8_t> compressed;
+  jpeg_compress_struct cinfo;
+  // Threshold passed to VerifyOutputImage() for the given quality and
+  // sampling factors.
+  const auto max_rms = [](int q, int hs, int vs) {
+    if (hs == 1 && vs == 1) return q == 90 ? 2.2 : 0.6;
+    if (hs == 2 && vs == 2) return q == 90 ? 2.8 : 1.2;
+    return q == 90 ? 2.4 : 1.0;
+  };
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    input.xsize = 129;
+    input.ysize = 73;
+    dparams.set_out_color_space = true;
+    for (JpegIOMode input_mode : {PIXELS, RAW_DATA, PIXELS, COEFFICIENTS}) {
+      for (int h_samp : {2, 1}) {
+        for (int v_samp : {2, 1}) {
+          for (int progr : {0, 2}) {
+            for (int quality : {90, 100}) {
+              input.Clear();
+              input.color_space =
+                  (input_mode == RAW_DATA ? JCS_YCbCr : JCS_RGB);
+              jparams.quality = quality;
+              jparams.h_sampling = {h_samp, 1, 1};
+              jparams.v_sampling = {v_samp, 1, 1};
+              jparams.progressive_mode = progr;
+              printf(
+                  "Generating input with quality %d chroma subsampling %dx%d "
+                  "input mode %d progressive_mode %d\n",
+                  quality, h_samp, v_samp, input_mode, progr);
+              GenerateInput(input_mode, jparams, &input);
+              jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+              // For non-coefficient input, start a compression with default
+              // settings and abort it right away before the real encode, then
+              // set up the destination again.
+              if (input_mode != COEFFICIENTS) {
+                cinfo.image_width = input.xsize;
+                cinfo.image_height = input.ysize;
+                cinfo.input_components = input.components;
+                jpegli_set_defaults(&cinfo);
+                jpegli_start_compress(&cinfo, TRUE);
+                jpegli_abort_compress(&cinfo);
+                jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+              }
+              EncodeWithJpegli(input, jparams, &cinfo);
+              compressed.resize(buffer_size);
+              std::copy_n(buffer, buffer_size, compressed.data());
+              dparams.output_mode =
+                  input_mode == COEFFICIENTS ? COEFFICIENTS : PIXELS;
+              dparams.out_color_space = input.color_space;
+              output.Clear();
+              DecodeWithLibjpeg(jparams, dparams, compressed, &output);
+              VerifyOutputImage(input, output,
+                                max_rms(quality, h_samp, v_samp));
+            }
+          }
+        }
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+// Writes the tables and the image data of a 1x1 RGB image into two separate
+// streams and checks that libjpeg can decode the image after reading the
+// table stream first.
+TEST(EncodeAPITest, AbbreviatedStreams) {
+  uint8_t* table_stream = nullptr;
+  unsigned long table_stream_size = 0;
+  uint8_t* data_stream = nullptr;
+  unsigned long data_stream_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      // First stream: tables only.
+      jpegli_mem_dest(&cinfo, &table_stream, &table_stream_size);
+      cinfo.input_components = 3;
+      cinfo.in_color_space = JCS_RGB;
+      jpegli_set_defaults(&cinfo);
+      jpegli_write_tables(&cinfo);
+      // Second stream: the image, started with write_all_tables == FALSE.
+      jpegli_mem_dest(&cinfo, &data_stream, &data_stream_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.optimize_coding = FALSE;
+      jpegli_set_progressive_level(&cinfo, 0);
+      jpegli_start_compress(&cinfo, FALSE);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    EXPECT_LT(data_stream_size, 50);
+    jpegli_destroy_compress(&cinfo);
+  }
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpeg);
+      jpeg_create_decompress(&cinfo);
+      // Read the table stream first, then the image stream.
+      jpeg_mem_src(&cinfo, table_stream, table_stream_size);
+      jpeg_read_header(&cinfo, FALSE);
+      jpeg_mem_src(&cinfo, data_stream, data_stream_size);
+      jpeg_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      EXPECT_EQ(3, cinfo.num_components);
+      jpeg_start_decompress(&cinfo);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpeg_read_scanlines(&cinfo, row, 1);
+      jxl::msan::UnpoisonMemory(image, 3);
+      EXPECT_EQ(0, image[0]);
+      EXPECT_EQ(0, image[1]);
+      EXPECT_EQ(0, image[2]);
+      jpeg_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  if (table_stream) free(table_stream);
+  if (data_stream) free(data_stream);
+}
+
+// Copies the quantization table used by each component of `cinfo` into
+// `quant_tables`, DCTSIZE2 consecutive values per component, in component
+// order.
+void CopyQuantTables(j_compress_ptr cinfo, uint16_t* quant_tables) {
+  uint16_t* dest = quant_tables;
+  for (int comp = 0; comp < cinfo->num_components; ++comp) {
+    const int table_index = cinfo->comp_info[comp].quant_tbl_no;
+    const JQUANT_TBL* table = cinfo->quant_tbl_ptrs[table_index];
+    std::copy_n(table->quantval, DCTSIZE2, dest);
+    dest += DCTSIZE2;
+  }
+}
+
+TEST(EncodeAPITest, QualitySettings) {
+  // Test that jpegli_set_quality, jpegli_set_linear_quality and
+  // jpegli_quality_scaling are consistent with each other.
+  uint16_t quant_tables0[3 * DCTSIZE2];
+  uint16_t quant_tables1[3 * DCTSIZE2];
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    for (boolean baseline : {FALSE, TRUE}) {
+      for (int q = 1; q <= 100; ++q) {
+        // Reference tables from jpegli_set_quality().
+        jpegli_set_quality(&cinfo, q, baseline);
+        CopyQuantTables(&cinfo, quant_tables0);
+        // The linear quality API with the scaled quality must match.
+        jpegli_set_linear_quality(&cinfo, jpegli_quality_scaling(q), baseline);
+        CopyQuantTables(&cinfo, quant_tables1);
+        EXPECT_EQ(0,
+                  memcmp(quant_tables0, quant_tables1, sizeof(quant_tables0)));
+#if JPEG_LIB_VERSION >= 70
+        // So must jpegli_default_qtables() with the same per-table scale
+        // factors.
+        for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+          cinfo.q_scale_factor[i] = jpegli_quality_scaling(q);
+        }
+        jpegli_default_qtables(&cinfo, baseline);
+        CopyQuantTables(&cinfo, quant_tables1);
+        EXPECT_EQ(0,
+                  memcmp(quant_tables0, quant_tables1, sizeof(quant_tables0)));
+#endif
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  // Test jpegli_quality_scaling for some specific values, including
+  // out-of-range qualities.
+  EXPECT_EQ(5000, jpegli_quality_scaling(-1));
+  EXPECT_EQ(5000, jpegli_quality_scaling(0));
+  EXPECT_EQ(5000, jpegli_quality_scaling(1));
+  EXPECT_EQ(100, jpegli_quality_scaling(50));
+  EXPECT_EQ(50, jpegli_quality_scaling(75));
+  EXPECT_EQ(20, jpegli_quality_scaling(90));
+  EXPECT_EQ(0, jpegli_quality_scaling(100));
+  EXPECT_EQ(0, jpegli_quality_scaling(101));
+}
+
+std::vector<TestConfig> GenerateTests() {  // builds every parameterized case
+  std::vector<TestConfig> all_tests;
+  for (int h_samp : {1, 2}) {  // luma sampling x progressive x optimize
+    for (int v_samp : {1, 2}) {
+      for (int progr : {0, 2}) {
+        for (int optimize : {0, 1}) {
+          if (progr && optimize) continue;
+          TestConfig config;
+          config.jparams.h_sampling = {h_samp, 1, 1};
+          config.jparams.v_sampling = {v_samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          if (!progr) {
+            config.jparams.optimize_coding = optimize;
+          }
+          const float kMaxBpp[4] = {1.55, 1.45, 1.45, 1.32};
+          const float kMaxDist[4] = {1.95, 2.1, 2.1, 2.0};
+          const int idx = v_samp * 2 + h_samp - 3;  // 0..3 for factors in {1,2}
+          config.max_bpp =
+              kMaxBpp[idx] * (optimize ? 0.97 : 1.0) * (progr ? 0.97 : 1.0);
+          config.max_dist = kMaxDist[idx];
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  {
+    TestConfig config;
+    config.jparams.quality = 100;
+    config.max_bpp = 6.6;
+    config.max_dist = 0.6;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.quality = 80;
+    config.max_bpp = 1.05;
+    config.max_dist = 2.7;
+    all_tests.push_back(config);
+  }
+  for (int samp : {1, 2}) {  // adaptive quantization switched off
+    for (int progr : {0, 2}) {
+      for (int optimize : {0, 1}) {
+        if (progr && optimize) continue;
+        TestConfig config;
+        config.input.xsize = 257;
+        config.input.ysize = 265;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = progr;
+        if (!progr) {
+          config.jparams.optimize_coding = optimize;
+        }
+        config.jparams.use_adaptive_quantization = false;
+        config.max_bpp = 2.05f;
+        config.max_dist = 2.3f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 4}) {  // sampling factors 1/2/4 on comps 0 and 2
+    for (int v0_samp : {1, 2, 4}) {
+      for (int h2_samp : {1, 2, 4}) {
+        for (int v2_samp : {1, 2, 4}) {
+          TestConfig config;
+          config.input.xsize = 137;
+          config.input.ysize = 75;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.max_bpp = 2.5;
+          config.max_dist = 12.0;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 3}) {  // sampling factor 3 on comps 0 and 2
+    for (int v0_samp : {1, 3}) {
+      for (int h2_samp : {1, 3}) {
+        for (int v2_samp : {1, 3}) {
+          TestConfig config;
+          config.input.xsize = 205;
+          config.input.ysize = 99;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.max_bpp = 2.5;
+          config.max_dist = 10.0;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 3, 4}) {
+    for (int v0_samp : {1, 2, 3, 4}) {
+      TestConfig config;
+      config.input.xsize = 217;
+      config.input.ysize = 129;
+      config.jparams.progressive_mode = 2;
+      config.jparams.h_sampling = {h0_samp, 1, 1};
+      config.jparams.v_sampling = {v0_samp, 1, 1};
+      config.max_bpp = 2.0;
+      config.max_dist = 5.5;
+      all_tests.push_back(config);
+    }
+  }
+  for (int p = 0; p < 3 + kNumTestScripts; ++p) {  // each progressive_mode
+    TestConfig config;
+    config.jparams.progressive_mode = p;
+    const float kMaxBpp[] = {1.59, 1.51, 1.48, 1.59, 1.55, 1.55, 1.51};
+    config.max_bpp = kMaxBpp[p];  // table has 7 entries; p must stay below 7
+    config.max_dist = 2.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.simple_progression = true;
+    config.max_bpp = 1.48;
+    config.max_dist = 2.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.input_mode = COEFFICIENTS;
+    config.jparams.h_sampling = {2, 1, 1};
+    config.jparams.v_sampling = {2, 1, 1};
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.max_bpp = 16;
+    config.max_dist = 0.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.xyb_mode = true;
+    config.jparams.progressive_mode = 2;
+    config.max_bpp = 1.5;
+    config.max_dist = 3.5;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.libjpeg_mode = true;
+    config.max_bpp = 2.1;
+    config.max_dist = 1.7;
+    all_tests.push_back(config);
+  }
+
+  for (J_COLOR_SPACE in_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+    for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+      if (jpeg_color_space == JCS_RGB && in_color_space == JCS_YCbCr) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = in_color_space;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.max_bpp = jpeg_color_space == JCS_RGB ? 4.5 : 1.85;
+      config.max_dist = jpeg_color_space == JCS_RGB ? 1.4 : 2.05;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE in_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+      if (jpeg_color_space == JCS_CMYK && in_color_space == JCS_YCCK) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = in_color_space;
+      if (in_color_space != jpeg_color_space) {
+        config.jparams.set_jpeg_colorspace = true;
+        config.jparams.jpeg_color_space = jpeg_color_space;
+      }
+      config.max_bpp = jpeg_color_space == JCS_CMYK ? 4.0 : 3.6;
+      config.max_dist = jpeg_color_space == JCS_CMYK ? 1.2 : 1.5;
+      all_tests.push_back(config);
+    }
+  }
+  {
+    TestConfig config;
+    config.input.color_space = JCS_YCbCr;
+    config.max_bpp = 1.6;
+    config.max_dist = 1.35;
+    all_tests.push_back(config);
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.color_space = JCS_GRAYSCALE;
+    config.jparams.xyb_mode = xyb;
+    config.max_bpp = 1.35;
+    config.max_dist = 1.4;
+    all_tests.push_back(config);
+  }
+  for (int channels = 1; channels <= 4; ++channels) {
+    TestConfig config;
+    config.input.color_space = JCS_UNKNOWN;
+    config.input.components = channels;
+    config.max_bpp = 1.35 * channels;
+    config.max_dist = 1.4;
+    all_tests.push_back(config);
+  }
+  for (size_t r : {1, 3, 17, 1024}) {  // restart interval values
+    for (int progr : {0, 2}) {
+      TestConfig config;
+      config.jparams.restart_interval = r;
+      config.jparams.progressive_mode = progr;
+      config.max_bpp = 1.58 + 5.5 / r;
+      config.max_dist = 2.2;
+      all_tests.push_back(config);
+    }
+  }
+  for (size_t rr : {1, 3, 8, 100}) {
+    TestConfig config;
+    config.jparams.restart_in_rows = rr;
+    config.max_bpp = 1.6;
+    config.max_dist = 2.2;
+    all_tests.push_back(config);
+  }
+  for (int type : {0, 1, 10, 100, 10000}) {  // custom quantization tables
+    for (int scale : {1, 50, 100, 200, 500}) {
+      for (bool add_raw : {false, true}) {
+        for (bool baseline : {true, false}) {
+          if (!baseline && (add_raw || type * scale < 25500)) continue;
+          TestConfig config;
+          config.input.xsize = 64;
+          config.input.ysize = 64;
+          CustomQuantTable table;
+          table.table_type = type;
+          table.scale_factor = scale;
+          table.force_baseline = baseline;
+          table.add_raw = add_raw;
+          table.Generate();
+          config.jparams.optimize_coding = 1;
+          config.jparams.quant_tables.push_back(table);
+          config.jparams.quant_indexes = {0, 0, 0};
+          float q = (type == 0 ? 16 : type) * scale * 0.01f;
+          if (baseline && !add_raw) q = std::max(1.0f, std::min(255.0f, q));
+          config.max_bpp = 1.5f + 25.0f / q;
+          config.max_dist = 0.6f + 0.25f * q;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {  // bits of qidx pick table 0 or 1
+    if (qidx == 3) continue;
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                    (qidx >> 0) & 1};
+    config.max_bpp = 2.25;
+    config.max_dist = 2.8;
+    all_tests.push_back(config);
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (int slot_idx = 0; slot_idx < 2; ++slot_idx) {
+      if (qidx == 0 && slot_idx == 0) continue;
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      CustomQuantTable table;
+      table.slot_idx = slot_idx;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+      config.max_bpp = 2.3;
+      config.max_dist = 2.9;
+      all_tests.push_back(config);
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (bool xyb : {false, true}) {
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.xyb_mode = xyb;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      {
+        CustomQuantTable table;
+        table.slot_idx = 0;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      {
+        CustomQuantTable table;
+        table.slot_idx = 1;
+        table.table_type = 20;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      config.max_bpp = 2.0;
+      config.max_dist = 3.85;
+      all_tests.push_back(config);
+    }
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.xyb_mode = xyb;
+    config.jparams.quant_indexes = {0, 1, 2};
+    {
+      CustomQuantTable table;
+      table.slot_idx = 0;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 1;
+      table.table_type = 20;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 2;
+      table.table_type = 30;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    config.max_bpp = 1.5;
+    config.max_dist = 3.75;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.comp_ids = {7, 17, 177};
+    config.input.xsize = config.input.ysize = 128;
+    config.max_bpp = 2.25;
+    config.max_dist = 2.4;
+    all_tests.push_back(config);
+  }
+  for (int override_JFIF : {-1, 0, 1}) {
+    for (int override_Adobe : {-1, 0, 1}) {
+      if (override_JFIF == -1 && override_Adobe == -1) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 128;
+      config.jparams.override_JFIF = override_JFIF;
+      config.jparams.override_Adobe = override_Adobe;
+      config.max_bpp = 2.25;
+      config.max_dist = 2.4;
+      all_tests.push_back(config);
+    }
+  }
+  {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.max_bpp = 1.85;
+    config.max_dist = 2.05;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  for (size_t icc_size : {728, 70000, 1000000}) {  // synthetic ICC payloads
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.max_dist = 2.05;
+    config.jparams.icc.resize(icc_size);
+    for (size_t i = 0; i < icc_size; ++i) {
+      config.jparams.icc[i] = (i * 17) & 0xff;
+    }
+    all_tests.push_back(config);
+  }
+  for (JpegIOMode input_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.input_mode = input_mode;
+    if (input_mode == RAW_DATA) {
+      config.input.color_space = JCS_YCbCr;
+    }
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.max_bpp = 1.85;
+    config.max_dist = 2.05;
+    if (input_mode == COEFFICIENTS) {
+      config.max_bpp = 3.5;
+      config.max_dist = 0.0;
+    }
+    all_tests.push_back(config);
+    config.jparams.use_flat_dc_luma_code = true;  // same case, flat DC code
+    all_tests.push_back(config);
+  }
+  for (int xsize : {640, 641, 648, 649}) {
+    for (int ysize : {640, 641, 648, 649}) {
+      for (int h_sampling : {1, 2}) {
+        for (int v_sampling : {1, 2}) {
+          if (h_sampling == 1 && v_sampling == 1) continue;
+          for (int progr : {0, 2}) {
+            TestConfig config;
+            config.input.xsize = xsize;
+            config.input.ysize = ysize;
+            config.input.color_space = JCS_YCbCr;
+            config.jparams.h_sampling = {h_sampling, 1, 1};
+            config.jparams.v_sampling = {v_sampling, 1, 1};
+            config.jparams.progressive_mode = progr;
+            config.input_mode = RAW_DATA;
+            config.max_bpp = 1.75;
+            config.max_dist = 2.0;
+            all_tests.push_back(config);
+            config.input_mode = COEFFICIENTS;  // reuse config for coeff input
+            if (xsize & 1) {
+              config.jparams.add_marker = true;
+            }
+            config.max_bpp = 24.0;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+  for (JpegliDataType data_type : {JPEGLI_TYPE_UINT16, JPEGLI_TYPE_FLOAT}) {
+    for (JpegliEndianness endianness :
+         {JPEGLI_LITTLE_ENDIAN, JPEGLI_BIG_ENDIAN, JPEGLI_NATIVE_ENDIAN}) {
+      J_COLOR_SPACE colorspace[4] = {JCS_GRAYSCALE, JCS_UNKNOWN, JCS_RGB,
+                                     JCS_CMYK};
+      float max_bpp[4] = {1.32, 2.7, 1.6, 4.0};
+      for (int channels = 1; channels <= 4; ++channels) {
+        TestConfig config;
+        config.input.data_type = data_type;
+        config.input.endianness = endianness;
+        config.input.components = channels;
+        config.input.color_space = colorspace[channels - 1];
+        config.max_bpp = max_bpp[channels - 1];
+        config.max_dist = 2.2;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (int smoothing : {1, 5, 50, 100}) {  // smoothing_factor values
+    for (int h_sampling : {1, 2}) {
+      for (int v_sampling : {1, 2}) {
+        TestConfig config;
+        config.input.xsize = 257;
+        config.input.ysize = 265;
+        config.jparams.smoothing_factor = smoothing;
+        config.jparams.h_sampling = {h_sampling, 1, 1};
+        config.jparams.v_sampling = {v_sampling, 1, 1};
+        config.max_bpp = 1.85;
+        config.max_dist = 3.05f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  const bool raw_in = (c.input_mode == RAW_DATA);
+  const bool coeffs_in = (c.input_mode == COEFFICIENTS);
+  if (raw_in || coeffs_in) {  // any other input mode adds no suffix
+    os << (raw_in ? "RawDataIn" : "WriteCoeffs");
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<EncodeAPITestParam::ParamType>& info) {
+  std::stringstream description;  // filled through the param's operator<<
+  description << info.param;
+  return description.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(EncodeAPITest, EncodeAPITestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);  // names each instance
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/encode_internal.h b/third_party/jpeg-xl/lib/jpegli/encode_internal.h
new file mode 100644
index 0000000000..8f08272fd2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode_internal.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENCODE_INTERNAL_H_
+#define LIB_JPEGLI_ENCODE_INTERNAL_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/encode.h"
+
+namespace jpegli {
+
+constexpr unsigned char kICCSignature[12] = {  // ASCII "ICC_PROFILE" + NUL
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+constexpr int kICCMarker = JPEG_APP0 + 2;  // second marker code after APP0
+
+struct JPEGHuffmanCode {
+  // Bit length histogram: counts[n] = number of symbols with an n-bit code.
+  uint32_t counts[kJpegHuffmanMaxBitLength + 1];
+  // Symbol values sorted by increasing bit lengths.
+  uint32_t values[kJpegHuffmanAlphabetSize + 1];
+  // The index of the Huffman code in the current set of Huffman codes. For AC
+  // component Huffman codes, 0x10 is added to the index.
+  int slot_id;
+  boolean sent_table;  // NOTE(review): presumably "already written" — confirm
+};
+
+// DCTCodingState: maximum number of correction bits to buffer (1 << 16).
+const int kJPEGMaxCorrectionBits = 1u << 16;
+
+constexpr int kDefaultProgressiveLevel = 0;  // NOTE(review): meaning of 0 TBC
+
+struct HuffmanCodeTable {
+  int depth[256];  // NOTE(review): presumably code length per symbol — confirm
+  int code[256];   // NOTE(review): presumably code bits per symbol — confirm
+};
+
+struct ScanCodingInfo {
+  uint32_t dc_tbl_idx[MAX_COMPS_IN_SCAN];  // sized for the comps of one scan
+  uint32_t ac_tbl_idx[MAX_COMPS_IN_SCAN];  // sized for the comps of one scan
+  // Number of Huffman codes defined in the DHT segment preceding this scan.
+  size_t num_huffman_codes;
+};
+
+typedef int16_t coeff_t;  // element type of the DCT blocks the encoder scans
+
+}  // namespace jpegli
+
+struct jpeg_comp_master {  // jpegli's definition behind cinfo->master
+  jpegli::RowBuffer<float> input_buffer[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float>* smooth_input[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float>* raw_data[jpegli::kMaxComponents];
+  bool force_baseline;
+  bool xyb_mode;
+  uint8_t cicp_transfer_function;
+  bool use_std_tables;
+  bool use_adaptive_quantization;
+  int progressive_level;
+  size_t xsize_blocks;
+  size_t ysize_blocks;
+  size_t blocks_per_iMCU_row;
+  jpegli::ScanCodingInfo* scan_coding_info;
+  JpegliDataType data_type;
+  JpegliEndianness endianness;
+  void (*input_method)(const uint8_t* row_in, size_t len,
+                       float* row_out[jpegli::kMaxComponents]);
+  void (*color_transform)(float* row[jpegli::kMaxComponents], size_t len);
+  void (*downsample_method[jpegli::kMaxComponents])(
+      float* rows_in[MAX_SAMP_FACTOR], size_t len, float* row_out);
+  float* quant_mul[jpegli::kMaxComponents];
+  float* zero_bias_offset[jpegli::kMaxComponents];
+  float* zero_bias_mul[jpegli::kMaxComponents];
+  int h_factor[jpegli::kMaxComponents];
+  int v_factor[jpegli::kMaxComponents];
+  jpegli::JPEGHuffmanCode* huffman_codes;
+  size_t num_huffman_codes;
+  jpegli::HuffmanCodeTable huff_tables[8];  // NOTE(review): likely 4 DC + 4 AC
+  float* diff_buffer;
+  jpegli::RowBuffer<float> fuzzy_erosion_tmp;
+  jpegli::RowBuffer<float> pre_erosion;
+  jpegli::RowBuffer<float> quant_field;
+  jvirt_barray_ptr* coeff_buffers;  // indexed by component index
+  size_t next_input_row;
+  size_t next_iMCU_row;
+  size_t last_dht_index;
+  size_t last_restart_interval;
+  JCOEF last_dc_coeff[MAX_COMPS_IN_SCAN];
+  jpegli::JpegBitWriter bw;
+  float* dct_buffer;
+  int32_t* block_tmp;
+};
+
+#endif  // LIB_JPEGLI_ENCODE_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/entropy_coding.cc b/third_party/jpeg-xl/lib/jpegli/entropy_coding.cc
new file mode 100644
index 0000000000..110a36a3e9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/entropy_coding.cc
@@ -0,0 +1,605 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/entropy_coding.h"
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jxl/base/bits.h"
+
+namespace jpegli {
+namespace {
+
+float HistogramCost(const Histogram& histo) {
+  std::vector<uint32_t> freqs(kJpegHuffmanAlphabetSize + 1);
+  std::vector<uint8_t> lengths(kJpegHuffmanAlphabetSize + 1);
+  for (size_t sym = 0; sym < kJpegHuffmanAlphabetSize; ++sym) {
+    freqs[sym] = histo.count[sym];
+  }
+  freqs[kJpegHuffmanAlphabetSize] = 1;  // extra symbol with count 1
+  CreateHuffmanTree(freqs.data(), freqs.size(), kJpegHuffmanMaxBitLength,
+                    lengths.data());
+  // Estimated size in bits: table header plus the coded symbols.
+  size_t header_bits = (1 + kJpegHuffmanMaxBitLength) * 8;
+  size_t data_bits = 0;
+  for (size_t sym = 0; sym < kJpegHuffmanAlphabetSize; ++sym) {
+    if (lengths[sym] == 0) continue;  // symbol has no code
+    header_bits += 8;
+    data_bits += freqs[sym] * lengths[sym];
+  }
+  return header_bits + data_bits;
+}
+
+void AddHistograms(const Histogram& a, const Histogram& b, Histogram* c) {
+  for (size_t sym = 0; sym != kJpegHuffmanAlphabetSize; ++sym) {
+    c->count[sym] = a.count[sym] + b.count[sym];  // element-wise sum
+  }
+}
+
+bool IsEmptyHistogram(const Histogram& histo) {
+  size_t sym = 0;  // advances past leading zero counts
+  while (sym < kJpegHuffmanAlphabetSize && !histo.count[sym]) {
+    ++sym;
+  }
+  return sym == kJpegHuffmanAlphabetSize;
+}
+
+}  // namespace
+
+void ClusterJpegHistograms(const Histogram* histograms, size_t num,
+                           JpegClusteredHistograms* clusters) {
+  clusters->histogram_indexes.resize(num);
+  std::vector<uint32_t> slot_histograms;  // slot -> cluster histogram index
+  std::vector<float> slot_costs;          // slot -> accumulated cost
+  for (size_t i = 0; i < num; ++i) {
+    const Histogram& cur = histograms[i];
+    if (IsEmptyHistogram(cur)) {
+      continue;  // empty inputs keep their resize()-initialized index
+    }
+    float best_cost = HistogramCost(cur);  // cost of a code of its own
+    size_t best_slot = slot_histograms.size();
+    for (size_t j = 0; j < slot_histograms.size(); ++j) {
+      size_t prev_idx = slot_histograms[j];
+      const Histogram& prev = clusters->histograms[prev_idx];
+      Histogram combined;
+      AddHistograms(prev, cur, &combined);
+      float combined_cost = HistogramCost(combined);
+      float cost = combined_cost - slot_costs[j];  // extra cost of merging
+      if (cost < best_cost) {
+        best_cost = cost;
+        best_slot = j;
+      }
+    }
+    if (best_slot == slot_histograms.size()) {
+      // Create new histogram.
+      size_t histogram_index = clusters->histograms.size();
+      clusters->histograms.push_back(cur);
+      clusters->histogram_indexes[i] = histogram_index;
+      if (best_slot < 4) {
+        // We have a free slot, so we put the new histogram there.
+        slot_histograms.push_back(histogram_index);
+        slot_costs.push_back(best_cost);
+      } else {
+        // TODO(szabadka) Find the best histogram to replace.
+        best_slot = (clusters->slot_ids.back() + 1) % 4;  // cycle the 4 slots
+      }
+      slot_histograms[best_slot] = histogram_index;
+      slot_costs[best_slot] = best_cost;
+      clusters->slot_ids.push_back(best_slot);
+    } else {
+      // Merge this histogram with a previous one.
+      size_t histogram_index = slot_histograms[best_slot];
+      const Histogram& prev = clusters->histograms[histogram_index];
+      AddHistograms(prev, cur, &clusters->histograms[histogram_index]);
+      clusters->histogram_indexes[i] = histogram_index;
+      JXL_ASSERT(clusters->slot_ids[histogram_index] == best_slot);
+      slot_costs[best_slot] += best_cost;
+    }
+  }
+}
+
+void BuildJpegHuffmanCode(const Histogram& histo, JPEGHuffmanCode* huff) {
+  std::vector<uint32_t> freqs(kJpegHuffmanAlphabetSize + 1);
+  std::vector<uint8_t> lengths(kJpegHuffmanAlphabetSize + 1);
+  for (size_t sym = 0; sym < kJpegHuffmanAlphabetSize; ++sym) {
+    freqs[sym] = histo.count[sym];
+  }
+  freqs[kJpegHuffmanAlphabetSize] = 1;  // extra symbol with count 1
+  CreateHuffmanTree(freqs.data(), freqs.size(), kJpegHuffmanMaxBitLength,
+                    lengths.data());
+  std::fill(std::begin(huff->counts), std::end(huff->counts), 0);
+  std::fill(std::begin(huff->values), std::end(huff->values), 0);
+  for (size_t sym = 0; sym <= kJpegHuffmanAlphabetSize; ++sym) {
+    const uint8_t len = lengths[sym];
+    if (len == 0) continue;  // symbol has no code
+    ++huff->counts[len];
+  }
+  int next_pos[kJpegHuffmanMaxBitLength + 1] = {0};  // write cursor per length
+  for (size_t len = 1; len <= kJpegHuffmanMaxBitLength; ++len) {
+    next_pos[len] = next_pos[len - 1] + huff->counts[len - 1];
+  }
+  for (size_t sym = 0; sym <= kJpegHuffmanAlphabetSize; ++sym) {
+    const uint8_t len = lengths[sym];
+    if (len == 0) continue;
+    huff->values[next_pos[len]++] = sym;
+  }
+}
+
+void AddJpegHuffmanCode(const Histogram& histogram, size_t slot_id,
+                        JPEGHuffmanCode* huff_codes, size_t* num_huff_codes) {
+  JPEGHuffmanCode built = {};
+  built.slot_id = slot_id;
+  BuildJpegHuffmanCode(histogram, &built);
+  JPEGHuffmanCode* dest = &huff_codes[(*num_huff_codes)++];  // append slot
+  memcpy(dest, &built, sizeof(built));
+}
+
+namespace {
+void SetJpegHuffmanCode(const JpegClusteredHistograms& clusters,
+                        size_t histogram_id, size_t slot_id_offset,
+                        std::vector<uint32_t>& slot_histograms,
+                        uint32_t* slot_id, bool* is_baseline,
+                        JPEGHuffmanCode* huff_codes, size_t* num_huff_codes) {
+  JXL_ASSERT(histogram_id < clusters.histogram_indexes.size());
+  const uint32_t cluster_idx = clusters.histogram_indexes[histogram_id];
+  const uint32_t slot = clusters.slot_ids[cluster_idx];
+  // Using a slot above 1 marks the stream as non-baseline.
+  if (slot > 1) {
+    *is_baseline = false;
+  }
+  *slot_id = slot + (slot_id_offset / 4);
+  if (slot_histograms[slot] == cluster_idx) return;  // slot already holds it
+  AddJpegHuffmanCode(clusters.histograms[cluster_idx], slot_id_offset + slot,
+                     huff_codes, num_huff_codes);
+  slot_histograms[slot] = cluster_idx;
+}
+
+struct DCTState {
+  int eob_run = 0;  // pending end-of-band blocks not yet counted as a symbol
+  size_t num_refinement_bits = 0;  // refinement bits gathered for that run
+  Histogram* ac_histo = nullptr;   // histogram that receives the run symbol
+};
+
+static JXL_INLINE void ProcessFlush(DCTState* s) {
+  // Counts a pending end-of-band run as the symbol floor(log2(run)) << 4.
+  if (s->eob_run > 0) {
+    const int run_log2 = jxl::FloorLog2Nonzero<uint32_t>(s->eob_run);
+    ++s->ac_histo->count[run_log2 << 4u];
+    s->eob_run = 0;
+  }
+  s->num_refinement_bits = 0;
+}
+
+static JXL_INLINE void ProcessEndOfBand(DCTState* s, size_t new_refinement_bits,
+                                        Histogram* new_ac_histo) {
+  if (s->eob_run == 0) s->ac_histo = new_ac_histo;  // a new run starts here
+  ++s->eob_run;
+  s->num_refinement_bits += new_refinement_bits;
+  const bool run_full = (s->eob_run == 0x7FFF);
+  const bool bits_full =
+      s->num_refinement_bits > kJPEGMaxCorrectionBits - kDCTBlockSize + 1;
+  if (run_full || bits_full) {
+    ProcessFlush(s);
+  }
+}
+
+bool ProcessDCTBlockSequential(const coeff_t* coeffs, Histogram* dc_histo,
+                               Histogram* ac_histo, coeff_t* last_dc_coeff) {
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = coeffs[0];
+  temp = temp2 - *last_dc_coeff;  // DC is counted as a delta to the last DC
+  *last_dc_coeff = temp2;
+  temp2 = temp;
+  if (temp < 0) {
+    temp = -temp;
+    if (temp < 0) return false;  // magnitude does not fit in coeff_t
+    temp2--;
+  }
+  int dc_nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+  ++dc_histo->count[dc_nbits];
+  if (dc_nbits >= 12) return false;  // reject DC deltas of 12 or more bits
+  int r = 0;  // length of the current run of zero AC coefficients
+  for (int k = 1; k < 64; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // magnitude does not fit in coeff_t
+      temp2 = ~temp;
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {
+      ++ac_histo->count[0xf0];  // one 0xf0 symbol per 16 skipped zeros
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    if (ac_nbits >= 16) return false;  // reject AC values of 16 or more bits
+    int symbol = (r << 4u) + ac_nbits;  // high nibble: run, low nibble: size
+    ++ac_histo->count[symbol];
+    r = 0;
+  }
+  if (r > 0) {
+    ++ac_histo->count[0];  // block ends in zeros: count symbol 0
+  }
+  return true;
+}
+
+bool ProcessDCTBlockProgressive(const coeff_t* coeffs, Histogram* dc_histo,
+                                Histogram* ac_histo, int Ss, int Se, int Al,
+                                DCTState* s, coeff_t* last_dc_coeff) {
+  bool eob_run_allowed = Ss > 0;  // runs span blocks only when DC is excluded
+  coeff_t temp2;
+  coeff_t temp;
+  if (Ss == 0) {
+    temp2 = coeffs[0] >> Al;
+    temp = temp2 - *last_dc_coeff;  // DC is counted as a delta to the last DC
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // magnitude does not fit in coeff_t
+      temp2--;
+    }
+    int nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+    ++dc_histo->count[nbits];
+    ++Ss;  // the AC loop below starts after the DC coefficient
+  }
+  if (Ss > Se) {
+    return true;  // nothing but DC in this band
+  }
+  int r = 0;  // length of the current run of zero coefficients
+  for (int k = Ss; k <= Se; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;  // magnitude does not fit in coeff_t
+      temp >>= Al;
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;
+      temp2 = temp;
+    }
+    if (temp == 0) {  // became zero after the shift by Al
+      r++;
+      continue;
+    }
+    ProcessFlush(s);  // a pending end-of-band run is counted first
+    while (r > 15) {
+      ++ac_histo->count[0xf0];  // one 0xf0 symbol per 16 skipped zeros
+      r -= 16;
+    }
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int symbol = (r << 4u) + nbits;  // high nibble: run, low nibble: size
+    ++ac_histo->count[symbol];
+    r = 0;
+  }
+  if (r > 0) {
+    ProcessEndOfBand(s, 0, ac_histo);
+    if (!eob_run_allowed) {
+      ProcessFlush(s);  // count the end-of-band immediately
+    }
+  }
+  return true;
+}
+
+bool ProcessRefinementBits(const coeff_t* coeffs, Histogram* ac_histo, int Ss,
+                           int Se, int Al, DCTState* s) {
+  bool eob_run_allowed = Ss > 0;  // runs span blocks only when DC is excluded
+  if (Ss == 0) {
+    ++Ss;  // the DC coefficient is not handled here
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int abs_values[kDCTBlockSize];  // only entries Ss..Se are written and read
+  int eob = 0;  // last position whose shifted magnitude is exactly 1
+  for (int k = Ss; k <= Se; k++) {
+    const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]);
+    abs_values[k] = abs_val >> Al;
+    if (abs_values[k] == 1) {
+      eob = k;
+    }
+  }
+  int r = 0;  // length of the current run of zero coefficients
+  size_t num_refinement_bits = 0;
+  for (int k = Ss; k <= Se; k++) {
+    if (abs_values[k] == 0) {
+      r++;
+      continue;
+    }
+    while (r > 15 && k <= eob) {
+      ProcessFlush(s);
+      ++ac_histo->count[0xf0];  // one 0xf0 symbol per 16 skipped zeros
+      r -= 16;
+      num_refinement_bits = 0;
+    }
+    if (abs_values[k] > 1) {
+      ++num_refinement_bits;  // magnitudes above 1 add a bit, not a symbol
+      continue;
+    }
+    ProcessFlush(s);
+    int symbol = (r << 4u) + 1;  // high nibble: run, low nibble: size 1
+    ++ac_histo->count[symbol];
+    num_refinement_bits = 0;
+    r = 0;
+  }
+  if (r > 0 || num_refinement_bits > 0) {
+    ProcessEndOfBand(s, num_refinement_bits, ac_histo);
+    if (!eob_run_allowed) {
+      ProcessFlush(s);  // count the end-of-band immediately
+    }
+  }
+  return true;
+}
+
+void ProgressMonitorHistogramPass(j_compress_ptr cinfo, size_t scan_index,
+                                  size_t mcu_y) {
+  // Reports histogram-pass progress to the progress monitor, if one is set.
+  auto* progress = cinfo->progress;
+  if (progress == nullptr) return;
+  progress->completed_passes = 1 + scan_index;
+  progress->pass_counter = mcu_y;
+  progress->pass_limit = cinfo->total_iMCU_rows;
+  (*progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+bool ProcessScan(j_compress_ptr cinfo,  // histogram-gathering pass over one scan
+                 size_t scan_index, int* histo_index, Histogram* dc_histograms,
+                 Histogram* ac_histograms) {  // false as soon as a block helper fails
+  jpeg_comp_master* m = cinfo->master;
+  size_t restart_interval = RestartIntervalForScan(cinfo, scan_index);
+  int restarts_to_go = restart_interval;  // MCUs left until the next restart boundary
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};  // per-component state, zeroed at restarts
+  DCTState s;
+
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info->comps_in_scan > 1);
+  jpeg_component_info* base_comp =
+      &cinfo->comp_info[scan_info->component_index[0]];
+  // h_group / v_group act as numerators for converting number of blocks to
+  // number of MCU. In interleaved mode it is 1, so MCU is represented with
+  // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to
+  // be the sampling factor, consequently MCU is always represented with single
+  // block.
+  const int h_group = is_interleaved ? 1 : base_comp->h_samp_factor;
+  const int v_group = is_interleaved ? 1 : base_comp->v_samp_factor;
+  int MCUs_per_row =
+      DivCeil(cinfo->image_width * h_group, 8 * cinfo->max_h_samp_factor);
+  int MCU_rows =
+      DivCeil(cinfo->image_height * v_group, 8 * cinfo->max_v_samp_factor);
+  const bool is_progressive = cinfo->progressive_mode;
+  const int Al = scan_info->Al;
+  const int Ah = scan_info->Ah;
+  const int Ss = scan_info->Ss;
+  const int Se = scan_info->Se;
+  constexpr coeff_t kDummyBlock[DCTSIZE2] = {0};  // all-zero stand-in for blocks past the component edge
+
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];  // per-component block rows of the current MCU row
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    ProgressMonitorHistogramPass(cinfo, scan_index, mcu_y);
+    for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+      int comp_idx = scan_info->component_index[i];
+      jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+      int by0 = mcu_y * n_blocks_y;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(n_blocks_y, block_rows_left);  // clamp at the bottom edge
+      ba[i] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx],
+          by0, max_block_rows, false);
+    }
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // At a restart boundary: call ProcessFlush and reset the DC state.
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        ProcessFlush(&s);
+        restarts_to_go = restart_interval;
+        memset(last_dc_coeff, 0, sizeof(last_dc_coeff));
+      }
+      // Process one MCU: every block of every component in the scan.
+      for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+        int comp_idx = scan_info->component_index[i];
+        jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+        int histo_idx = *histo_index + i;  // one DC/AC histogram pair per component of the scan
+        Histogram* dc_histo = &dc_histograms[histo_idx];
+        Histogram* ac_histo = &ac_histograms[histo_idx];
+        int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+        int n_blocks_x = is_interleaved ? comp->h_samp_factor : 1;
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            size_t block_y = mcu_y * n_blocks_y + iy;
+            size_t block_x = mcu_x * n_blocks_x + ix;
+            const coeff_t* block;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              block = kDummyBlock;
+            } else {
+              block = &ba[i][iy][block_x][0];
+            }
+            bool ok;
+            if (!is_progressive) {
+              ok = ProcessDCTBlockSequential(block, dc_histo, ac_histo,
+                                             last_dc_coeff + i);
+            } else if (Ah == 0) {  // progressive scan that is not a refinement
+              ok = ProcessDCTBlockProgressive(block, dc_histo, ac_histo, Ss, Se,
+                                              Al, &s, last_dc_coeff + i);
+            } else {  // Ah != 0: refinement scan, only the AC histogram is passed
+              ok = ProcessRefinementBits(block, ac_histo, Ss, Se, Al, &s);
+            }
+            if (!ok) return false;
+          }
+        }
+      }
+      --restarts_to_go;
+    }
+  }
+  ProcessFlush(&s);
+  *histo_index += scan_info->comps_in_scan;  // the next scan starts after this scan's histograms
+  return true;
+}
+
+void ProcessJpeg(j_compress_ptr cinfo, std::vector<Histogram>* dc_histograms,
+                 std::vector<Histogram>* ac_histograms) {
+  int next_histo = 0;  // ProcessScan advances this past each scan's histograms
+  for (int scan = 0; scan < cinfo->num_scans; ++scan) {
+    Histogram* const dc = &(*dc_histograms)[0];
+    Histogram* const ac = &(*ac_histograms)[0];
+    if (!ProcessScan(cinfo, scan, &next_histo, dc, ac)) {
+      JPEGLI_ERROR("Invalid scan.");
+    }
+  }
+}
+
+void CopyHuffmanTable(j_compress_ptr cinfo, int index, bool is_dc,
+                      JPEGHuffmanCode* huffman_codes,
+                      size_t* num_huffman_codes) {
+  const char* const kind = is_dc ? "DC" : "AC";  // only used in error text
+  if (!(index >= 0 && index < NUM_HUFF_TBLS)) {
+    JPEGLI_ERROR("Invalid %s Huffman table index %d", kind, index);
+  }
+  JHUFF_TBL* const table =
+      is_dc ? cinfo->dc_huff_tbl_ptrs[index] : cinfo->ac_huff_tbl_ptrs[index];
+  if (table == nullptr) {
+    JPEGLI_ERROR("Missing %s Huffman table %d", kind, index);
+  }
+  ValidateHuffmanTable(reinterpret_cast<j_common_ptr>(cinfo), table, is_dc);
+  JPEGHuffmanCode code = {};
+  code.slot_id = index + (is_dc ? 0 : 0x10);  // AC slots are offset by 0x10
+  code.sent_table = table->sent_table;
+  size_t longest = 0;  // longest code length that has at least one symbol
+  for (size_t len = 1; len <= kJpegHuffmanMaxBitLength; ++len) {
+    code.counts[len] = table->bits[len];
+    if (table->bits[len] != 0) longest = len;
+  }
+  ++code.counts[longest];  // one extra entry at the longest used length
+  for (size_t sym = 0; sym < kJpegHuffmanAlphabetSize; ++sym) {
+    code.values[sym] = table->huffval[sym];
+  }
+  size_t k = 0;  // index of the first stored code with this slot id, if any
+  while (k < *num_huffman_codes && huffman_codes[k].slot_id != code.slot_id) {
+    ++k;
+  }
+  if (k == *num_huffman_codes) {  // slot not stored yet: append
+    memcpy(&huffman_codes[k], &code, sizeof(code));
+    ++(*num_huffman_codes);
+  }
+}
+
+}  // namespace
+
+void CopyHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline) {
+  jpeg_comp_master* const master = cinfo->master;
+  // Room for one DC and one AC code per component.
+  master->huffman_codes =
+      Allocate<JPEGHuffmanCode>(cinfo, 2 * cinfo->num_components, JPOOL_IMAGE);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info& comp = cinfo->comp_info[c];
+    // Any table index above 1 means the stream is no longer baseline.
+    if (comp.dc_tbl_no > 1 || comp.ac_tbl_no > 1) *is_baseline = false;
+    CopyHuffmanTable(cinfo, comp.dc_tbl_no, /*is_dc=*/true,
+                     master->huffman_codes, &master->num_huffman_codes);
+    CopyHuffmanTable(cinfo, comp.ac_tbl_no, /*is_dc=*/false,
+                     master->huffman_codes, &master->num_huffman_codes);
+  }
+  for (int s = 0; s < cinfo->num_scans; ++s) {
+    const jpeg_scan_info& scan = cinfo->scan_info[s];
+    ScanCodingInfo coding = {};
+    // All copied codes are attributed to the first scan; later scans add none.
+    coding.num_huffman_codes = (s == 0) ? master->num_huffman_codes : 0;
+    for (int j = 0; j < scan.comps_in_scan; ++j) {
+      const jpeg_component_info& comp =
+          cinfo->comp_info[scan.component_index[j]];
+      coding.dc_tbl_idx[j] = comp.dc_tbl_no;
+      coding.ac_tbl_idx[j] = comp.ac_tbl_no + 4;
+    }
+    memcpy(&master->scan_coding_info[s], &coding, sizeof(coding));
+  }
+}
+
+size_t RestartIntervalForScan(j_compress_ptr cinfo, size_t scan_index) {
+  // Restart interval of the scan in MCUs; restart_in_rows > 0 takes precedence.
+  if (cinfo->restart_in_rows <= 0) return cinfo->restart_interval;
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const bool is_interleaved = (scan_info->comps_in_scan > 1);
+  jpeg_component_info* base_comp =
+      &cinfo->comp_info[scan_info->component_index[0]];
+  const int h_group = is_interleaved ? 1 : base_comp->h_samp_factor;
+  const uint64_t MCUs_per_row =
+      DivCeil(cinfo->image_width * h_group, 8 * cinfo->max_h_samp_factor);
+  // Multiply in 64 bits: an int product overflows for large restart_in_rows.
+  const uint64_t interval = MCUs_per_row * cinfo->restart_in_rows;
+  return static_cast<size_t>(std::min<uint64_t>(interval, 65535u));
+}
+
+void OptimizeHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline) {
+  jpeg_comp_master* m = cinfo->master;
+  // Gather histograms: one DC and one AC histogram per component of each scan.
+  size_t num_histo = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    num_histo += cinfo->scan_info[i].comps_in_scan;
+  }
+  std::vector<Histogram> dc_histograms(num_histo);
+  std::vector<Histogram> ac_histograms(num_histo);
+  ProcessJpeg(cinfo, &dc_histograms, &ac_histograms);
+
+  // Cluster DC histograms.
+  JpegClusteredHistograms dc_clusters;
+  ClusterJpegHistograms(dc_histograms.data(), dc_histograms.size(),
+                        &dc_clusters);
+
+  // Cluster AC histograms.
+  JpegClusteredHistograms ac_clusters;
+  ClusterJpegHistograms(ac_histograms.data(), ac_histograms.size(),
+                        &ac_clusters);
+
+  // Add the first 4 DC and AC histograms in the first DHT segment.
+  std::vector<uint32_t> dc_slot_histograms;
+  std::vector<uint32_t> ac_slot_histograms;
+  m->huffman_codes = Allocate<JPEGHuffmanCode>(cinfo, num_histo, JPOOL_IMAGE);  // NOTE(review): sized num_histo, yet DC and AC codes are both added below; confirm capacity
+  for (size_t i = 0; i < dc_clusters.histograms.size(); ++i) {
+    if (i >= 4) break;
+    JXL_ASSERT(dc_clusters.slot_ids[i] == i);
+    AddJpegHuffmanCode(dc_clusters.histograms[i], i, m->huffman_codes,
+                       &m->num_huffman_codes);
+    dc_slot_histograms.push_back(i);  // slot i currently holds cluster i
+  }
+  for (size_t i = 0; i < ac_clusters.histograms.size(); ++i) {
+    if (i >= 4) break;
+    JXL_ASSERT(ac_clusters.slot_ids[i] == i);
+    AddJpegHuffmanCode(ac_clusters.histograms[i], 0x10 + i, m->huffman_codes,  // AC slot ids carry the 0x10 offset
+                       &m->num_huffman_codes);
+    ac_slot_histograms.push_back(i);
+  }
+
+  // Set the Huffman table indexes in the scan_infos and emit additional DHT
+  // segments if necessary.
+  size_t histogram_id = 0;  // runs over histograms in the order ProcessJpeg filled them
+  size_t num_huffman_codes_sent = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    ScanCodingInfo sci = {};
+    for (int c = 0; c < cinfo->scan_info[i].comps_in_scan; ++c) {
+      SetJpegHuffmanCode(dc_clusters, histogram_id, 0, dc_slot_histograms,
+                         &sci.dc_tbl_idx[c], is_baseline, m->huffman_codes,
+                         &m->num_huffman_codes);
+      SetJpegHuffmanCode(ac_clusters, histogram_id, 0x10, ac_slot_histograms,
+                         &sci.ac_tbl_idx[c], is_baseline, m->huffman_codes,
+                         &m->num_huffman_codes);
+      ++histogram_id;
+    }
+    sci.num_huffman_codes = m->num_huffman_codes - num_huffman_codes_sent;  // codes added since the previous scan
+    num_huffman_codes_sent = m->num_huffman_codes;
+    memcpy(&m->scan_coding_info[i], &sci, sizeof(sci));
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/entropy_coding.h b/third_party/jpeg-xl/lib/jpegli/entropy_coding.h
new file mode 100644
index 0000000000..6d9dd2303b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/entropy_coding.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENTROPY_CODING_H_
+#define LIB_JPEGLI_ENTROPY_CODING_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include <vector>
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+// Copies the Huffman tables selected by the components into the master state.
+void CopyHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline);
+// Restart interval of a scan in MCUs; a positive restart_in_rows overrides it.
+size_t RestartIntervalForScan(j_compress_ptr cinfo, size_t scan_index);
+
+struct Histogram {
+  int count[kJpegHuffmanAlphabetSize];  // counters, all zero after construction
+  Histogram() : count{} {}
+};
+
+struct JpegClusteredHistograms {
+  std::vector<Histogram> histograms;  // the clustered histograms
+  std::vector<uint32_t> histogram_indexes;  // presumably input histogram -> cluster index; confirm
+  std::vector<uint32_t> slot_ids;  // one slot id per cluster; presumably a Huffman table slot, confirm
+};
+// Presumably merges similar histograms into `clusters`; confirm at definition.
+void ClusterJpegHistograms(const Histogram* histograms, size_t num,
+                           JpegClusteredHistograms* clusters);
+// Presumably builds a code for `slot_id` and appends it to huff_codes; confirm.
+void AddJpegHuffmanCode(const Histogram& histogram, size_t slot_id,
+                        JPEGHuffmanCode* huff_codes, size_t* num_huff_codes);
+// Derives the Huffman codes from per-scan histograms and stores them in master.
+void OptimizeHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ENTROPY_CODING_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/error.cc b/third_party/jpeg-xl/lib/jpegli/error.cc
new file mode 100644
index 0000000000..289261672d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/error.cc
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/error.h"
+
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+// Placeholder table: jpegli carries the actual message text in msg_parm.s.
+const char* const kErrorMessageTable[] = {
+    "Message codes are not supported, error message is in msg_parm.s string",
+};
+
+bool FormatString(char* buffer, const char* format, ...) {
+  va_list ap;  // at most JMSG_STR_PARM_MAX bytes of `buffer` are written
+  va_start(ap, format);
+  vsnprintf(buffer, JMSG_STR_PARM_MAX, format, ap);  // longer text is truncated
+  va_end(ap);
+  return false;  // constant result; the value carries no information
+}
+
+void ExitWithAbort(j_common_ptr cinfo) {  // error_exit handler installed by jpegli_std_error
+  (*cinfo->err->output_message)(cinfo);  // report first, then release the codec object
+  jpegli_destroy(cinfo);
+  exit(EXIT_FAILURE);  // terminates the process; control never returns to the caller
+}
+
+void EmitMessage(j_common_ptr cinfo, int msg_level) {
+  jpeg_error_mgr* const err = cinfo->err;
+  // Negative levels are warnings; anything else is a trace message.
+  const bool is_warning = (msg_level < 0);
+  const bool show = is_warning
+                        ? (err->num_warnings <= 5 || err->trace_level >= 3)
+                        : (err->trace_level >= msg_level);
+  if (show) (*err->output_message)(cinfo);
+  if (is_warning) ++err->num_warnings;  // counted even when not printed
+}
+
+void OutputMessage(j_common_ptr cinfo) {  // output_message handler: prints one line to stderr
+  char buffer[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buffer);  // goes through the installed formatter
+  fprintf(stderr, "%s\n", buffer);
+}
+
+void FormatMessage(j_common_ptr cinfo, char* buffer) {  // buffer: JMSG_LENGTH_MAX bytes
+  jpeg_error_mgr* err = cinfo->err;
+  const int code = err->msg_code;
+  const char* text = nullptr;  // add-on format string for `code`, if there is one
+  if (code != 0 && err->addon_message_table != nullptr &&
+      code >= err->first_addon_message && code <= err->last_addon_message) {
+    text = err->addon_message_table[code - err->first_addon_message];
+  }
+  if (code == 0) {  // jpegli's own messages arrive preformatted in msg_parm.s
+    memcpy(buffer, err->msg_parm.s, JMSG_STR_PARM_MAX);
+  } else if (text == nullptr) {  // unknown code, or a null add-on table entry
+    snprintf(buffer, JMSG_LENGTH_MAX, "%s", kErrorMessageTable[0]);
+  } else if (strstr(text, "%s") != nullptr) {  // string-parameter message
+    snprintf(buffer, JMSG_LENGTH_MAX, text, err->msg_parm.s);
+  } else {  // integer-parameter message; surplus arguments are ignored
+    snprintf(buffer, JMSG_LENGTH_MAX, text, err->msg_parm.i[0],
+             err->msg_parm.i[1], err->msg_parm.i[2], err->msg_parm.i[3],
+             err->msg_parm.i[4], err->msg_parm.i[5], err->msg_parm.i[6],
+             err->msg_parm.i[7]);
+  }
+}
+
+void ResetErrorManager(j_common_ptr cinfo) {
+  cinfo->err->num_warnings = 0;  // forget warnings counted so far
+  cinfo->err->msg_code = 0;
+  memset(cinfo->err->msg_parm.s, 0, JMSG_STR_PARM_MAX);  // drop the pending message text
+}
+
+}  // namespace jpegli
+
+struct jpeg_error_mgr* jpegli_std_error(struct jpeg_error_mgr* err) {
+  memset(err->msg_parm.s, 0, JMSG_STR_PARM_MAX);
+  err->msg_code = 0;
+  err->num_warnings = 0;
+  err->trace_level = 0;
+  // Message codes and message tables are not supported, but the fields are
+  // still populated for applications whose own format_message reads them.
+  err->jpeg_message_table = jpegli::kErrorMessageTable;
+  err->last_jpeg_message = 0;
+  err->addon_message_table = nullptr;
+  err->first_addon_message = 0;
+  err->last_addon_message = 0;
+  // Default handlers; the application may replace any of them afterwards.
+  err->reset_error_mgr = jpegli::ResetErrorManager;
+  err->format_message = jpegli::FormatMessage;
+  err->output_message = jpegli::OutputMessage;
+  err->emit_message = jpegli::EmitMessage;
+  err->error_exit = jpegli::ExitWithAbort;
+  return err;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/error.h b/third_party/jpeg-xl/lib/jpegli/error.h
new file mode 100644
index 0000000000..9de030ab3b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/error.h
@@ -0,0 +1,39 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ERROR_H_
+#define LIB_JPEGLI_ERROR_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stdarg.h>
+/* clang-format on */
+
+namespace jpegli {
+// printf-style formatting into `buffer` (JMSG_STR_PARM_MAX bytes); returns false.
+bool FormatString(char* buffer, const char* format, ...);
+
+}  // namespace jpegli
+// Formats "file:line: message" into msg_parm.s, then calls error_exit. Needs `cinfo`.
+#define JPEGLI_ERROR(format, ...)                                            \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->error_exit)(reinterpret_cast<j_common_ptr>(cinfo))
+// Formats "file:line: message" into msg_parm.s and emits it at level -1 (warning).
+#define JPEGLI_WARN(format, ...)                                             \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->emit_message)(reinterpret_cast<j_common_ptr>(cinfo), -1)
+
+#define JPEGLI_TRACE(level, format, ...)                                     \
+  if (cinfo->err->trace_level >= (level))                                    \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->emit_message)(reinterpret_cast<j_common_ptr>(cinfo),     \
+                                  (level))
+
+#endif  // LIB_JPEGLI_ERROR_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc b/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc
new file mode 100644
index 0000000000..f652993827
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc
@@ -0,0 +1,1290 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+TEST(EncoderErrorHandlingTest, MinimalSuccess) {  // smallest valid encode, then decode
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);  // macro from outside this file; presumably returns false from the lambda on error
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.input_components = 1;
+      jpegli_set_defaults(&cinfo);
+      jpegli_start_compress(&cinfo, TRUE);
+      JSAMPLE image[1] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  {  // decode the result through the jpeg_* API and check size and pixel value
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpeg);
+      jpeg_create_decompress(&cinfo);
+      jpeg_mem_src(&cinfo, buffer, buffer_size);
+      jpeg_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      jpeg_start_decompress(&cinfo);
+      JSAMPLE image[1];
+      JSAMPROW row[] = {image};
+      jpeg_read_scanlines(&cinfo, row, 1);
+      jxl::msan::UnpoisonMemory(image, 1);  // marks the decoded pixel as initialized for MSan
+      EXPECT_EQ(0, image[0]);
+      jpeg_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoDestination) {  // no destination is ever set
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);  // expected to fail: jpegli_mem_dest was not called
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, NoImageDimensions) {  // image_width/height never set
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);  // expected to fail
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, ImageTooBig) {  // oversized width must be rejected
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 100000;  // the value under test
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoInputComponents) {  // input_components never set
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);  // expected to fail
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, TooManyInputComponents) {  // excessive component count
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1000;  // the value under test
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoSetDefaults) {  // jpegli_set_defaults is skipped
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_start_compress(&cinfo, TRUE);  // no jpegli_set_defaults call precedes this
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoStartCompress) {  // scanlines written before start_compress
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);  // expected to fail: jpegli_start_compress was not called
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoWriteScanlines) {  // finish without writing any row
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    jpegli_finish_compress(&cinfo);  // expected to fail: the single row was never supplied
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoWriteAllScanlines) {  // only 1 of 2 rows is written
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 2;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);  // expected to fail: one row is still missing
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidQuantValue) {  // all-zero quantization table
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.quant_tbl_ptrs[0] = jpegli_alloc_quant_table((j_common_ptr)&cinfo);  // replaces table 0
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      cinfo.quant_tbl_ptrs[0]->quantval[k] = 0;  // zero is the value under test
+    }
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidQuantTableIndex) {  // component uses quant table 3
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].quant_tbl_no = 3;  // presumably a slot set_defaults leaves empty; confirm
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch1) {  // num_components = 100
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 100;  // overridden after set_defaults; input_components is 1
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch2) {  // 2 components, 1 input
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;  // comp_info[1] is left as set_defaults left it
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch3) {  // 2 components with sampling factors, 1 input
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    cinfo.comp_info[1].h_samp_factor = cinfo.comp_info[1].v_samp_factor = 1;  // the extra component gets 1x1 sampling factors
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch4) {  // RGB input with 1 component
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    cinfo.in_color_space = JCS_RGB;  // contradicts input_components = 1
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch5) {  // grayscale input with 3 components
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_GRAYSCALE;  // contradicts input_components = 3
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch6) {  // RGB input, num_components forced to 2
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;  // overridden after set_defaults
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidColorTransform) {  // YCbCr input, RGB output requested
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_YCbCr;
+    jpegli_set_defaults(&cinfo);
+    cinfo.jpeg_color_space = JCS_RGB;  // the conversion under test, expected to be rejected
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, DuplicateComponentIds) {  // two components share id 0
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].component_id = 0;
+    cinfo.comp_info[1].component_id = 0;  // same id as component 0
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidComponentIndex) {  // component_index out of range
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].component_index = 17;  // does not match the component's position 0
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, ArithmeticCoding) {  // Requesting arithmetic coding is expected to fail.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.arith_code = TRUE;  // the only non-default setting: the option under test
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, CCIR601Sampling) {  // Requesting CCIR601 sampling is expected to fail.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.CCIR601_sampling = TRUE;  // the only non-default setting: the option under test
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript1) {  // A scan script with num_scans == 0 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 0, 63, 0, 0}};  // fields: comps_in_scan, component_index[], Ss, Se, Ah, Al
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = 0;  // script pointer is set but zero scans are declared: the inconsistency under test
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript2) {  // A scan naming two components of a one-component image is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {0, 1}, 0, 63, 0, 0}};  // comps_in_scan = 2 with indices {0, 1}, but input_components is 1
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript3) {  // A scan with comps_in_scan = 5 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{5, {0}, 0, 63, 0, 0}};  // comps_in_scan = 5 although input_components is 1
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript4) {  // A scan listing the same component twice is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {0, 0}, 0, 63, 0, 0}};  // component index 0 appears twice in the one scan
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript5) {  // A scan listing components in descending order is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {1, 0}, 0, 63, 0, 0}};  // component index 1 is listed before index 0
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript6) {  // A scan with Se = 64 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 0, 64, 0, 0}};  // Se = 64; presumably out of range, the other scripts here stop at 63
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript7) {  // A scan whose Ss exceeds its Se is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 2, 1, 0, 0}};  // Ss = 2 is greater than Se = 1
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript8) {  // A full-range scan followed by split scans is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 63, 0, 0}, {1, {1}, 0, 0, 0, 0}, {1, {1}, 1, 63, 0, 0}  // component 0 in one 0..63 scan, component 1 split into 0..0 and 1..63; presumably the mix is what is invalid
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript9) {  // A scan spanning coefficients 0..1 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 1, 0, 0}, {1, {0}, 2, 63, 0, 0},  // first scan has Ss = 0, Se = 1; presumably invalid because it joins coefficient 0 (DC) with an AC coefficient
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript10) {  // A 1..63 scan covering two components is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {2, {0, 1}, 0, 0, 0, 0}, {2, {0, 1}, 1, 63, 0, 0}  // both scans carry two components; presumably the 1..63 one is the offender
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript11) {  // A 1..63 scan placed before the 0..0 scan is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 1, 63, 0, 0}, {1, {0}, 0, 0, 0, 0}  // same component: coefficients 1..63 first, coefficient 0 second
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript12) {  // A first scan with Ah = 10 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 0, 10, 1}, {1, {0}, 0, 0, 1, 0}, {1, {0}, 1, 63, 0, 0}  // Ah/Al pairs: 10/1, then 1/0, then 0/0 for the 1..63 scan
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript13) {  // An inconsistent Ah/Al refinement sequence is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 0, 0, 2},  // coefficient 0, Ah = 0, Al = 2
+        {1, {0}, 0, 0, 1, 0},  // Ah = 1, Al = 0; presumably the bad step, Ah does not match the previous Al
+        {1, {0}, 0, 0, 2, 1},  // Ah = 2, Al = 1
+        {1, {0}, 1, 63, 0, 0}  // coefficients 1..63, Ah = 0, Al = 0
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, MCUSizeTooBig) {  // 3x3 sampling on one of three components is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    jpegli_set_progressive_level(&cinfo, 0);  // presumably selects a non-progressive encode -- TODO confirm
+    cinfo.comp_info[0].h_samp_factor = 3;
+    cinfo.comp_info[0].v_samp_factor = 3;  // 3x3 = 9 blocks for this component; presumably the per-MCU total then exceeds the limit
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, RestartIntervalTooBig) {  // A restart interval of 1000000 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.restart_interval = 1000000;  // presumably larger than the bitstream can encode -- TODO confirm the exact limit
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, SamplingFactorTooBig) {  // A horizontal sampling factor of 5 is expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].h_samp_factor = 5;  // the only non-default setting: the value under test
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NonIntegralSamplingRatio) {  // Horizontal sampling factors 3 and 2 together are expected to be rejected.
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].h_samp_factor = 3;
+    cinfo.comp_info[1].h_samp_factor = 2;  // 3 / 2 is not an integer ratio between the two components
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+constexpr const char* kAddOnTable[] = {"First message",  // no format parameter; table shared by the AddOnTable* tests
+                                       "Second message with int param %d",  // takes one int parameter
+                                       "Third message with string param %s"};  // takes one string parameter
+
+TEST(EncoderErrorHandlingTest, AddOnTableNoParam) {  // Raises an add-on error whose message has no parameter.
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;  // three codes, one per kAddOnTable entry
+    cinfo.err->msg_code = 10000;  // first code of the add-on range; presumably maps to kAddOnTable[0]
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));  // trigger the error path by hand
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());  // only the error outcome is checked, not the message text
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, AddOnTableIntParam) {  // Raises an add-on error whose message takes an int parameter.
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;  // three codes, one per kAddOnTable entry
+    cinfo.err->msg_code = 10001;  // second code of the add-on range; presumably maps to kAddOnTable[1]
+    cinfo.err->msg_parm.i[0] = 17;  // value for the message's int parameter
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));  // trigger the error path by hand
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());  // only the error outcome is checked, not the message text
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, AddOnTableNoStringParam) {  // Raises an add-on error with a string parameter. NOTE(review): "NoString" in the name looks like a typo for "String".
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;  // three codes, one per kAddOnTable entry
+    cinfo.err->msg_code = 10002;  // third code of the add-on range; presumably maps to kAddOnTable[2]
+    memcpy(cinfo.err->msg_parm.s, "MESSAGE PARAM", 14);  // 13 characters plus the terminating NUL
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));  // trigger the error path by hand
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());  // only the error outcome is checked, not the message text
+  jpegli_destroy_compress(&cinfo);
+}
+
+static const uint8_t kCompressed0[] = {  // Minimal one-component 1x1 JPEG stream used as the baseline for the decoder tests.
+    // SOI (start of image marker)
+    0xff, 0xd8,  //
+    // DQT: length 0x0043, table byte 0x00, then 64 quantization values
+    0xff, 0xdb, 0x00, 0x43, 0x00, 0x03, 0x02, 0x02, 0x03, 0x02,  //
+    0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x03, 0x03, 0x04, 0x05,  //
+    0x08, 0x05, 0x05, 0x04, 0x04, 0x05, 0x0a, 0x07, 0x07, 0x06,  //
+    0x08, 0x0c, 0x0a, 0x0c, 0x0c, 0x0b, 0x0a, 0x0b, 0x0b, 0x0d,  //
+    0x0e, 0x12, 0x10, 0x0d, 0x0e, 0x11, 0x0e, 0x0b, 0x0b, 0x10,  //
+    0x16, 0x10, 0x11, 0x13, 0x14, 0x15, 0x15, 0x15, 0x0c, 0x0f,  //
+    0x17, 0x18, 0x16, 0x14, 0x18, 0x12, 0x14, 0x15, 0x14,        //
+    // SOF (0xffc0): length 11, precision 8, height 1, width 1, one component (id 1, sampling 0x11, quant table 0)
+    0xff, 0xc0, 0x00, 0x0b, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01,  //
+    0x01, 0x11, 0x00,                                            //
+    // DHT: length 0x00d2, two tables introduced by the slot bytes 0x00 and 0x10
+    0xff, 0xc4, 0x00, 0xd2, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01,  //
+    0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+    0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,  //
+    0x09, 0x0a, 0x0b, 0x10, 0x00, 0x02, 0x01, 0x03, 0x03, 0x02,  //
+    0x04, 0x03, 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7d,  //
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31,  //
+    0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32,  //
+    0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52,  //
+    0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,  //
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a,  //
+    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,  //
+    0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57,  //
+    0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,  //
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83,  //
+    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94,  //
+    0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,  //
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,  //
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,  //
+    0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,  //
+    0xd9, 0xda, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,  //
+    0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,  //
+    0xf9, 0xfa,                                                  //
+    // SOS: length 8, one component (id 1, table byte 0x00), Ss = 0, Se = 0x3f, final byte 0x00
+    0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00,  //
+    // entropy coded data (3 bytes)
+    0xfc, 0xaa, 0xaf,  //
+    // EOI (end of image marker)
+    0xff, 0xd9,  //
+};
+static const size_t kLen0 = sizeof(kCompressed0);  // total stream length in bytes
+
+static const size_t kDQTOffset = 2;  // byte offsets of each marker's 0xff in kCompressed0; DQT follows the 2-byte SOI
+static const size_t kSOFOffset = 71;  // 2 + 69 bytes of DQT segment (marker + 0x43 length)
+static const size_t kDHTOffset = 84;  // 71 + 13 bytes of SOF segment (marker + 0x0b length)
+static const size_t kSOSOffset = 296;  // 84 + 212 bytes of DHT segment (marker + 0xd2 length)
+
+TEST(DecoderErrorHandlingTest, MinimalSuccess) {  // Baseline: the unmodified kCompressed0 stream must decode without error.
+  JXL_CHECK(kCompressed0[kDQTOffset] == 0xff);  // sanity check: each offset constant lands on a marker's 0xff byte
+  JXL_CHECK(kCompressed0[kSOFOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kDHTOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kSOSOffset] == 0xff);
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    jpegli_start_decompress(&cinfo);
+    JSAMPLE image[1];  // room for the single sample of the 1x1 one-component image
+    JSAMPROW row[] = {image};
+    jpegli_read_scanlines(&cinfo, row, 1);
+    EXPECT_EQ(0, image[0]);  // the one decoded sample is expected to be 0
+    jpegli_finish_decompress(&cinfo);
+    return true;  // reached when no call above took the error path
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoSource) {  // Reading a header without any data source is expected to fail.
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_decompress(&cinfo);
+    jpegli_read_header(&cinfo, TRUE);  // no source (such as jpegli_mem_src) was set up before this call
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoReadHeader) {  // Starting decompression without reading the header is expected to fail.
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_start_decompress(&cinfo);  // jpegli_read_header was deliberately skipped
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoStartDecompress) {  // Reading scanlines without starting decompression is expected to fail.
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    JSAMPLE image[1];
+    JSAMPROW row[] = {image};
+    jpegli_read_scanlines(&cinfo, row, 1);  // jpegli_start_decompress was deliberately not called before this
+    EXPECT_EQ(0, image[0]);
+    jpegli_finish_decompress(&cinfo);
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoReadScanlines) {  // Finishing decompression without reading any scanline is expected to fail.
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    jpegli_start_decompress(&cinfo);
+    jpegli_finish_decompress(&cinfo);  // called although no scanline has been read
+    return true;  // the test expects this line not to be reached
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+static const size_t kMaxImageWidth = 0xffff;  // presumably the largest width a 2-byte SOF width field can hold
+JSAMPLE kOutputBuffer[MAX_COMPONENTS * kMaxImageWidth];  // global scratch row that ParseCompressed decodes every scanline into
+
+bool ParseCompressed(const std::vector<uint8_t>& compressed) {  // Fully decodes `compressed`; returns true only if the lambda runs to its end.
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);  // defined outside this chunk; presumably returns false on a library error
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, compressed.data(), compressed.size());
+    jpegli_read_header(&cinfo, TRUE);
+    jpegli_start_decompress(&cinfo);
+    for (JDIMENSION i = 0; i < cinfo.output_height; ++i) {  // one scanline per iteration
+      JSAMPROW row[] = {kOutputBuffer};  // every row overwrites the same buffer; the pixels are not inspected
+      jpegli_read_scanlines(&cinfo, row, 1);
+    }
+    jpegli_finish_decompress(&cinfo);
+    return true;  // reached when no call above took the error path
+  };
+  bool retval = try_catch_block();
+  jpegli_destroy_decompress(&cinfo);  // cleanup runs for both outcomes
+  return retval;
+}
+
+TEST(DecoderErrorHandlingTest, NoSOI) {  // A stream whose SOI marker is damaged is expected to be rejected.
+  for (int pos : {0, 1}) {  // the two bytes of the SOI marker (0xff, 0xd8)
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);  // fresh copy, so only one byte differs per run
+    compressed[pos] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidDQT) {  // Corrupts individual DQT fields of kCompressed0; each variant must fail to parse.
+  // Bad marker length (offset + 3 is the low byte of the 2-byte length field)
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid table index / precision (offset + 4 is the byte right after the length)
+  for (int val : {0x20, 0x05}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // zero quant value (the 64 values start at offset + 5)
+  for (int k : {0, 1, 17, 63}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 5 + k] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidSOF) {  // Corrupts individual SOF fields of kCompressed0; each variant must fail to parse.
+  // Bad marker length (offset + 3 is the low byte of the 2-byte length field)
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // zero height, width or num_components (6 and 8 are the low bytes; the high bytes are already 0)
+  for (int pos : {6, 8, 9}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + pos] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid data precision (the baseline stream has 8 here)
+  for (int val : {0, 1, 127}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // too many num_components (the stream only carries data for one)
+  for (int val : {5, 255}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 9] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid sampling factors (the baseline stream has 0x11 here)
+  for (int val : {0x00, 0x01, 0x10, 0x15, 0x51}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 11] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid quant table index (the baseline stream has 0 here)
+  for (int val : {5, 17}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 12] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidDHT) {  // Corrupts individual DHT fields of kCompressed0; each variant must fail to parse.
+  // Bad marker length (offset + 3 is the low byte of the 2-byte length field)
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  {  // same idea, but bumping the high byte of the length field
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 2] += 17;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid table slot_id (offset + 4 is the byte right after the length)
+  for (int val : {0x05, 0x15, 0x20}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidSOS) {  // Corrupts individual SOS fields of kCompressed0; each variant must fail to parse.
+  // Invalid comps_in_scan (the baseline stream has 1 at offset + 4)
+  for (int val : {2, 5, 17}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid Huffman table indexes (the baseline stream has 0x00 at offset + 6)
+  for (int val : {0x05, 0x50, 0x15, 0x51}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + 6] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid Ss/Se (the baseline stream has 0 and 0x3f at offsets + 7 and + 8)
+  for (int pos : {7, 8}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + pos] = 64;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, MutateSingleBytes) {  // Overwrites every byte of kCompressed0 in turn and runs the decoder on the result.
+  for (size_t pos = 0; pos < kLen0; ++pos) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);  // fresh copy per position, so only byte `pos` ever differs
+    for (int val : {0x00, 0x0f, 0xf0, 0xff}) {
+      compressed[pos] = val;
+      ParseCompressed(compressed);  // result deliberately ignored: success and failure are both acceptable here
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/huffman.cc b/third_party/jpeg-xl/lib/jpegli/huffman.cc
new file mode 100644
index 0000000000..1cf88a5536
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/huffman.cc
@@ -0,0 +1,321 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/huffman.h"
+
+#include <limits>
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/error.h"
+
+namespace jpegli {
+
+// Computes the bit width of the next 2nd level lookup table. count[] holds
+// the bit length histogram of the symbols not yet placed and len is the code
+// length of the symbol about to be processed.
+static inline int NextTableBitSize(const int* count, int len) {
+  const int max_len = static_cast<int>(kJpegHuffmanMaxBitLength);
+  int free_slots = 1 << (len - kJpegHuffmanRootTableBits);
+  // Widen the table until the codes of the current length use up all slots.
+  for (; len < max_len; ++len, free_slots <<= 1) {
+    free_slots -= count[len];
+    if (free_slots <= 0) break;
+  }
+  return len - kJpegHuffmanRootTableBits;
+}
+
+// Fills lut with a two-level decoding table: a root table with
+// 2^kJpegHuffmanRootTableBits entries, followed by sub-tables for longer codes.
+// count[len] is the number of codes of length len; symbols is in code order.
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut) {
+  HuffmanTableEntry code;    // current table entry
+  HuffmanTableEntry* table;  // next available space in table
+  int len;                   // current code length
+  int idx;                   // symbol index
+  int key;                   // prefix code
+  int reps;                  // number of replicate key values in current table
+  int low;                   // low bits for current root entry
+  int table_bits;            // key length of current table
+  int table_size;            // size of current table
+  // Make a local copy of the input bit length histogram (consumed below).
+  int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0};
+  int total_count = 0;
+  for (len = 1; len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    tmp_count[len] = count[len];
+    total_count += tmp_count[len];
+  }
+  table = lut;
+  table_bits = kJpegHuffmanRootTableBits;
+  table_size = 1 << table_bits;
+  // Special case: a single symbol fills the whole root table with 0-bit codes.
+  if (total_count == 1) {
+    code.bits = 0;
+    code.value = symbols[0];
+    for (key = 0; key < table_size; ++key) {
+      table[key] = code;
+    }
+    return;
+  }
+
+  // Fill in root table; shorter codes are replicated over all matching keys.
+  key = 0;
+  idx = 0;
+  for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      code.bits = len;
+      code.value = symbols[idx++];
+      reps = 1 << (kJpegHuffmanRootTableBits - len);
+      while (reps--) {
+        table[key++] = code;
+      }
+    }
+  }
+
+  // Fill in 2nd level tables and add pointers to root table.
+  table += table_size;
+  table_size = 0;
+  low = 0;
+  for (len = kJpegHuffmanRootTableBits + 1;
+       len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      // Start a new sub-table if the previous one is full.
+      if (low >= table_size) {
+        table += table_size;
+        table_bits = NextTableBitSize(tmp_count, len);
+        table_size = 1 << table_bits;
+        low = 0;
+        lut[key].bits = table_bits + kJpegHuffmanRootTableBits;
+        lut[key].value = (table - lut) - key;  // offset relative to root entry
+        ++key;
+      }
+      code.bits = len - kJpegHuffmanRootTableBits;  // bits beyond the root key
+      code.value = symbols[idx++];
+      reps = 1 << (table_bits - code.bits);
+      while (reps--) {
+        table[low++] = code;
+      }
+    }
+  }
+}
+
+// A node of a Huffman tree; child indexes refer to positions in a node pool.
+struct HuffmanTree {
+  HuffmanTree(uint32_t count, int16_t left, int16_t right)
+      : total_count(count), index_left(left), index_right_or_value(right) {}
+  uint32_t total_count;          // population count covered by this node
+  int16_t index_left;            // left child index, negative for a leaf
+  int16_t index_right_or_value;  // right child index, or symbol for a leaf
+};
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
+              uint8_t level) {
+  if (p.index_left < 0) {  // leaf: store the depth of its symbol
+    depth[p.index_right_or_value] = level;
+    return;
+  }
+  ++level;  // both children sit one level below p
+  SetDepth(pool[p.index_left], pool, depth, level);
+  SetDepth(pool[p.index_right_or_value], pool, depth, level);
+}
+
+// Orders tree nodes by ascending population count (least popular first).
+static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) {
+  return v1.total_count > v0.total_count;
+}
+
+// Computes length-limited Huffman code lengths for a histogram.
+//
+// data[0..length) holds the population counts. depth[i] receives the code
+// length of every symbol i with a nonzero count and is not written for the
+// others; since the final check scans all of depth[0..length), the caller
+// presumably passes in a zeroed array. An all-zero histogram writes nothing.
+//
+// count_limit is the value that is to be faked as the minimum value
+// and this minimum value is raised until no code exceeds tree_limit bits.
+//
+// This algorithm is not of excellent performance for very long data blocks,
+// especially when population counts are longer than 2**tree_limit.
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, const size_t length,
+                       const int tree_limit, uint8_t* depth) {
+  // For block sizes below 64 kB, we never need to do a second iteration
+  // of this loop. Probably all of our block sizes will be smaller than
+  // that, so this loop is mostly of academic interest. If we actually
+  // would need this, we would be better off with the Katajainen algorithm.
+  for (uint32_t count_limit = 1;; count_limit *= 2) {
+    std::vector<HuffmanTree> tree;
+    tree.reserve(2 * length + 1);
+
+    for (size_t i = length; i != 0;) {
+      --i;
+      if (data[i]) {
+        const uint32_t count = std::max(data[i], count_limit - 1);
+        tree.emplace_back(count, -1, static_cast<int16_t>(i));
+      }
+    }
+
+    const size_t n = tree.size();
+    // Empty histogram: stop before `n - 1` below wraps around for n == 0.
+    if (n == 0) break;
+    if (n == 1) {
+      // Fake value; will be fixed on upper level.
+      depth[tree[0].index_right_or_value] = 1;
+      break;
+    }
+
+    std::stable_sort(tree.begin(), tree.end(), Compare);
+
+    // The nodes are:
+    // [0, n): the sorted leaf nodes that we start with.
+    // [n]: we add a sentinel here.
+    // [n + 1, 2n): new parent nodes are added here, starting from
+    //              (n+1). These are naturally in ascending order.
+    // [2n]: we add a sentinel at the end as well.
+    // There will be (2n+1) elements at the end.
+    const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
+    tree.push_back(sentinel);
+    tree.push_back(sentinel);
+
+    size_t i = 0;      // Points to the next leaf node.
+    size_t j = n + 1;  // Points to the next non-leaf node.
+    for (size_t k = n - 1; k != 0; --k) {
+      size_t left, right;
+      if (tree[i].total_count <= tree[j].total_count) {
+        left = i;
+        ++i;
+      } else {
+        left = j;
+        ++j;
+      }
+      if (tree[i].total_count <= tree[j].total_count) {
+        right = i;
+        ++i;
+      } else {
+        right = j;
+        ++j;
+      }
+
+      // The sentinel node becomes the parent node.
+      size_t j_end = tree.size() - 1;
+      tree[j_end].total_count =
+          tree[left].total_count + tree[right].total_count;
+      tree[j_end].index_left = static_cast<int16_t>(left);
+      tree[j_end].index_right_or_value = static_cast<int16_t>(right);
+
+      // Add back the last sentinel node.
+      tree.push_back(sentinel);
+    }
+    JXL_DASSERT(tree.size() == 2 * n + 1);
+    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);
+
+    // We need to pack the Huffman tree in tree_limit bits.
+    // If this was not successful, add fake entities to the lowest values
+    // and retry.
+    if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
+      break;
+    }
+  }
+}
+
+void ValidateHuffmanTable(j_common_ptr cinfo, const JHUFF_TBL* table,
+                          bool is_dc) {  // is_dc is currently unused
+  // Sum the code space used by each length, in units of 2^-max_length.
+  size_t num_symbols = 0;
+  size_t code_space = 0;
+  size_t longest = 0;
+  for (size_t len = 1; len <= kJpegHuffmanMaxBitLength; ++len) {
+    const uint8_t num_codes = table->bits[len];
+    if (num_codes == 0) continue;
+    num_symbols += num_codes;
+    code_space += (1u << (kJpegHuffmanMaxBitLength - len)) * num_codes;
+    longest = len;
+  }
+  code_space += 1u << (kJpegHuffmanMaxBitLength - longest);  // sentinel symbol
+  if (num_symbols == 0) {
+    JPEGLI_ERROR("Empty Huffman table");
+  }
+  if (num_symbols > kJpegHuffmanAlphabetSize) {
+    JPEGLI_ERROR("Too many symbols in Huffman table");
+  }
+  if (code_space != (1u << kJpegHuffmanMaxBitLength)) {
+    JPEGLI_ERROR("Invalid bit length distribution");
+  }
+  uint8_t already_used[kJpegHuffmanAlphabetSize] = {};
+  for (size_t pos = 0; pos < num_symbols; ++pos) {
+    const uint8_t symbol = table->huffval[pos];
+    if (already_used[symbol]) {
+      JPEGLI_ERROR("Duplicate symbol %d in Huffman table", symbol);
+    }
+    already_used[symbol] = 1;
+  }
+}
+
+void AddStandardHuffmanTables(j_common_ptr cinfo, bool is_dc) {
+  // Huffman tables from the JPEG standard; they fill unset table slots 0 and 1.
+  static constexpr JHUFF_TBL kStandardDCTables[2] = {
+      // DC luma
+      {{0, 0, 1, 5, 1, 1, 1, 1, 1, 1},
+       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+       FALSE},
+      // DC chroma
+      {{0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+       FALSE}};
+  static constexpr JHUFF_TBL kStandardACTables[2] = {
+      // AC luma
+      {{0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125},
+       {0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
+        0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+        0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72,
+        0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+        0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,
+        0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+        0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75,
+        0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+        0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3,
+        0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+        0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
+        0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+        0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4,
+        0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa},
+       FALSE},
+      // AC chroma
+      {{0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119},
+       {0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41,
+        0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+        0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1,
+        0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+        0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44,
+        0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+        0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74,
+        0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+        0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
+        0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+        0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+        0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+        0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4,
+        0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa},
+       FALSE}};
+  const JHUFF_TBL* std_tables = is_dc ? kStandardDCTables : kStandardACTables;
+  JHUFF_TBL** tables;  // DC or AC table slots of the (de)compressor object
+  if (cinfo->is_decompressor) {
+    j_decompress_ptr cinfo_d = reinterpret_cast<j_decompress_ptr>(cinfo);
+    tables = is_dc ? cinfo_d->dc_huff_tbl_ptrs : cinfo_d->ac_huff_tbl_ptrs;
+  } else {
+    j_compress_ptr cinfo_c = reinterpret_cast<j_compress_ptr>(cinfo);
+    tables = is_dc ? cinfo_c->dc_huff_tbl_ptrs : cinfo_c->ac_huff_tbl_ptrs;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (tables[i] == nullptr) {  // tables that are already set are kept as is
+      tables[i] = jpegli_alloc_huff_table(cinfo);
+      memcpy(tables[i], &std_tables[i], sizeof(JHUFF_TBL));
+      ValidateHuffmanTable(cinfo, tables[i], is_dc);
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/huffman.h b/third_party/jpeg-xl/lib/jpegli/huffman.h
new file mode 100644
index 0000000000..f0e5e1de40
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/huffman.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_HUFFMAN_H_
+#define LIB_JPEGLI_HUFFMAN_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jpegli/common_internal.h"
+
+namespace jpegli {
+
+constexpr int kJpegHuffmanRootTableBits = 8;
+// Maximum huffman lookup table size.
+// According to zlib/examples/enough.c, 758 entries are always enough for
+// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and
+// max bit length 16 if the root table has 8 bits.
+constexpr int kJpegHuffmanLutSize = 758;
+
+struct HuffmanTableEntry {
+  uint8_t bits;    // bits used for this symbol (0 if the code has one symbol)
+  uint16_t value;  // symbol value, or offset to a 2nd level table
+};
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut);
+
+// This function will create a Huffman tree.
+//
+// The (data,length) contains the population counts.
+// The tree_limit is the maximum bit depth of the Huffman codes.
+//
+// The depth contains the tree, i.e., how many bits are used for
+// the symbol.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit,
+                       uint8_t* depth);
+
+void ValidateHuffmanTable(j_common_ptr cinfo, const JHUFF_TBL* table,
+                          bool is_dc);
+
+void AddStandardHuffmanTables(j_common_ptr cinfo, bool is_dc);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_HUFFMAN_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/idct.cc b/third_party/jpeg-xl/lib/jpegli/idct.cc
new file mode 100644
index 0000000000..4d10563583
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/idct.cc
@@ -0,0 +1,692 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/idct.h"
+
+#include <cmath>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jxl/base/status.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/idct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/transpose-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::Xor;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+constexpr D d;
+constexpr DI di;
+
+using D8 = HWY_CAPPED(float, 8);
+constexpr D8 d8;
+
+void DequantBlock(const int16_t* JXL_RESTRICT qblock,
+                  const float* JXL_RESTRICT dequant,
+                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
+  // Out = (q - bias, sign-flipped for negative q) * dequant; 0 stays 0.
+  const Rebind<int16_t, DI> di16;
+  const Rebind<float, DI> df;
+  for (size_t k = 0; k < 64; k += Lanes(d)) {
+    const auto scale = Load(d, dequant + k);
+    const auto bias = Load(d, biases + k);
+    const Vec<DI> coeff_i = PromoteTo(di, Load(di16, qblock + k));
+    const auto coeff = ConvertTo(df, coeff_i);
+    const auto magnitude = Abs(coeff);
+    const auto sign_bit = Xor(coeff, magnitude);
+    const auto shrunk = Sub(coeff, Xor(bias, sign_bit));
+    const auto nonzero = Gt(magnitude, Zero(df));
+    Store(IfThenElseZero(nonzero, Mul(shrunk, scale)), d, block + k);
+  }
+}
+
+template <size_t N>
+void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
+                    float* JXL_RESTRICT aout) {
+  // Even input rows fill the first half of aout, odd rows the second half.
+  for (size_t r = 0; r < N / 2; r++) {
+    Store(LoadU(d8, ain + 2 * r * ain_stride), d8, aout + r * 8);
+  }
+  float* odd_out = aout + (N / 2) * 8;
+  for (size_t r = 0; r < N - N / 2; r++) {
+    Store(LoadU(d8, ain + (2 * r + 1) * ain_stride), d8, odd_out + r * 8);
+  }
+}
+
+template <size_t N>
+void BTranspose(float* JXL_RESTRICT coeff) {
+  // Rows are visited last to first so each one adds its unmodified
+  // predecessor; row 0 is then scaled by sqrt(2).
+  for (size_t row = N - 1; row > 0; row--) {
+    const auto cur = Load(d8, coeff + row * 8);
+    const auto prev = Load(d8, coeff + (row - 1) * 8);
+    Store(Add(cur, prev), d8, coeff + row * 8);
+  }
+  constexpr float kSqrt2 = 1.41421356237f;
+  Store(Mul(Load(d8, coeff), Set(d8, kSqrt2)), d8, coeff);
+}
+
+// Multipliers for the N-point IDCT, kMultipliers[i] for i in [0, N / 2).
+// Generated by: for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;  // only the specializations below are defined
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+constexpr float WcMultipliers<4>::kMultipliers[];  // out-of-class definition
+constexpr float WcMultipliers<8>::kMultipliers[];  // out-of-class definition
+
+template <size_t N>
+void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
+                    size_t out_stride) {
+  // Output rows i and N-1-i get lo[i] +/- kMultipliers[i] * hi[i].
+  const float* hi = coeff + (N / 2) * 8;
+  for (size_t i = 0; i < N / 2; i++) {
+    const auto w = Set(d8, WcMultipliers<N>::kMultipliers[i]);
+    const auto lo_row = Load(d8, coeff + i * 8);
+    const auto hi_row = Load(d8, hi + i * 8);
+    StoreU(MulAdd(w, hi_row, lo_row), d8, out + i * out_stride);
+    StoreU(NegMulAdd(w, hi_row, lo_row), d8, out + (N - i - 1) * out_stride);
+  }
+}
+
+template <size_t N>
+struct IDCT1DImpl;  // recursive N-point 1D transform on rows of d8 vectors
+
+template <>
+struct IDCT1DImpl<1> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    StoreU(LoadU(d8, from), d8, to);  // single row: plain copy
+  }
+};
+
+template <>
+struct IDCT1DImpl<2> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    JXL_DASSERT(from_stride >= 8);
+    JXL_DASSERT(to_stride >= 8);
+    auto in1 = LoadU(d8, from);
+    auto in2 = LoadU(d8, from + from_stride);
+    StoreU(Add(in1, in2), d8, to);              // row 0 = sum
+    StoreU(Sub(in1, in2), d8, to + to_stride);  // row 1 = difference
+  }
+};
+
+template <size_t N>
+struct IDCT1DImpl {
+  void operator()(const float* from, size_t from_stride, float* to,
+                  size_t to_stride) {
+    JXL_DASSERT(from_stride >= 8);
+    JXL_DASSERT(to_stride >= 8);
+    HWY_ALIGN float tmp[64];  // N rows of 8 floats, so N is at most 8
+    ForwardEvenOdd<N>(from, from_stride, tmp);  // even rows first, then odd
+    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);  // half-size pass on the even rows
+    BTranspose<N / 2>(tmp + N * 4);  // tmp + N * 4 is where the odd rows start
+    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);  // odd rows
+    MultiplyAndAdd<N>(tmp, to, to_stride);  // combine both halves into `to`
+  }
+};
+
+template <size_t N>
+void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
+            size_t output_stride) {
+  for (size_t col = 0; col < 8; col += Lanes(d8)) {  // Lanes(d8) columns/step
+    IDCT1DImpl<N>{}(from + col, 8, output + col, output_stride);
+  }
+}
+
+void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
+                       float* JXL_RESTRICT output, size_t output_stride) {
+  Transpose8x8Block(block0, block1);  // presumably block1 = block0 transposed
+  IDCT1D<8>(block1, block0, 8);       // first 1D pass, result goes to block0
+  Transpose8x8Block(block0, block1);  // back into block1 for the second pass
+  IDCT1D<8>(block1, output, output_stride);  // second 1D pass writes output
+}
+
+void InverseTransformBlock8x8(const int16_t* JXL_RESTRICT qblock,
+                              const float* JXL_RESTRICT dequant,
+                              const float* JXL_RESTRICT biases,
+                              float* JXL_RESTRICT scratch_space,
+                              float* JXL_RESTRICT output, size_t output_stride,
+                              size_t dctsize) {  // dctsize is not used here
+  float* JXL_RESTRICT block0 = scratch_space;  // first DCTSIZE2 floats
+  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;  // the block after it
+  DequantBlock(qblock, dequant, biases, block0);  // dequantized coefficients
+  ComputeScaledIDCT(block0, block1, output, output_stride);
+}
+
+// Computes the N-point IDCT of in[], and stores the result in out[]. The in[]
+// array is at most 8 values long, values in[8:N-1] are assumed to be 0.
+void Compute1dIDCT(float* in, float* out, size_t N) {
+  switch (N) {
+    case 3: {
+      static constexpr float kC3[3] = {
+          1.414213562373,
+          1.224744871392,
+          0.707106781187,
+      };
+      float even0 = in[0] + kC3[2] * in[2];
+      float even1 = in[0] - kC3[0] * in[2];
+      float odd0 = kC3[1] * in[1];
+      out[0] = even0 + odd0;
+      out[2] = even0 - odd0;
+      out[1] = even1;
+      break;
+    }
+    case 5: {
+      static constexpr float kC5[5] = {
+          1.414213562373, 1.344997023928, 1.144122805635,
+          0.831253875555, 0.437016024449,
+      };
+      float even0 = in[0] + kC5[2] * in[2] + kC5[4] * in[4];
+      float even1 = in[0] - kC5[4] * in[2] - kC5[2] * in[4];
+      float even2 = in[0] - kC5[0] * in[2] + kC5[0] * in[4];
+      float odd0 = kC5[1] * in[1] + kC5[3] * in[3];
+      float odd1 = kC5[3] * in[1] - kC5[1] * in[3];
+      out[0] = even0 + odd0;
+      out[4] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[3] = even1 - odd1;
+      out[2] = even2;
+      break;
+    }
+    case 6: {
+      static constexpr float kC6[6] = {
+          1.414213562373, 1.366025403784, 1.224744871392,
+          1.000000000000, 0.707106781187, 0.366025403784,
+      };
+      float even0 = in[0] + kC6[2] * in[2] + kC6[4] * in[4];
+      float even1 = in[0] - kC6[0] * in[4];
+      float even2 = in[0] - kC6[2] * in[2] + kC6[4] * in[4];
+      float odd0 = kC6[1] * in[1] + kC6[3] * in[3] + kC6[5] * in[5];
+      float odd1 = kC6[3] * in[1] - kC6[3] * in[3] - kC6[3] * in[5];
+      float odd2 = kC6[5] * in[1] - kC6[3] * in[3] + kC6[1] * in[5];
+      out[0] = even0 + odd0;
+      out[5] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[4] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[3] = even2 - odd2;
+      break;
+    }
+    case 7: {
+      static constexpr float kC7[7] = {
+          1.414213562373, 1.378756275744, 1.274162392264, 1.105676685997,
+          0.881747733790, 0.613604268353, 0.314692122713,
+      };
+      float even0 = in[0] + kC7[2] * in[2] + kC7[4] * in[4] + kC7[6] * in[6];
+      float even1 = in[0] + kC7[6] * in[2] - kC7[2] * in[4] - kC7[4] * in[6];
+      float even2 = in[0] - kC7[4] * in[2] - kC7[6] * in[4] + kC7[2] * in[6];
+      float even3 = in[0] - kC7[0] * in[2] + kC7[0] * in[4] - kC7[0] * in[6];
+      float odd0 = kC7[1] * in[1] + kC7[3] * in[3] + kC7[5] * in[5];
+      float odd1 = kC7[3] * in[1] - kC7[5] * in[3] - kC7[1] * in[5];
+      float odd2 = kC7[5] * in[1] - kC7[1] * in[3] + kC7[3] * in[5];
+      out[0] = even0 + odd0;
+      out[6] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[5] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[4] = even2 - odd2;
+      out[3] = even3;
+      break;
+    }
+    case 9: {
+      static constexpr float kC9[9] = {
+          1.414213562373, 1.392728480640, 1.328926048777,
+          1.224744871392, 1.083350440839, 0.909038955344,
+          0.707106781187, 0.483689525296, 0.245575607938,
+      };
+      float even0 = in[0] + kC9[2] * in[2] + kC9[4] * in[4] + kC9[6] * in[6];
+      float even1 = in[0] + kC9[6] * in[2] - kC9[6] * in[4] - kC9[0] * in[6];
+      float even2 = in[0] - kC9[8] * in[2] - kC9[2] * in[4] + kC9[6] * in[6];
+      float even3 = in[0] - kC9[4] * in[2] + kC9[8] * in[4] + kC9[6] * in[6];
+      float even4 = in[0] - kC9[0] * in[2] + kC9[0] * in[4] - kC9[0] * in[6];
+      float odd0 =
+          kC9[1] * in[1] + kC9[3] * in[3] + kC9[5] * in[5] + kC9[7] * in[7];
+      float odd1 = kC9[3] * in[1] - kC9[3] * in[5] - kC9[3] * in[7];
+      float odd2 =
+          kC9[5] * in[1] - kC9[3] * in[3] - kC9[7] * in[5] + kC9[1] * in[7];
+      float odd3 =
+          kC9[7] * in[1] - kC9[3] * in[3] + kC9[1] * in[5] - kC9[5] * in[7];
+      out[0] = even0 + odd0;
+      out[8] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[7] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[6] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[5] = even3 - odd3;
+      out[4] = even4;
+      break;
+    }
+    case 10: {
+      static constexpr float kC10[10] = {
+          1.414213562373, 1.396802246667, 1.344997023928, 1.260073510670,
+          1.144122805635, 1.000000000000, 0.831253875555, 0.642039521920,
+          0.437016024449, 0.221231742082,
+      };
+      float even0 = in[0] + kC10[2] * in[2] + kC10[4] * in[4] + kC10[6] * in[6];
+      float even1 = in[0] + kC10[6] * in[2] - kC10[8] * in[4] - kC10[2] * in[6];
+      float even2 = in[0] - kC10[0] * in[4];
+      float even3 = in[0] - kC10[6] * in[2] - kC10[8] * in[4] + kC10[2] * in[6];
+      float even4 = in[0] - kC10[2] * in[2] + kC10[4] * in[4] - kC10[6] * in[6];
+      float odd0 =
+          kC10[1] * in[1] + kC10[3] * in[3] + kC10[5] * in[5] + kC10[7] * in[7];
+      float odd1 =
+          kC10[3] * in[1] + kC10[9] * in[3] - kC10[5] * in[5] - kC10[1] * in[7];
+      float odd2 =
+          kC10[5] * in[1] - kC10[5] * in[3] - kC10[5] * in[5] + kC10[5] * in[7];
+      float odd3 =
+          kC10[7] * in[1] - kC10[1] * in[3] + kC10[5] * in[5] + kC10[9] * in[7];
+      float odd4 =
+          kC10[9] * in[1] - kC10[7] * in[3] + kC10[5] * in[5] - kC10[3] * in[7];
+      out[0] = even0 + odd0;
+      out[9] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[8] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[7] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[6] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[5] = even4 - odd4;
+      break;
+    }
+    case 11: {
+      static constexpr float kC11[11] = {
+          1.414213562373, 1.399818907436, 1.356927976287, 1.286413904599,
+          1.189712155524, 1.068791297809, 0.926112931411, 0.764581576418,
+          0.587485545401, 0.398430002847, 0.201263574413,
+      };
+      float even0 = in[0] + kC11[2] * in[2] + kC11[4] * in[4] + kC11[6] * in[6];
+      float even1 =
+          in[0] + kC11[6] * in[2] - kC11[10] * in[4] - kC11[4] * in[6];
+      float even2 =
+          in[0] + kC11[10] * in[2] - kC11[2] * in[4] - kC11[8] * in[6];
+      float even3 = in[0] - kC11[8] * in[2] - kC11[6] * in[4] + kC11[2] * in[6];
+      float even4 =
+          in[0] - kC11[4] * in[2] + kC11[8] * in[4] + kC11[10] * in[6];
+      float even5 = in[0] - kC11[0] * in[2] + kC11[0] * in[4] - kC11[0] * in[6];
+      float odd0 =
+          kC11[1] * in[1] + kC11[3] * in[3] + kC11[5] * in[5] + kC11[7] * in[7];
+      float odd1 =
+          kC11[3] * in[1] + kC11[9] * in[3] - kC11[7] * in[5] - kC11[1] * in[7];
+      float odd2 =
+          kC11[5] * in[1] - kC11[7] * in[3] - kC11[3] * in[5] + kC11[9] * in[7];
+      float odd3 =
+          kC11[7] * in[1] - kC11[1] * in[3] + kC11[9] * in[5] + kC11[5] * in[7];
+      float odd4 =
+          kC11[9] * in[1] - kC11[5] * in[3] + kC11[1] * in[5] - kC11[3] * in[7];
+      out[0] = even0 + odd0;
+      out[10] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[9] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[8] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[7] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[6] = even4 - odd4;
+      out[5] = even5;
+      break;
+    }
+    case 12: {
+      static constexpr float kC12[12] = {
+          1.414213562373, 1.402114769300, 1.366025403784, 1.306562964876,
+          1.224744871392, 1.121971053594, 1.000000000000, 0.860918669154,
+          0.707106781187, 0.541196100146, 0.366025403784, 0.184591911283,
+      };
+      float even0 = in[0] + kC12[2] * in[2] + kC12[4] * in[4] + kC12[6] * in[6];
+      float even1 = in[0] + kC12[6] * in[2] - kC12[6] * in[6];
+      float even2 =
+          in[0] + kC12[10] * in[2] - kC12[4] * in[4] - kC12[6] * in[6];
+      float even3 =
+          in[0] - kC12[10] * in[2] - kC12[4] * in[4] + kC12[6] * in[6];
+      float even4 = in[0] - kC12[6] * in[2] + kC12[6] * in[6];
+      float even5 = in[0] - kC12[2] * in[2] + kC12[4] * in[4] - kC12[6] * in[6];
+      float odd0 =
+          kC12[1] * in[1] + kC12[3] * in[3] + kC12[5] * in[5] + kC12[7] * in[7];
+      float odd1 =
+          kC12[3] * in[1] + kC12[9] * in[3] - kC12[9] * in[5] - kC12[3] * in[7];
+      float odd2 = kC12[5] * in[1] - kC12[9] * in[3] - kC12[1] * in[5] -
+                   kC12[11] * in[7];
+      float odd3 = kC12[7] * in[1] - kC12[3] * in[3] - kC12[11] * in[5] +
+                   kC12[1] * in[7];
+      float odd4 =
+          kC12[9] * in[1] - kC12[3] * in[3] + kC12[3] * in[5] - kC12[9] * in[7];
+      float odd5 = kC12[11] * in[1] - kC12[9] * in[3] + kC12[7] * in[5] -
+                   kC12[5] * in[7];
+      out[0] = even0 + odd0;
+      out[11] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[10] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[9] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[8] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[7] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[6] = even5 - odd5;
+      break;
+    }
+    case 13: {
+      static constexpr float kC13[13] = {
+          1.414213562373, 1.403902353238, 1.373119086479, 1.322312651445,
+          1.252223920364, 1.163874944761, 1.058554051646, 0.937797056801,
+          0.803364869133, 0.657217812653, 0.501487040539, 0.338443458124,
+          0.170464607981,
+      };
+      float even0 = in[0] + kC13[2] * in[2] + kC13[4] * in[4] + kC13[6] * in[6];
+      float even1 =
+          in[0] + kC13[6] * in[2] + kC13[12] * in[4] - kC13[8] * in[6];
+      float even2 =
+          in[0] + kC13[10] * in[2] - kC13[6] * in[4] - kC13[4] * in[6];
+      float even3 =
+          in[0] - kC13[12] * in[2] - kC13[2] * in[4] + kC13[10] * in[6];
+      float even4 =
+          in[0] - kC13[8] * in[2] - kC13[10] * in[4] + kC13[2] * in[6];
+      float even5 =
+          in[0] - kC13[4] * in[2] + kC13[8] * in[4] - kC13[12] * in[6];
+      float even6 = in[0] - kC13[0] * in[2] + kC13[0] * in[4] - kC13[0] * in[6];
+      float odd0 =
+          kC13[1] * in[1] + kC13[3] * in[3] + kC13[5] * in[5] + kC13[7] * in[7];
+      float odd1 = kC13[3] * in[1] + kC13[9] * in[3] - kC13[11] * in[5] -
+                   kC13[5] * in[7];
+      float odd2 = kC13[5] * in[1] - kC13[11] * in[3] - kC13[1] * in[5] -
+                   kC13[9] * in[7];
+      float odd3 =
+          kC13[7] * in[1] - kC13[5] * in[3] - kC13[9] * in[5] + kC13[3] * in[7];
+      float odd4 = kC13[9] * in[1] - kC13[1] * in[3] + kC13[7] * in[5] +
+                   kC13[11] * in[7];
+      float odd5 = kC13[11] * in[1] - kC13[7] * in[3] + kC13[3] * in[5] -
+                   kC13[1] * in[7];
+      out[0] = even0 + odd0;
+      out[12] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[11] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[10] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[9] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[8] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[7] = even5 - odd5;
+      out[6] = even6;
+      break;
+    }
+    case 14: {
+      static constexpr float kC14[14] = {
+          1.414213562373, 1.405321284327, 1.378756275744, 1.334852607020,
+          1.274162392264, 1.197448846138, 1.105676685997, 1.000000000000,
+          0.881747733790, 0.752406978226, 0.613604268353, 0.467085128785,
+          0.314692122713, 0.158341680609,
+      };
+      float even0 = in[0] + kC14[2] * in[2] + kC14[4] * in[4] + kC14[6] * in[6];
+      float even1 =
+          in[0] + kC14[6] * in[2] + kC14[12] * in[4] - kC14[10] * in[6];
+      float even2 =
+          in[0] + kC14[10] * in[2] - kC14[8] * in[4] - kC14[2] * in[6];
+      float even3 = in[0] - kC14[0] * in[4];
+      float even4 =
+          in[0] - kC14[10] * in[2] - kC14[8] * in[4] + kC14[2] * in[6];
+      float even5 =
+          in[0] - kC14[6] * in[2] + kC14[12] * in[4] + kC14[10] * in[6];
+      float even6 = in[0] - kC14[2] * in[2] + kC14[4] * in[4] - kC14[6] * in[6];
+      float odd0 =
+          kC14[1] * in[1] + kC14[3] * in[3] + kC14[5] * in[5] + kC14[7] * in[7];
+      float odd1 = kC14[3] * in[1] + kC14[9] * in[3] - kC14[13] * in[5] -
+                   kC14[7] * in[7];
+      float odd2 = kC14[5] * in[1] - kC14[13] * in[3] - kC14[3] * in[5] -
+                   kC14[7] * in[7];
+      float odd3 =
+          kC14[7] * in[1] - kC14[7] * in[3] - kC14[7] * in[5] + kC14[7] * in[7];
+      float odd4 = kC14[9] * in[1] - kC14[1] * in[3] + kC14[11] * in[5] +
+                   kC14[7] * in[7];
+      float odd5 = kC14[11] * in[1] - kC14[5] * in[3] + kC14[1] * in[5] -
+                   kC14[7] * in[7];
+      float odd6 = kC14[13] * in[1] - kC14[11] * in[3] + kC14[9] * in[5] -
+                   kC14[7] * in[7];
+      out[0] = even0 + odd0;
+      out[13] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[12] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[11] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[10] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[9] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[8] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[7] = even6 - odd6;
+      break;
+    }
+    case 15: {
+      static constexpr float kC15[15] = {
+          1.414213562373, 1.406466352507, 1.383309602960, 1.344997023928,
+          1.291948376043, 1.224744871392, 1.144122805635, 1.050965490998,
+          0.946293578512, 0.831253875555, 0.707106781187, 0.575212476952,
+          0.437016024449, 0.294031532930, 0.147825570407,
+      };
+      float even0 = in[0] + kC15[2] * in[2] + kC15[4] * in[4] + kC15[6] * in[6];
+      float even1 =
+          in[0] + kC15[6] * in[2] + kC15[12] * in[4] - kC15[12] * in[6];
+      float even2 =
+          in[0] + kC15[10] * in[2] - kC15[10] * in[4] - kC15[0] * in[6];
+      float even3 =
+          in[0] + kC15[14] * in[2] - kC15[2] * in[4] - kC15[12] * in[6];
+      float even4 =
+          in[0] - kC15[12] * in[2] - kC15[6] * in[4] + kC15[6] * in[6];
+      float even5 =
+          in[0] - kC15[8] * in[2] - kC15[14] * in[4] + kC15[6] * in[6];
+      float even6 =
+          in[0] - kC15[4] * in[2] + kC15[8] * in[4] - kC15[12] * in[6];
+      float even7 = in[0] - kC15[0] * in[2] + kC15[0] * in[4] - kC15[0] * in[6];
+      float odd0 =
+          kC15[1] * in[1] + kC15[3] * in[3] + kC15[5] * in[5] + kC15[7] * in[7];
+      float odd1 = kC15[3] * in[1] + kC15[9] * in[3] - kC15[9] * in[7];
+      float odd2 = kC15[5] * in[1] - kC15[5] * in[5] - kC15[5] * in[7];
+      float odd3 = kC15[7] * in[1] - kC15[9] * in[3] - kC15[5] * in[5] +
+                   kC15[11] * in[7];
+      float odd4 = kC15[9] * in[1] - kC15[3] * in[3] + kC15[3] * in[7];
+      float odd5 = kC15[11] * in[1] - kC15[3] * in[3] + kC15[5] * in[5] -
+                   kC15[13] * in[7];
+      float odd6 = kC15[13] * in[1] - kC15[9] * in[3] + kC15[5] * in[5] -
+                   kC15[1] * in[7];
+      out[0] = even0 + odd0;
+      out[14] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[13] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[12] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[11] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[10] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[9] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[8] = even6 - odd6;
+      out[7] = even7;
+      break;
+    }
+    case 16: {
+      static constexpr float kC16[16] = {
+          1.414213562373, 1.407403737526, 1.387039845322, 1.353318001174,
+          1.306562964876, 1.247225012987, 1.175875602419, 1.093201867002,
+          1.000000000000, 0.897167586343, 0.785694958387, 0.666655658478,
+          0.541196100146, 0.410524527522, 0.275899379283, 0.138617169199,
+      };
+      float even0 = in[0] + kC16[2] * in[2] + kC16[4] * in[4] + kC16[6] * in[6];
+      float even1 =
+          in[0] + kC16[6] * in[2] + kC16[12] * in[4] - kC16[14] * in[6];
+      float even2 =
+          in[0] + kC16[10] * in[2] - kC16[12] * in[4] - kC16[2] * in[6];
+      float even3 =
+          in[0] + kC16[14] * in[2] - kC16[4] * in[4] - kC16[10] * in[6];
+      float even4 =
+          in[0] - kC16[14] * in[2] - kC16[4] * in[4] + kC16[10] * in[6];
+      float even5 =
+          in[0] - kC16[10] * in[2] - kC16[12] * in[4] + kC16[2] * in[6];
+      float even6 =
+          in[0] - kC16[6] * in[2] + kC16[12] * in[4] + kC16[14] * in[6];
+      float even7 = in[0] - kC16[2] * in[2] + kC16[4] * in[4] - kC16[6] * in[6];
+      float odd0 = (kC16[1] * in[1] + kC16[3] * in[3] + kC16[5] * in[5] +
+                    kC16[7] * in[7]);
+      float odd1 = (kC16[3] * in[1] + kC16[9] * in[3] + kC16[15] * in[5] -
+                    kC16[11] * in[7]);
+      float odd2 = (kC16[5] * in[1] + kC16[15] * in[3] - kC16[7] * in[5] -
+                    kC16[3] * in[7]);
+      float odd3 = (kC16[7] * in[1] - kC16[11] * in[3] - kC16[3] * in[5] +
+                    kC16[15] * in[7]);
+      float odd4 = (kC16[9] * in[1] - kC16[5] * in[3] - kC16[13] * in[5] +
+                    kC16[1] * in[7]);
+      float odd5 = (kC16[11] * in[1] - kC16[1] * in[3] + kC16[9] * in[5] +
+                    kC16[13] * in[7]);
+      float odd6 = (kC16[13] * in[1] - kC16[7] * in[3] + kC16[1] * in[5] -
+                    kC16[5] * in[7]);
+      float odd7 = (kC16[15] * in[1] - kC16[13] * in[3] + kC16[11] * in[5] -
+                    kC16[9] * in[7]);
+      out[0] = even0 + odd0;
+      out[15] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[14] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[13] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[12] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[11] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[10] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[9] = even6 - odd6;
+      out[7] = even7 + odd7;
+      out[8] = even7 - odd7;
+      break;
+    }
+  }
+}
+
+// Dequantizes `qblock` into scratch space and writes a dctsize x dctsize
+// block of samples to `output`, whose rows are `output_stride` floats apart.
+void InverseTransformBlockGeneric(const int16_t* JXL_RESTRICT qblock,
+                                  const float* JXL_RESTRICT dequant,
+                                  const float* JXL_RESTRICT biases,
+                                  float* JXL_RESTRICT scratch_space,
+                                  float* JXL_RESTRICT output,
+                                  size_t output_stride, size_t dctsize) {
+  float* JXL_RESTRICT coeffs = scratch_space;
+  float* JXL_RESTRICT tmp = scratch_space + DCTSIZE2;
+  DequantBlock(qblock, dequant, biases, coeffs);
+  if (dctsize == 1) {
+    output[0] = coeffs[0];
+    return;
+  }
+  if (dctsize == 2 || dctsize == 4) {
+    // Each output sample is the mean of a square group of `full` (8 per row).
+    float* JXL_RESTRICT full = scratch_space + 2 * DCTSIZE2;
+    ComputeScaledIDCT(coeffs, tmp, full, 8);
+    for (size_t iy = 0; iy < dctsize; ++iy) {
+      float* out_row = output + iy * output_stride;
+      for (size_t ix = 0; ix < dctsize; ++ix) {
+        if (dctsize == 4) {
+          const float* p = &full[16 * iy + 2 * ix];
+          out_row[ix] = 0.25f * (p[0] + p[1] + p[8] + p[9]);
+        } else {
+          const float* p = &full[32 * iy + 4 * ix];
+          out_row[ix] =
+              0.0625f *
+              (p[0] + p[1] + p[2] + p[3] + p[8] + p[9] + p[10] + p[11] +
+               p[16] + p[17] + p[18] + p[19] + p[24] + p[25] + p[26] + p[27]);
+        }
+      }
+    }
+    return;
+  }
+  // Two 1-D passes: stride-DCTSIZE slices of coeffs, then the rows of tmp.
+  float col_in[DCTSIZE];
+  float col_out[DCTSIZE * 2];
+  const size_t num_in = std::min<size_t>(dctsize, DCTSIZE);
+  for (size_t ix = 0; ix < num_in; ++ix) {
+    for (size_t iy = 0; iy < num_in; ++iy) {
+      col_in[iy] = coeffs[iy * DCTSIZE + ix];
+    }
+    Compute1dIDCT(col_in, col_out, dctsize);
+    for (size_t iy = 0; iy < dctsize; ++iy) {
+      tmp[iy * dctsize + ix] = col_out[iy];
+    }
+  }
+  for (size_t iy = 0; iy < dctsize; ++iy) {
+    Compute1dIDCT(tmp + iy * dctsize, output + iy * output_stride, dctsize);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(InverseTransformBlock8x8);
+HWY_EXPORT(InverseTransformBlockGeneric);
+
+// Picks the inverse transform implementation for each image component.
+void ChooseInverseTransform(j_decompress_ptr cinfo) {
+  for (int idx = 0; idx < cinfo->num_components; ++idx) {
+    auto& transform = cinfo->master->inverse_transform[idx];
+    if (cinfo->master->scaled_dct_size[idx] != DCTSIZE) {
+      transform = HWY_DYNAMIC_DISPATCH(InverseTransformBlockGeneric);
+    } else {
+      transform = HWY_DYNAMIC_DISPATCH(InverseTransformBlock8x8);
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/idct.h b/third_party/jpeg-xl/lib/jpegli/idct.h
new file mode 100644
index 0000000000..21c5c452e6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/idct.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_IDCT_H_
+#define LIB_JPEGLI_IDCT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stddef.h>
+#include <stdint.h>
+/* clang-format on */
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+// Sets cinfo->master->inverse_transform[c] for every component c.
+void ChooseInverseTransform(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_IDCT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/input.cc b/third_party/jpeg-xl/lib/jpegli/input.cc
new file mode 100644
index 0000000000..765bf98946
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/input.cc
@@ -0,0 +1,414 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/input.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_FULL(float);
+using DU = HWY_FULL(uint32_t);
+using DU8 = Rebind<uint8_t, D>;
+using DU16 = Rebind<uint16_t, D>;
+
+constexpr D d;
+constexpr DU du;
+constexpr DU8 du8;
+constexpr DU16 du16;
+
+static constexpr double kMul16 = 1.0 / 257.0;  // 65535 / 257 = 255
+static constexpr double kMulFloat = 255.0;  // factor applied to float samples
+
+// Scalar conversion of pixels [x0, len) of a row with C interleaved channels.
+template <size_t C>
+void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
+                  float* row_out[kMaxComponents]) {
+  for (size_t x = x0; x < len; ++x) {
+    const uint8_t* px = row_in + C * x;
+    for (size_t c = 0; c < C; ++c) row_out[c][x] = px[c];
+  }
+}
+
+// Scalar path for pixels [x0, len): optional byte swap, then scale by kMul16.
+template <size_t C, bool swap_endianness = false>
+void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
+                   float* row_out[kMaxComponents]) {
+  const uint16_t* src = reinterpret_cast<const uint16_t*>(row_in) + C * x0;
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c, ++src) {
+      const uint16_t sample = swap_endianness ? JXL_BSWAP16(*src) : *src;
+      row_out[c][x] = sample * kMul16;
+    }
+  }
+}
+
+// Scalar path for pixels [x0, len): optional byte swap, scale by kMulFloat.
+template <size_t C, bool swap_endianness = false>
+void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
+                  float* row_out[kMaxComponents]) {
+  const float* src = reinterpret_cast<const float*>(row_in) + C * x0;
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c, ++src) {
+      const float sample = swap_endianness ? BSwapFloat(*src) : *src;
+      row_out[c][x] = sample * kMulFloat;
+    }
+  }
+}
+
+void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
+                        float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  float* JXL_RESTRICT const dst = row_out[0];
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + pos))), d, dst + pos);
+  }
+  ReadUint8Row<1>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  Vec<DU8> ch0, ch1;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved2(du8, row_in + 2 * pos, ch0, ch1);
+    Store(ConvertTo(d, PromoteTo(du, ch0)), d, dst0 + pos);
+    Store(ConvertTo(d, PromoteTo(du, ch1)), d, dst1 + pos);
+  }
+  ReadUint8Row<2>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  float* JXL_RESTRICT const dst2 = row_out[2];
+  Vec<DU8> ch0, ch1, ch2;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved3(du8, row_in + 3 * pos, ch0, ch1, ch2);
+    Store(ConvertTo(d, PromoteTo(du, ch0)), d, dst0 + pos);
+    Store(ConvertTo(d, PromoteTo(du, ch1)), d, dst1 + pos);
+    Store(ConvertTo(d, PromoteTo(du, ch2)), d, dst2 + pos);
+  }
+  ReadUint8Row<3>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  float* JXL_RESTRICT const dst2 = row_out[2];
+  float* JXL_RESTRICT const dst3 = row_out[3];
+  Vec<DU8> ch0, ch1, ch2, ch3;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved4(du8, row_in + 4 * pos, ch0, ch1, ch2, ch3);
+    Store(ConvertTo(d, PromoteTo(du, ch0)), d, dst0 + pos);
+    Store(ConvertTo(d, PromoteTo(du, ch1)), d, dst1 + pos);
+    Store(ConvertTo(d, PromoteTo(du, ch2)), d, dst2 + pos);
+    Store(ConvertTo(d, PromoteTo(du, ch3)), d, dst3 + pos);
+  }
+  ReadUint8Row<4>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
+                         float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const src =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const dst = row_out[0];
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    const auto wide = PromoteTo(du, LoadU(du16, src + pos));
+    Store(Mul(scale, ConvertTo(d, wide)), d, dst + pos);
+  }
+  ReadUint16Row<1>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const src =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  Vec<DU16> ch0, ch1;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved2(du16, src + 2 * pos, ch0, ch1);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch0))), d, dst0 + pos);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch1))), d, dst1 + pos);
+  }
+  ReadUint16Row<2>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const src =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  float* JXL_RESTRICT const dst2 = row_out[2];
+  Vec<DU16> ch0, ch1, ch2;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved3(du16, src + 3 * pos, ch0, ch1, ch2);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch0))), d, dst0 + pos);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch1))), d, dst1 + pos);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch2))), d, dst2 + pos);
+  }
+  ReadUint16Row<3>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const src =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  float* JXL_RESTRICT const dst2 = row_out[2];
+  float* JXL_RESTRICT const dst3 = row_out[3];
+  Vec<DU16> ch0, ch1, ch2, ch3;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved4(du16, src + 4 * pos, ch0, ch1, ch2, ch3);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch0))), d, dst0 + pos);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch1))), d, dst1 + pos);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch2))), d, dst2 + pos);
+    Store(Mul(scale, ConvertTo(d, PromoteTo(du, ch3))), d, dst3 + pos);
+  }
+  ReadUint16Row<4>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
+                             float* row_out[kMaxComponents]) {
+  ReadUint16Row<1, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<2, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<3, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<4, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
+                        float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const src = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const dst = row_out[0];
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    Store(Mul(scale, LoadU(d, src + pos)), d, dst + pos);
+  }
+  ReadFloatRow<1>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const src = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  Vec<D> ch0, ch1;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved2(d, src + 2 * pos, ch0, ch1);
+    Store(Mul(scale, ch0), d, dst0 + pos);
+    Store(Mul(scale, ch1), d, dst1 + pos);
+  }
+  ReadFloatRow<2>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const src = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  float* JXL_RESTRICT const dst2 = row_out[2];
+  Vec<D> ch0, ch1, ch2;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved3(d, src + 3 * pos, ch0, ch1, ch2);
+    Store(Mul(scale, ch0), d, dst0 + pos);
+    Store(Mul(scale, ch1), d, dst1 + pos);
+    Store(Mul(scale, ch2), d, dst2 + pos);
+  }
+  ReadFloatRow<3>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t lanes = Lanes(d);
+  const size_t vec_len = len & ~(lanes - 1);  // covered by the vector loop
+  const auto scale = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const src = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const dst0 = row_out[0];
+  float* JXL_RESTRICT const dst1 = row_out[1];
+  float* JXL_RESTRICT const dst2 = row_out[2];
+  float* JXL_RESTRICT const dst3 = row_out[3];
+  Vec<D> ch0, ch1, ch2, ch3;
+  for (size_t pos = 0; pos < vec_len; pos += lanes) {
+    LoadInterleaved4(d, src + 4 * pos, ch0, ch1, ch2, ch3);
+    Store(Mul(scale, ch0), d, dst0 + pos);
+    Store(Mul(scale, ch1), d, dst1 + pos);
+    Store(Mul(scale, ch2), d, dst2 + pos);
+    Store(Mul(scale, ch3), d, dst3 + pos);
+  }
+  ReadFloatRow<4>(row_in, vec_len, len, row_out);  // scalar remainder
+}
+
+void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
+                            float* row_out[kMaxComponents]) {
+  ReadFloatRow<1, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<2, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<3, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<4, true>(row_in, 0, len, row_out);  // whole row, byte-swapped
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(ReadUint8RowSingle);
+HWY_EXPORT(ReadUint8RowInterleaved2);
+HWY_EXPORT(ReadUint8RowInterleaved3);
+HWY_EXPORT(ReadUint8RowInterleaved4);
+HWY_EXPORT(ReadUint16RowSingle);
+HWY_EXPORT(ReadUint16RowInterleaved2);
+HWY_EXPORT(ReadUint16RowInterleaved3);
+HWY_EXPORT(ReadUint16RowInterleaved4);
+HWY_EXPORT(ReadUint16RowSingleSwap);
+HWY_EXPORT(ReadUint16RowInterleaved2Swap);
+HWY_EXPORT(ReadUint16RowInterleaved3Swap);
+HWY_EXPORT(ReadUint16RowInterleaved4Swap);
+HWY_EXPORT(ReadFloatRowSingle);
+HWY_EXPORT(ReadFloatRowInterleaved2);
+HWY_EXPORT(ReadFloatRowInterleaved3);
+HWY_EXPORT(ReadFloatRowInterleaved4);
+HWY_EXPORT(ReadFloatRowSingleSwap);
+HWY_EXPORT(ReadFloatRowInterleaved2Swap);
+HWY_EXPORT(ReadFloatRowInterleaved3Swap);
+HWY_EXPORT(ReadFloatRowInterleaved4Swap);
+
+void ChooseInputMethod(j_compress_ptr cinfo) {
+  jpeg_comp_master* state = cinfo->master;
+  const bool swap_bytes =
+      state->endianness ==
+      (IsLittleEndian() ? JPEGLI_BIG_ENDIAN : JPEGLI_LITTLE_ENDIAN);
+  state->input_method = nullptr;  // stays null if no reader matches below
+  if (state->data_type == JPEGLI_TYPE_UINT8) {
+    if (cinfo->input_components == 1 || cinfo->raw_data_in) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
+    } else if (cinfo->input_components == 2) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
+    }
+  } else if (state->data_type == JPEGLI_TYPE_UINT16 && !swap_bytes) {
+    if (cinfo->input_components == 1 || cinfo->raw_data_in) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
+    } else if (cinfo->input_components == 2) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
+    }
+  } else if (state->data_type == JPEGLI_TYPE_UINT16 && swap_bytes) {
+    if (cinfo->input_components == 1 || cinfo->raw_data_in) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
+    } else if (cinfo->input_components == 2) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
+    } else if (cinfo->input_components == 3) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
+    } else if (cinfo->input_components == 4) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
+    }
+  } else if (state->data_type == JPEGLI_TYPE_FLOAT && !swap_bytes) {
+    if (cinfo->input_components == 1 || cinfo->raw_data_in) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
+    } else if (cinfo->input_components == 2) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
+    }
+  } else if (state->data_type == JPEGLI_TYPE_FLOAT && swap_bytes) {
+    if (cinfo->input_components == 1 || cinfo->raw_data_in) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
+    } else if (cinfo->input_components == 2) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
+    } else if (cinfo->input_components == 3) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
+    } else if (cinfo->input_components == 4) {
+      state->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
+    }
+  }
+  if (state->input_method == nullptr) {
+    JPEGLI_ERROR("Could not find input method.");
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/input.h b/third_party/jpeg-xl/lib/jpegli/input.h
new file mode 100644
index 0000000000..27b0e80fdb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/input.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_INPUT_H_
+#define LIB_JPEGLI_INPUT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+// Sets cinfo->master->input_method; calls JPEGLI_ERROR when none matches.
+void ChooseInputMethod(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_INPUT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc b/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc
new file mode 100644
index 0000000000..4914e5e34b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc
@@ -0,0 +1,612 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+// Two bytes (0xff, 0xd9) that SourceManager::LoadNextChunk() hands to the
+// decoder once the real data of a partial (truncated) file is used up.
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+
+// Test-only source manager that feeds the decoder its input in chunks.
+// fill_input_buffer() always returns FALSE, so the tests in this file call
+// LoadNextChunk() themselves whenever a decoder call cannot make progress.
+struct SourceManager {
+  // data/len: the input bytes; only the pointer is stored, nothing is copied
+  //   here.
+  // max_chunk_size: upper bound on the bytes added per LoadNextChunk() call;
+  //   0 means the whole input is handed over in one chunk.
+  // is_partial_file: if true, LoadNextChunk() keeps supplying kFakeEoiMarker
+  //   after the real data is used up instead of returning false.
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size,
+                bool is_partial_file)
+      : data_(data),
+        len_(len),
+        pos_(0),
+        max_chunk_size_(max_chunk_size),
+        is_partial_file_(is_partial_file) {
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+  }
+
+  // Checks on destruction that the decoder consumed every byte it was given
+  // and, for complete files, that the whole input was handed out.
+  ~SourceManager() {
+    EXPECT_EQ(0, pub_.bytes_in_buffer);
+    if (!is_partial_file_) {
+      EXPECT_EQ(len_, pos_);
+    }
+  }
+
+  // Makes the next chunk of input available to the decoder. Returns false if
+  // the input is exhausted and this is not a partial file.
+  bool LoadNextChunk() {
+    if (pos_ >= len_ && !is_partial_file_) {
+      return false;
+    }
+    // Keep the bytes the decoder has not consumed yet by moving them to the
+    // front of the buffer.
+    if (pub_.bytes_in_buffer > 0) {
+      EXPECT_LE(pub_.bytes_in_buffer, buffer_.size());
+      memmove(&buffer_[0], pub_.next_input_byte, pub_.bytes_in_buffer);
+    }
+    // Past the end of the data (partial files only) the chunk is the 2-byte
+    // fake EOI marker.
+    size_t chunk_size =
+        pos_ < len_ ? std::min(len_ - pos_, max_chunk_size_) : 2;
+    buffer_.resize(pub_.bytes_in_buffer + chunk_size);
+    memcpy(&buffer_[pub_.bytes_in_buffer],
+           pos_ < len_ ? data_ + pos_ : kFakeEoiMarker, chunk_size);
+    pub_.next_input_byte = &buffer_[0];
+    pub_.bytes_in_buffer += chunk_size;
+    pos_ += chunk_size;
+    return true;
+  }
+
+ private:
+  // The tests install this object as cinfo->src via reinterpret_cast and the
+  // callbacks below cast cinfo->src back to SourceManager*; this presumably
+  // relies on pub_ being the first data member.
+  jpeg_source_mgr pub_;
+  // Bytes currently exposed to the decoder (leftover bytes + latest chunk).
+  std::vector<uint8_t> buffer_;
+  const uint8_t* data_;
+  size_t len_;
+  // Offset in data_ of the next byte to hand out; may move past len_ when
+  // skip_input_data() skips bytes that were never loaded.
+  size_t pos_;
+  size_t max_chunk_size_;
+  bool is_partial_file_;
+
+  // Starts with an empty buffer; data only arrives through LoadNextChunk().
+  static void init_source(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    src->pub_.next_input_byte = nullptr;
+    src->pub_.bytes_in_buffer = 0;
+  }
+
+  // Never supplies data on its own; always returns FALSE.
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) { return FALSE; }
+
+  // Skips num_bytes of input. Bytes already in the buffer are consumed first;
+  // whatever remains is skipped by advancing pos_, so those bytes are never
+  // loaded into the buffer.
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  // Nothing to release.
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+// Marker codes recorded by test_marker_processor(), in the order seen, and the
+// number of markers processed so far. The count is reset by the test before
+// decoding starts.
+uint8_t markers_seen[kMarkerSequenceLen];
+size_t num_markers_seen = 0;
+
+// Pops one byte from the source manager's buffer: returns the byte at
+// next_input_byte and advances the pointer / decrements bytes_in_buffer.
+// Does not check that a byte is available.
+uint8_t get_next_byte(j_decompress_ptr cinfo) {
+  auto* source = cinfo->src;
+  const uint8_t value = *source->next_input_byte;
+  ++source->next_input_byte;
+  --source->bytes_in_buffer;
+  return value;
+}
+
+// Marker processor installed by the tests for the 0xe6..0xe8 markers. Records
+// the marker code, reads the 2-byte big-endian marker length, checks it
+// against the expected length for this position in the sequence and skips the
+// marker payload.
+//
+// Returns FALSE without consuming anything if fewer than two bytes are
+// buffered, so that the decoder suspends and calls again later; returns TRUE
+// once the marker has been processed.
+boolean test_marker_processor(j_decompress_ptr cinfo) {
+  // Fail the test instead of writing past the array if the stream contains
+  // more markers than expected.
+  EXPECT_LT(num_markers_seen, kMarkerSequenceLen);
+  if (num_markers_seen < kMarkerSequenceLen) {
+    markers_seen[num_markers_seen] = cinfo->unread_marker;
+  }
+  if (cinfo->src->bytes_in_buffer < 2) {
+    return FALSE;
+  }
+  // Read the two length bytes in separate statements: the operands of '+' may
+  // be evaluated in either order, which could swap the high and low byte.
+  const size_t len_hi = get_next_byte(cinfo);
+  const size_t len_lo = get_next_byte(cinfo);
+  const size_t marker_len = (len_hi << 8) + len_lo;
+  EXPECT_EQ(2 + ((num_markers_seen + 2) % sizeof(kMarkerData)), marker_len);
+  // The length includes its own two bytes; skip only the payload.
+  if (marker_len > 2) {
+    (*cinfo->src->skip_input_data)(cinfo, marker_len - 2);
+  }
+  ++num_markers_seen;
+  return TRUE;
+}
+
+// Reads all rows of the current output pass from cinfo into *output, either as
+// raw component data (cinfo->raw_data_out) or as interleaved scanlines.
+// Whenever a read call returns zero lines, one more input chunk is loaded from
+// src and the call is retried. src may be nullptr when the caller has already
+// consumed all input; in that case any read that needs more input fails the
+// JXL_CHECK.
+void ReadOutputImage(const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     SourceManager* src, TestImage* output) {
+  output->ysize = cinfo->output_height;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->num_components;
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    // One plane per component, sized in whole DCT blocks.
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+  } else {
+    output->color_space = cinfo->out_color_space;
+    output->AllocatePixels();
+  }
+  size_t total_output_lines = 0;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    size_t max_lines;
+    size_t num_output_lines;
+    if (cinfo->raw_data_out) {
+      // Raw data is read one iMCU row at a time.
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      EXPECT_EQ(cinfo->output_scanline, cinfo->output_iMCU_row * iMCU_height);
+      max_lines = iMCU_height;
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        // Rows beyond the bottom of the plane get a null row pointer.
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      while ((num_output_lines =
+                  jpegli_read_raw_data(cinfo, &data[0], max_lines)) == 0) {
+        JXL_CHECK(src && src->LoadNextChunk());
+      }
+    } else {
+      // max_output_lines == 0 means: ask for all remaining lines at once.
+      size_t max_output_lines = dparams.max_output_lines;
+      if (max_output_lines == 0) max_output_lines = cinfo->output_height;
+      size_t lines_left = cinfo->output_height - cinfo->output_scanline;
+      max_lines = std::min<size_t>(max_output_lines, lines_left);
+      size_t stride = cinfo->output_width * cinfo->num_components;
+      std::vector<JSAMPROW> scanlines(max_lines);
+      for (size_t i = 0; i < max_lines; ++i) {
+        size_t yidx = cinfo->output_scanline + i;
+        scanlines[i] = &output->pixels[yidx * stride];
+      }
+      while ((num_output_lines = jpegli_read_scanlines(cinfo, &scanlines[0],
+                                                       max_lines)) == 0) {
+        JXL_CHECK(src && src->LoadNextChunk());
+      }
+    }
+    total_output_lines += num_output_lines;
+    EXPECT_EQ(total_output_lines, cinfo->output_scanline);
+    // A short read is also treated as "needs more input".
+    if (num_output_lines < max_lines) {
+      JXL_CHECK(src && src->LoadNextChunk());
+    }
+  }
+}
+
+// Parameters of one test case.
+struct TestConfig {
+  // Test data file to decode; if empty, the input is generated and encoded
+  // with jpegli instead (see GetTestJpegData()).
+  std::string fn;
+  // Short description of the file, used in the test name and by
+  // IsSequential().
+  std::string fn_desc;
+  // Image that is generated and encoded when fn is empty.
+  TestImage input;
+  // Encoder settings for the generated input.
+  CompressParams jparams;
+  // Decoder settings (chunk size, output mode, partial-file factor, ...).
+  DecompressParams dparams;
+  // Distance limit passed to VerifyOutputImage().
+  float max_rms_dist = 1.0f;
+};
+
+// Returns the JPEG bytes for a test case: the contents of config.fn if a file
+// is given, otherwise config.input is filled with generated pixels and encoded
+// with jpegli using config.jparams.
+std::vector<uint8_t> GetTestJpegData(TestConfig& config) {
+  const bool has_file = !config.fn.empty();
+  if (has_file) {
+    return ReadTestData(config.fn.c_str());
+  }
+  std::vector<uint8_t> jpeg_bytes;
+  GeneratePixels(&config.input);
+  JXL_CHECK(EncodeWithJpegli(config.input, config.jparams, &jpeg_bytes));
+  return jpeg_bytes;
+}
+
+// Tells whether the test input is a sequential JPEG. File-based configs are
+// sequential unless their description contains "PROGR"; generated configs are
+// sequential when progressive_mode is not positive.
+bool IsSequential(const TestConfig& config) {
+  const bool is_generated = config.fn.empty();
+  if (is_generated) {
+    return config.jparams.progressive_mode <= 0;
+  }
+  const bool marked_progressive =
+      config.fn_desc.find("PROGR") != std::string::npos;
+  return !marked_progressive;
+}
+
+// Fixture for the value-parameterized tests below; each test runs once per
+// TestConfig.
+class InputSuspensionTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+// Decodes in non-buffered mode while supplying the input chunk by chunk: each
+// time a jpegli call suspends, one more chunk is loaded and the call is
+// retried. The result is compared with DecodeWithLibjpeg() on the same bytes.
+TEST_P(InputSuspensionTestParam, InputOutputLockStepNonBuffered) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  // size_factor < 1 truncates the stream to simulate a partial file.
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    if (config.jparams.add_marker) {
+      jpegli_save_markers(&cinfo, kSpecialMarker0, 0xffff);
+      jpegli_save_markers(&cinfo, kSpecialMarker1, 0xffff);
+      num_markers_seen = 0;
+      jpegli_set_marker_processor(&cinfo, 0xe6, test_marker_processor);
+      jpegli_set_marker_processor(&cinfo, 0xe7, test_marker_processor);
+      jpegli_set_marker_processor(&cinfo, 0xe8, test_marker_processor);
+    }
+    while (jpegli_read_header(&cinfo, TRUE) == JPEG_SUSPENDED) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    SetDecompressParams(dparams, &cinfo, true);
+    // All custom markers must have gone through test_marker_processor() by
+    // the time the header has been read.
+    if (config.jparams.add_marker) {
+      EXPECT_EQ(num_markers_seen, kMarkerSequenceLen);
+      EXPECT_EQ(0, memcmp(markers_seen, kMarkerSequence, num_markers_seen));
+    }
+    VerifyHeader(config.jparams, &cinfo);
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays;
+      while ((coef_arrays = jpegli_read_coefficients(&cinfo)) == nullptr) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    } else {
+      while (!jpegli_start_decompress(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      ReadOutputImage(dparams, &cinfo, &src, &output0);
+    }
+
+    while (!jpegli_finish_decompress(&cinfo)) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  // Reference decode of the same (possibly truncated) bytes.
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+  VerifyOutputImage(output1, output0, config.max_rms_dist);
+}
+
+// Decodes in buffered-image mode, producing one output image per scan while
+// the input is supplied chunk by chunk, and checks the scan counters after
+// each step. Every intermediate image is compared with the corresponding
+// result of DecodeAllScansWithLibjpeg(). Skipped for configs with custom
+// markers.
+TEST_P(InputSuspensionTestParam, InputOutputLockStepBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  // size_factor < 1 truncates the stream to simulate a partial file.
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  std::vector<TestImage> output_progression0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    while (jpegli_read_header(&cinfo, TRUE) == JPEG_SUSPENDED) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    SetDecompressParams(dparams, &cinfo, true);
+
+    cinfo.buffered_image = TRUE;
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+
+    EXPECT_TRUE(jpegli_start_decompress(&cinfo));
+    EXPECT_FALSE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    int sos_marker_cnt = 1;  // read_header reads the first SOS marker
+    while (!jpegli_input_complete(&cinfo)) {
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      EXPECT_TRUE(jpegli_start_output(&cinfo, cinfo.input_scan_number));
+      // start output sets output_scan_number, but does not change
+      // input_scan_number
+      EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      TestImage output;
+      ReadOutputImage(dparams, &cinfo, &src, &output);
+      output_progression0.emplace_back(std::move(output));
+      // read scanlines/read raw data does not change input/output scan number
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+      while (!jpegli_finish_output(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+      if (dparams.output_mode == COEFFICIENTS) {
+        jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+        JXL_CHECK(coef_arrays != nullptr);
+        CopyCoefficients(&cinfo, coef_arrays, &output_progression0.back());
+      }
+    }
+
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  // Reference: one image per scan, decoded from the same bytes.
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+  ASSERT_EQ(output_progression0.size(), output_progression1.size());
+  for (size_t i = 0; i < output_progression0.size(); ++i) {
+    const TestImage& output = output_progression0[i];
+    const TestImage& expected = output_progression1[i];
+    VerifyOutputImage(expected, output, config.max_rms_dist);
+  }
+}
+
+// Buffered-image mode where the whole input is consumed with
+// jpegli_consume_input() before any output is produced. The single output
+// image is compared with the last scan of DecodeAllScansWithLibjpeg().
+// Skipped for configs with custom markers.
+TEST_P(InputSuspensionTestParam, PreConsumeInputBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  // size_factor < 1 truncates the stream to simulate a partial file.
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  // The reference decode comes first because the expected number of scans is
+  // checked inside the lambda below.
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    // Consume input up to the first SOS marker.
+    int status;
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_SOS) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+    EXPECT_EQ(JPEG_REACHED_SOS, jpegli_consume_input(&cinfo));
+    cinfo.buffered_image = TRUE;
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+    cinfo.do_block_smoothing = dparams.do_block_smoothing;
+
+    EXPECT_TRUE(jpegli_start_decompress(&cinfo));
+    EXPECT_FALSE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(1, cinfo.input_scan_number);
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    // Consume the rest of the input before producing any output.
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_EOI) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    EXPECT_TRUE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    EXPECT_TRUE(jpegli_start_output(&cinfo, cinfo.input_scan_number));
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+
+    // No source is passed: reading the output must not need more input.
+    ReadOutputImage(dparams, &cinfo, nullptr, &output0);
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+
+    EXPECT_TRUE(jpegli_finish_output(&cinfo));
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    }
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  VerifyOutputImage(output_progression1.back(), output0, config.max_rms_dist);
+}
+
+// Non-buffered mode where the whole input is consumed with
+// jpegli_consume_input() before the output is read. The result is compared
+// with DecodeWithLibjpeg(). Skipped for sequential inputs and for configs with
+// custom markers.
+TEST_P(InputSuspensionTestParam, PreConsumeInputNonBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker || IsSequential(config)) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  // size_factor < 1 truncates the stream to simulate a partial file.
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    // Consume input up to the first SOS marker.
+    int status;
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_SOS) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+    EXPECT_EQ(JPEG_REACHED_SOS, jpegli_consume_input(&cinfo));
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+    cinfo.do_block_smoothing = dparams.do_block_smoothing;
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      // The result of this first call is not used; the coefficient arrays are
+      // fetched again after all input has been consumed.
+      jpegli_read_coefficients(&cinfo);
+    } else {
+      while (!jpegli_start_decompress(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    // Consume the rest of the input before reading any output.
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_EOI) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    } else {
+      // No source is passed: reading the output must not need more input.
+      ReadOutputImage(dparams, &cinfo, nullptr, &output0);
+    }
+
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+  VerifyOutputImage(output1, output0, config.max_rms_dist);
+}
+
+// Builds the list of test configurations used to instantiate the
+// parameterized tests in this file.
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  // Tests on existing JPEG files (file name, description used in test names).
+  std::vector<std::pair<std::string, std::string>> testfiles({
+      {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+      {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+      {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
+  });
+  for (const auto& it : testfiles) {
+    for (size_t chunk_size : {1, 64, 65536}) {
+      for (size_t max_output_lines : {0, 1, 8, 16}) {
+        TestConfig config;
+        config.fn = it.first;
+        config.fn_desc = it.second;
+        config.dparams.chunk_size = chunk_size;
+        config.dparams.max_output_lines = max_output_lines;
+        all_tests.push_back(config);
+        // Raw-data and coefficient output are only added once per chunk size.
+        if (max_output_lines == 16) {
+          config.dparams.output_mode = RAW_DATA;
+          all_tests.push_back(config);
+          config.dparams.output_mode = COEFFICIENTS;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests on generated input with progressive mode 2 and different restart
+  // intervals.
+  for (size_t r : {1, 17, 1024}) {
+    for (size_t chunk_size : {1, 65536}) {
+      TestConfig config;
+      config.dparams.chunk_size = chunk_size;
+      config.jparams.progressive_mode = 2;
+      config.jparams.restart_interval = r;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests with custom markers added to the generated input.
+  for (size_t chunk_size : {1, 4, 1024}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.dparams.chunk_size = chunk_size;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  // Tests for partial input.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+    for (int progr : {0, 1, 3}) {
+      for (int samp : {1, 2}) {
+        for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+          TestConfig config;
+          config.input.xsize = 517;
+          config.input.ysize = 523;
+          config.jparams.h_sampling = {samp, 1, 1};
+          config.jparams.v_sampling = {samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          config.dparams.size_factor = size_factor;
+          config.dparams.output_mode = output_mode;
+          // The last partially available block can behave differently.
+          // TODO(szabadka) Figure out if we can make the behaviour more
+          // similar.
+          config.max_rms_dist = samp == 1 ? 1.75f : 3.0f;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests for block smoothing.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f, 1.0f}) {
+    for (int samp : {1, 2}) {
+      TestConfig config;
+      config.input.xsize = 517;
+      config.input.ysize = 523;
+      config.jparams.h_sampling = {samp, 1, 1};
+      config.jparams.v_sampling = {samp, 1, 1};
+      config.jparams.progressive_mode = 2;
+      config.dparams.size_factor = size_factor;
+      config.dparams.do_block_smoothing = true;
+      // libjpeg does smoothing for incomplete scans differently at
+      // the border between current and previous scans.
+      config.max_rms_dist = 8.0f;
+      all_tests.push_back(config);
+    }
+  }
+  return all_tests;
+}
+
+// Writes a compact description of the config, used as the test name: input
+// (file description or generated image), encoder parameters, input chunking,
+// partial-file percentage, output line batching, output mode and block
+// smoothing.
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  const DecompressParams& dp = c.dparams;
+  if (c.fn.empty()) {
+    os << c.input;
+  } else {
+    os << c.fn_desc;
+  }
+  os << c.jparams;
+  if (dp.chunk_size != 0) {
+    os << "InputChunks" << dp.chunk_size;
+  } else {
+    os << "CompleteInput";
+  }
+  if (dp.size_factor < 1.0f) {
+    const int percent = static_cast<int>(dp.size_factor * 100);
+    os << "Partial" << percent << "p";
+  }
+  if (dp.max_output_lines != 0) {
+    os << "OutputLines" << dp.max_output_lines;
+  } else {
+    os << "CompleteOutput";
+  }
+  switch (dp.output_mode) {
+    case RAW_DATA:
+      os << "RawDataOut";
+      break;
+    case COEFFICIENTS:
+      os << "CoeffsOut";
+      break;
+    default:
+      break;
+  }
+  if (dp.do_block_smoothing) {
+    os << "BlockSmoothing";
+  }
+  return os;
+}
+
+// Produces the test-name suffix for one parameter by streaming the TestConfig
+// through operator<<.
+std::string TestDescription(
+    const testing::TestParamInfo<InputSuspensionTestParam::ParamType>& info) {
+  std::ostringstream description;
+  description << info.param;
+  return description.str();
+}
+
+// Runs every TEST_P above once per config returned by GenerateTests(), with
+// test names produced by TestDescription().
+JPEGLI_INSTANTIATE_TEST_SUITE_P(InputSuspensionTest, InputSuspensionTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/jpeg.version.62 b/third_party/jpeg-xl/lib/jpegli/jpeg.version.62
new file mode 100644
index 0000000000..3a8d1f5ec5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/jpeg.version.62
@@ -0,0 +1,11 @@
+LIBJPEG_6.2 {
+  global:
+    jpeg*;
+};
+
+LIBJPEGTURBO_6.2 {
+  global:
+    jpeg_mem_src*;
+    jpeg_mem_dest*;
+    tj*;
+};
+\ No newline at end of file
diff --git a/third_party/jpeg-xl/lib/jpegli/jpeg.version.8 b/third_party/jpeg-xl/lib/jpegli/jpeg.version.8
new file mode 100644
index 0000000000..aa891f8571
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/jpeg.version.8
@@ -0,0 +1,9 @@
+LIBJPEG_8.0 {
+  global:
+    jpeg*;
+};
+
+LIBJPEGTURBO_8.0 {
+  global:
+    tj*;
+};
diff --git a/third_party/jpeg-xl/lib/jpegli/libjpeg_wrapper.cc b/third_party/jpeg-xl/lib/jpegli/libjpeg_wrapper.cc
new file mode 100644
index 0000000000..ef5ef224d3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/libjpeg_wrapper.cc
@@ -0,0 +1,260 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains wrapper-functions that are used to build the libjpeg.so
+// shared library that is API- and ABI-compatible with libjpeg-turbo's version
+// of libjpeg.so.
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+
+// Common (compress + decompress) entry points. Each jpeg_* function below
+// passes its arguments unchanged to the jpegli_* function of the same name
+// and returns its result.
+struct jpeg_error_mgr *jpeg_std_error(struct jpeg_error_mgr *err) {
+  return jpegli_std_error(err);
+}
+
+void jpeg_abort(j_common_ptr cinfo) { jpegli_abort(cinfo); }
+
+void jpeg_destroy(j_common_ptr cinfo) { jpegli_destroy(cinfo); }
+
+JQUANT_TBL *jpeg_alloc_quant_table(j_common_ptr cinfo) {
+  return jpegli_alloc_quant_table(cinfo);
+}
+
+JHUFF_TBL *jpeg_alloc_huff_table(j_common_ptr cinfo) {
+  return jpegli_alloc_huff_table(cinfo);
+}
+
+// Decompressor creation and input source setup; each function forwards to the
+// jpegli_* function of the same name.
+void jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+                           size_t structsize) {
+  jpegli_CreateDecompress(cinfo, version, structsize);
+}
+
+void jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile) {
+  jpegli_stdio_src(cinfo, infile);
+}
+
+void jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                  unsigned long insize) {
+  jpegli_mem_src(cinfo, inbuffer, insize);
+}
+
+// Header parsing and output reading; each function forwards to the jpegli_*
+// function of the same name and returns its result unchanged.
+int jpeg_read_header(j_decompress_ptr cinfo, boolean require_image) {
+  return jpegli_read_header(cinfo, require_image);
+}
+
+boolean jpeg_start_decompress(j_decompress_ptr cinfo) {
+  return jpegli_start_decompress(cinfo);
+}
+
+JDIMENSION jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                               JDIMENSION max_lines) {
+  return jpegli_read_scanlines(cinfo, scanlines, max_lines);
+}
+
+JDIMENSION jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines) {
+  return jpegli_skip_scanlines(cinfo, num_lines);
+}
+
+void jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                        JDIMENSION *width) {
+  jpegli_crop_scanline(cinfo, xoffset, width);
+}
+
+boolean jpeg_finish_decompress(j_decompress_ptr cinfo) {
+  return jpegli_finish_decompress(cinfo);
+}
+
+JDIMENSION jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                              JDIMENSION max_lines) {
+  return jpegli_read_raw_data(cinfo, data, max_lines);
+}
+
+jvirt_barray_ptr *jpeg_read_coefficients(j_decompress_ptr cinfo) {
+  return jpegli_read_coefficients(cinfo);
+}
+
+// Multi-scan / buffered-image control; each function forwards to the jpegli_*
+// function of the same name and returns its result unchanged.
+boolean jpeg_has_multiple_scans(j_decompress_ptr cinfo) {
+  return jpegli_has_multiple_scans(cinfo);
+}
+
+boolean jpeg_start_output(j_decompress_ptr cinfo, int scan_number) {
+  return jpegli_start_output(cinfo, scan_number);
+}
+
+boolean jpeg_finish_output(j_decompress_ptr cinfo) {
+  return jpegli_finish_output(cinfo);
+}
+
+boolean jpeg_input_complete(j_decompress_ptr cinfo) {
+  return jpegli_input_complete(cinfo);
+}
+
+int jpeg_consume_input(j_decompress_ptr cinfo) {
+  return jpegli_consume_input(cinfo);
+}
+
+// Output dimension, marker and ICC profile helpers for the decompressor; each
+// function forwards to the jpegli_* function of the same name.
+// jpeg_core_output_dimensions is only compiled when JPEG_LIB_VERSION >= 80.
+#if JPEG_LIB_VERSION >= 80
+void jpeg_core_output_dimensions(j_decompress_ptr cinfo) {
+  jpegli_core_output_dimensions(cinfo);
+}
+#endif
+void jpeg_calc_output_dimensions(j_decompress_ptr cinfo) {
+  jpegli_calc_output_dimensions(cinfo);
+}
+
+void jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                       unsigned int length_limit) {
+  jpegli_save_markers(cinfo, marker_code, length_limit);
+}
+
+void jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                               jpeg_marker_parser_method routine) {
+  jpegli_set_marker_processor(cinfo, marker_code, routine);
+}
+
+boolean jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                              unsigned int *icc_data_len) {
+  return jpegli_read_icc_profile(cinfo, icc_data_ptr, icc_data_len);
+}
+
+// Forwards to jpegli_abort_decompress(). Written as a plain call, like the
+// other void wrappers in this file, rather than returning a void expression.
+void jpeg_abort_decompress(j_decompress_ptr cinfo) {
+  jpegli_abort_decompress(cinfo);
+}
+
+// Forwards to jpegli_destroy_decompress(). Written as a plain call, like the
+// other void wrappers in this file, rather than returning a void expression.
+void jpeg_destroy_decompress(j_decompress_ptr cinfo) {
+  jpegli_destroy_decompress(cinfo);
+}
+
+// Compressor creation and output destination setup; each function forwards to
+// the jpegli_* function of the same name.
+void jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize) {
+  jpegli_CreateCompress(cinfo, version, structsize);
+}
+
+void jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile) {
+  jpegli_stdio_dest(cinfo, outfile);
+}
+
+void jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+                   unsigned long *outsize) {
+  jpegli_mem_dest(cinfo, outbuffer, outsize);
+}
+
+// Compression parameter setup; each function forwards to the jpegli_* function
+// of the same name. jpeg_default_qtables and jpeg_calc_jpeg_dimensions are
+// only compiled when JPEG_LIB_VERSION >= 70.
+void jpeg_set_defaults(j_compress_ptr cinfo) { jpegli_set_defaults(cinfo); }
+
+void jpeg_default_colorspace(j_compress_ptr cinfo) {
+  jpegli_default_colorspace(cinfo);
+}
+
+void jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) {
+  jpegli_set_colorspace(cinfo, colorspace);
+}
+
+void jpeg_set_quality(j_compress_ptr cinfo, int quality,
+                      boolean force_baseline) {
+  jpegli_set_quality(cinfo, quality, force_baseline);
+}
+
+void jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                             boolean force_baseline) {
+  jpegli_set_linear_quality(cinfo, scale_factor, force_baseline);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline) {
+  jpegli_default_qtables(cinfo, force_baseline);
+}
+#endif
+
+int jpeg_quality_scaling(int quality) {
+  return jpegli_quality_scaling(quality);
+}
+
+void jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                          const unsigned int *basic_table, int scale_factor,
+                          boolean force_baseline) {
+  jpegli_add_quant_table(cinfo, which_tbl, basic_table, scale_factor,
+                         force_baseline);
+}
+
+void jpeg_simple_progression(j_compress_ptr cinfo) {
+  jpegli_simple_progression(cinfo);
+}
+
+void jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress) {
+  jpegli_suppress_tables(cinfo, suppress);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo) {
+  jpegli_calc_jpeg_dimensions(cinfo);
+}
+#endif
+
+void jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                   j_compress_ptr dstinfo) {
+  jpegli_copy_critical_parameters(srcinfo, dstinfo);
+}
+
+// Marker and ICC profile writing; each function forwards to the jpegli_*
+// function of the same name.
+void jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+                         unsigned int datalen) {
+  jpegli_write_m_header(cinfo, marker, datalen);
+}
+
+void jpeg_write_m_byte(j_compress_ptr cinfo, int val) {
+  jpegli_write_m_byte(cinfo, val);
+}
+
+void jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+                       unsigned int datalen) {
+  jpegli_write_marker(cinfo, marker, dataptr, datalen);
+}
+
+void jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+                            unsigned int icc_data_len) {
+  jpegli_write_icc_profile(cinfo, icc_data_ptr, icc_data_len);
+}
+
+// Compression data path and teardown; each function forwards to the jpegli_*
+// function of the same name and returns its result unchanged.
+void jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables) {
+  jpegli_start_compress(cinfo, write_all_tables);
+}
+
+void jpeg_write_tables(j_compress_ptr cinfo) { jpegli_write_tables(cinfo); }
+
+JDIMENSION jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                JDIMENSION num_lines) {
+  return jpegli_write_scanlines(cinfo, scanlines, num_lines);
+}
+
+JDIMENSION jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                               JDIMENSION num_lines) {
+  return jpegli_write_raw_data(cinfo, data, num_lines);
+}
+
+void jpeg_write_coefficients(j_compress_ptr cinfo,
+                             jvirt_barray_ptr *coef_arrays) {
+  jpegli_write_coefficients(cinfo, coef_arrays);
+}
+
+void jpeg_finish_compress(j_compress_ptr cinfo) {
+  jpegli_finish_compress(cinfo);
+}
+
+void jpeg_abort_compress(j_compress_ptr cinfo) { jpegli_abort_compress(cinfo); }
+
+void jpeg_destroy_compress(j_compress_ptr cinfo) {
+  jpegli_destroy_compress(cinfo);
+}
+
+// Remaining decompressor entry points; each forwards to the jpegli_* function
+// of the same name.
+boolean jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired) {
+  return jpegli_resync_to_restart(cinfo, desired);
+}
+
+void jpeg_new_colormap(j_decompress_ptr cinfo) { jpegli_new_colormap(cinfo); }
diff --git a/third_party/jpeg-xl/lib/jpegli/memory_manager.cc b/third_party/jpeg-xl/lib/jpegli/memory_manager.cc
new file mode 100644
index 0000000000..f6530d8f02
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/memory_manager.cc
@@ -0,0 +1,181 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/memory_manager.h"
+
+#include <string.h>
+
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/error.h"
+
+// Control block for a virtual sample array. In this implementation the whole
+// array is allocated when it is requested (see RequestVirtualArray).
+struct jvirt_sarray_control {
+  JSAMPARRAY full_buffer;  // row pointers of the fully allocated array
+  size_t numrows;          // total number of rows in full_buffer
+  JDIMENSION maxaccess;    // upper bound on rows requested in one access
+};
+
+// Control block for a virtual coefficient-block array. In this implementation
+// the whole array is allocated when it is requested (see RequestVirtualArray).
+struct jvirt_barray_control {
+  JBLOCKARRAY full_buffer;  // row pointers of the fully allocated array
+  size_t numrows;           // total number of rows in full_buffer
+  JDIMENSION maxaccess;     // upper bound on rows requested in one access
+};
+
+namespace jpegli {
+
+namespace {
+
+// State of jpegli's memory manager. `pub` is the first member; the functions
+// in this file reinterpret_cast between MemoryManager* and the
+// jpeg_memory_mgr* stored in cinfo->mem.
+struct MemoryManager {
+  struct jpeg_memory_mgr pub;
+  // Pointers handed out per pool. Alloc() uses malloc() for indices below
+  // JPOOL_NUMPOOLS and the hwy aligned allocator for the remaining indices.
+  std::vector<void*> owned_ptrs[2 * JPOOL_NUMPOOLS];
+  // Bytes currently accounted to each pool.
+  uint64_t pool_memory_usage[2 * JPOOL_NUMPOOLS];
+  // Bytes currently accounted to all pools together.
+  uint64_t total_memory_usage;
+  // Largest value total_memory_usage has reached so far.
+  uint64_t peak_memory_usage;
+};
+
+// Allocates `sizeofobject` bytes from pool `pool_id` and records the pointer
+// so that FreePool() can release it later. Pool ids below JPOOL_NUMPOOLS are
+// served by malloc(), the others by hwy's aligned allocator. An invalid pool
+// id, exceeding a positive max_memory_to_use, and allocation failure are all
+// reported through JPEGLI_ERROR.
+void* Alloc(j_common_ptr cinfo, int pool_id, size_t sizeofobject) {
+  MemoryManager* mgr = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  const bool valid_pool = (pool_id >= 0 && pool_id < 2 * JPOOL_NUMPOOLS);
+  if (!valid_pool) {
+    JPEGLI_ERROR("Invalid pool id %d", pool_id);
+  }
+  // A non-positive limit means "unlimited".
+  if (mgr->pub.max_memory_to_use > 0) {
+    const uint64_t wanted_total =
+        mgr->total_memory_usage + static_cast<uint64_t>(sizeofobject);
+    if (wanted_total > static_cast<uint64_t>(mgr->pub.max_memory_to_use)) {
+      JPEGLI_ERROR("Total memory usage exceeding %ld",
+                   mgr->pub.max_memory_to_use);
+    }
+  }
+  void* block = (pool_id < JPOOL_NUMPOOLS)
+                    ? malloc(sizeofobject)
+                    : hwy::AllocateAlignedBytes(sizeofobject, nullptr, nullptr);
+  if (block == nullptr) {
+    JPEGLI_ERROR("Out of memory");
+  }
+  // Bookkeeping: remember the block and update the usage counters.
+  mgr->owned_ptrs[pool_id].push_back(block);
+  mgr->pool_memory_usage[pool_id] += sizeofobject;
+  mgr->total_memory_usage += sizeofobject;
+  if (mgr->peak_memory_usage < mgr->total_memory_usage) {
+    mgr->peak_memory_usage = mgr->total_memory_usage;
+  }
+  return block;
+}
+
+// Allocates a 2d array with `numrows` rows. The table of row pointers comes
+// from `pool_id` as given; the element storage is a single block taken from
+// the aligned counterpart of the pool, with the row stride (in elements)
+// obtained from RoundUpTo(samplesperrow, HWY_ALIGNMENT).
+template <typename T>
+T** Alloc2dArray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+                 JDIMENSION numrows) {
+  T** rows = Allocate<T*>(cinfo, numrows, pool_id);
+  // The payload of a 2d array always goes through the aligned allocator.
+  const int payload_pool =
+      (pool_id < JPOOL_NUMPOOLS) ? pool_id + JPOOL_NUMPOOLS : pool_id;
+  const size_t stride = RoundUpTo(samplesperrow, HWY_ALIGNMENT);
+  T* payload = Allocate<T>(cinfo, numrows * stride, payload_pool);
+  T* row_start = payload;
+  for (size_t y = 0; y < numrows; ++y, row_start += stride) {
+    rows[y] = row_start;
+  }
+  return rows;
+}
+
+// Creates the control block of a virtual array and allocates the complete
+// backing 2d array right away. Only JPOOL_IMAGE is accepted as pool id; any
+// other value is reported through JPEGLI_ERROR. When `pre_zero` is set, the
+// first `samplesperrow` elements of every row are cleared.
+template <typename Control, typename T>
+Control* RequestVirtualArray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                             JDIMENSION samplesperrow, JDIMENSION numrows,
+                             JDIMENSION maxaccess) {
+  if (pool_id != JPOOL_IMAGE) {
+    JPEGLI_ERROR("Only image lifetime virtual arrays are supported.");
+  }
+  Control* ctrl = Allocate<Control>(cinfo, 1, pool_id);
+  T** rows = Alloc2dArray<T>(cinfo, pool_id, samplesperrow, numrows);
+  ctrl->full_buffer = rows;
+  ctrl->numrows = numrows;
+  ctrl->maxaccess = maxaccess;
+  if (!pre_zero) {
+    return ctrl;
+  }
+  const size_t row_bytes = samplesperrow * sizeof(T);
+  for (size_t y = 0; y < numrows; ++y) {
+    memset(rows[y], 0, row_bytes);
+  }
+  return ctrl;
+}
+
+// realize_virt_arrays callback. Intentionally a no-op in this implementation.
+void RealizeVirtualArrays(j_common_ptr cinfo) {
+  // Nothing to do, the full arrays were realized at request time already.
+}
+
+// Returns a pointer to row `start_row` of the virtual array `ptr` after
+// validating the request:
+//   * `num_rows` must not exceed the array's maxaccess,
+//   * rows [start_row, start_row + num_rows) must lie inside the array,
+//   * the backing buffer must exist.
+// Violations are reported through JPEGLI_ERROR. `writable` is not used.
+//
+// The end of the requested range is computed in a 64-bit type: JDIMENSION is
+// 32 bits wide, so `start_row + num_rows` could otherwise wrap around and let
+// an out-of-range request pass the bounds check.
+template <typename Control, typename T>
+T** AccessVirtualArray(j_common_ptr cinfo, Control* ptr, JDIMENSION start_row,
+                       JDIMENSION num_rows, boolean writable) {
+  if (num_rows > ptr->maxaccess) {
+    JPEGLI_ERROR("Invalid virtual array access, num rows %u vs max rows %u",
+                 num_rows, ptr->maxaccess);
+  }
+  const unsigned long long end_row =
+      static_cast<unsigned long long>(start_row) + num_rows;
+  const unsigned long long total_rows = ptr->numrows;
+  if (end_row > total_rows) {
+    // Both values are passed as unsigned long long to match %llu; numrows is
+    // a size_t and must not be passed to %u.
+    JPEGLI_ERROR("Invalid virtual array access, %llu vs %llu total rows",
+                 end_row, total_rows);
+  }
+  if (ptr->full_buffer == nullptr) {
+    JPEGLI_ERROR("Invalid virtual array access, array not realized.");
+  }
+  return ptr->full_buffer + start_row;
+}
+
+// Drops the bookkeeping of pool `pool_id`: removes its bytes from the total
+// usage counter, resets the per-pool counter and forgets the recorded
+// pointers. The memory itself is not released here.
+void ClearPool(j_common_ptr cinfo, int pool_id) {
+  MemoryManager* mgr = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  uint64_t& pool_bytes = mgr->pool_memory_usage[pool_id];
+  mgr->total_memory_usage -= pool_bytes;
+  pool_bytes = 0;
+  mgr->owned_ptrs[pool_id].clear();
+}
+
+// free_pool callback: releases every block allocated from pool `pool_id`,
+// i.e. the malloc()-backed pool and its aligned counterpart at index
+// JPOOL_NUMPOOLS + pool_id, and resets the bookkeeping of both.
+void FreePool(j_common_ptr cinfo, int pool_id) {
+  MemoryManager* mgr = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  const bool known_pool = (pool_id >= 0 && pool_id < JPOOL_NUMPOOLS);
+  if (!known_pool) {
+    JPEGLI_ERROR("Invalid pool id %d", pool_id);
+  }
+  const int aligned_id = JPOOL_NUMPOOLS + pool_id;
+  const std::vector<void*>& plain_blocks = mgr->owned_ptrs[pool_id];
+  for (size_t i = 0; i < plain_blocks.size(); ++i) {
+    free(plain_blocks[i]);
+  }
+  ClearPool(cinfo, pool_id);
+  const std::vector<void*>& aligned_blocks = mgr->owned_ptrs[aligned_id];
+  for (size_t i = 0; i < aligned_blocks.size(); ++i) {
+    hwy::FreeAlignedBytes(aligned_blocks[i], nullptr, nullptr);
+  }
+  ClearPool(cinfo, aligned_id);
+}
+
+// self_destruct callback: frees all pools, deletes the manager object and
+// clears cinfo->mem.
+void SelfDestruct(j_common_ptr cinfo) {
+  MemoryManager* mgr = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  int pool_id = 0;
+  while (pool_id < JPOOL_NUMPOOLS) {
+    FreePool(cinfo, pool_id);
+    ++pool_id;
+  }
+  delete mgr;
+  cinfo->mem = nullptr;
+}
+
+}  // namespace
+
+// Creates jpegli's memory manager, installs its callbacks and stores it in
+// cinfo->mem. The object is released again by the self_destruct callback.
+void InitMemoryManager(j_common_ptr cinfo) {
+  // Value-initialize (note the parentheses): this zeroes every field of `pub`
+  // and all counters before the members are constructed, so fields of
+  // jpeg_memory_mgr that are not assigned below are never left indeterminate
+  // for callers that read them.
+  MemoryManager* mem = new MemoryManager();
+  mem->pub.alloc_small = jpegli::Alloc;
+  mem->pub.alloc_large = jpegli::Alloc;
+  mem->pub.alloc_sarray = jpegli::Alloc2dArray<JSAMPLE>;
+  mem->pub.alloc_barray = jpegli::Alloc2dArray<JBLOCK>;
+  mem->pub.request_virt_sarray =
+      jpegli::RequestVirtualArray<jvirt_sarray_control, JSAMPLE>;
+  mem->pub.request_virt_barray =
+      jpegli::RequestVirtualArray<jvirt_barray_control, JBLOCK>;
+  mem->pub.realize_virt_arrays = jpegli::RealizeVirtualArrays;
+  mem->pub.access_virt_sarray =
+      jpegli::AccessVirtualArray<jvirt_sarray_control, JSAMPLE>;
+  mem->pub.access_virt_barray =
+      jpegli::AccessVirtualArray<jvirt_barray_control, JBLOCK>;
+  mem->pub.free_pool = jpegli::FreePool;
+  mem->pub.self_destruct = jpegli::SelfDestruct;
+  // Alloc() treats a non-positive limit as "no limit".
+  mem->pub.max_memory_to_use = 0;
+  mem->total_memory_usage = 0;
+  mem->peak_memory_usage = 0;
+  memset(mem->pool_memory_usage, 0, sizeof(mem->pool_memory_usage));
+  cinfo->mem = reinterpret_cast<struct jpeg_memory_mgr*>(mem);
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/memory_manager.h b/third_party/jpeg-xl/lib/jpegli/memory_manager.h
new file mode 100644
index 0000000000..238f85a308
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/memory_manager.h
@@ -0,0 +1,40 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_MEMORY_MANAGER_H_
+#define LIB_JPEGLI_MEMORY_MANAGER_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stdlib.h>
+/* clang-format on */
+
+#define JPOOL_PERMANENT_ALIGNED (JPOOL_NUMPOOLS + JPOOL_PERMANENT)
+#define JPOOL_IMAGE_ALIGNED (JPOOL_NUMPOOLS + JPOOL_IMAGE)
+
+namespace jpegli {
+
+void InitMemoryManager(j_common_ptr cinfo);
+
+// Returns storage for `len` objects of type T, obtained from pool `pool_id`
+// through the alloc_small callback of cinfo->mem. The result is only cast to
+// T*; no constructor is run on the returned memory.
+template <typename T>
+T* Allocate(j_common_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  void* p = (*cinfo->mem->alloc_small)(cinfo, pool_id, len * sizeof(T));
+  return reinterpret_cast<T*>(p);
+}
+
+// Convenience overload for decompress objects: casts `cinfo` to j_common_ptr
+// and forwards to the j_common_ptr overload.
+template <typename T>
+T* Allocate(j_decompress_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  return Allocate<T>(reinterpret_cast<j_common_ptr>(cinfo), len, pool_id);
+}
+
+// Convenience overload for compress objects: casts `cinfo` to j_common_ptr
+// and forwards to the j_common_ptr overload.
+template <typename T>
+T* Allocate(j_compress_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  return Allocate<T>(reinterpret_cast<j_common_ptr>(cinfo), len, pool_id);
+}
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_MEMORY_MANAGER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc b/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc
new file mode 100644
index 0000000000..73db791727
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc
@@ -0,0 +1,219 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr size_t kInitialBufferSize = 1024;
+static constexpr size_t kFinalBufferSize = 18;
+
+// Destination manager backed by an in-memory buffer that is never emptied
+// from inside the encoder: empty_output_buffer() always returns FALSE, and
+// the tests drain the buffer explicitly with EmptyTo(). `pub` is the first
+// member; the tests store a pointer to this object in cinfo->dest.
+struct DestinationManager {
+  jpeg_destination_mgr pub;
+  std::vector<uint8_t> buffer;
+
+  // Installs the static callbacks below into `pub`.
+  DestinationManager() {
+    pub.init_destination = init_destination;
+    pub.empty_output_buffer = empty_output_buffer;
+    pub.term_destination = term_destination;
+  }
+
+  // Points the write cursor at the start of `buffer` and marks the whole
+  // buffer as free.
+  void Rewind() {
+    pub.next_output_byte = buffer.data();
+    pub.free_in_buffer = buffer.size();
+  }
+
+  // Appends the bytes written so far to `output`, optionally resizes the
+  // buffer to `new_size` (0 keeps the current size), then rewinds.
+  void EmptyTo(std::vector<uint8_t>* output, size_t new_size = 0) {
+    output->insert(output->end(), buffer.data(), pub.next_output_byte);
+    if (new_size > 0) {
+      buffer.resize(new_size);
+    }
+    Rewind();
+  }
+
+  // Sizes the buffer to kInitialBufferSize and rewinds.
+  static void init_destination(j_compress_ptr cinfo) {
+    auto us = reinterpret_cast<DestinationManager*>(cinfo->dest);
+    us->buffer.resize(kInitialBufferSize);
+    us->Rewind();
+  }
+
+  // Never makes room itself; returning FALSE leaves draining to the caller.
+  static boolean empty_output_buffer(j_compress_ptr cinfo) { return FALSE; }
+
+  // Nothing to flush: the tests collect the remaining bytes via EmptyTo().
+  static void term_destination(j_compress_ptr cinfo) {}
+};
+
+// Parameters of one output-suspension test case.
+struct TestConfig {
+  TestImage input;          // image to compress
+  CompressParams jparams;   // the tests read the sampling factors from here
+  size_t buffer_size;       // output buffer size used after each drain
+  size_t lines_batch_size;  // scanlines offered per batch (PixelData test)
+};
+
+// Fixture for the value-parameterized tests below; the parameter is a
+// TestConfig.
+class OutputSuspensionTestParam : public ::testing::TestWithParam<TestConfig> {
+};
+
+// Compresses generated pixels through jpegli_write_scanlines() with a
+// destination whose buffer is drained by the test whenever the encoder
+// accepts fewer lines than offered, then decodes the result with
+// DecodeWithLibjpeg() and compares it with the input.
+TEST_P(OutputSuspensionTestParam, PixelData) {
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  TestImage& input = config.input;
+  GeneratePixels(&input);
+  DestinationManager dest;
+  std::vector<uint8_t> compressed;
+  const auto try_catch_block = [&]() -> bool {
+    // NOTE(review): presumably makes this lambda return false when jpegli
+    // reports an error; confirm against the macro definition.
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    std::vector<uint8_t> row_bytes(config.lines_batch_size * stride);
+    while (cinfo.next_scanline < cinfo.image_height) {
+      // Copy the next batch of at most lines_batch_size rows into row_bytes.
+      size_t lines_left = cinfo.image_height - cinfo.next_scanline;
+      size_t num_lines = std::min(config.lines_batch_size, lines_left);
+      memcpy(&row_bytes[0], &input.pixels[cinfo.next_scanline * stride],
+             num_lines * stride);
+      std::vector<JSAMPROW> rows(num_lines);
+      for (size_t i = 0; i < num_lines; ++i) {
+        rows[i] = &row_bytes[i * stride];
+      }
+      // Keep offering the remaining rows of the batch; whenever not all of
+      // them were accepted, drain the output and continue with a buffer of
+      // config.buffer_size bytes.
+      size_t lines_done = 0;
+      while (lines_done < num_lines) {
+        lines_done += jpegli_write_scanlines(&cinfo, &rows[lines_done],
+                                             num_lines - lines_done);
+        if (lines_done < num_lines) {
+          dest.EmptyTo(&compressed, config.buffer_size);
+        }
+      }
+    }
+    // Switch to a buffer of kFinalBufferSize bytes before finishing, then
+    // collect whatever jpegli_finish_compress() wrote.
+    dest.EmptyTo(&compressed, kFinalBufferSize);
+    jpegli_finish_compress(&cinfo);
+    dest.EmptyTo(&compressed);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output);
+  VerifyOutputImage(input, output, 2.5);
+}
+
+// Compresses generated YCbCr raw data through jpegli_write_raw_data() with a
+// destination whose buffer is drained by the test whenever the call consumes
+// no lines, then decodes the result in raw-data mode with DecodeWithLibjpeg()
+// and compares it with the input. Only configurations with
+// lines_batch_size == 1 are run; the others return immediately.
+TEST_P(OutputSuspensionTestParam, RawData) {
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  if (config.lines_batch_size != 1) return;
+  TestImage& input = config.input;
+  input.color_space = JCS_YCbCr;
+  GeneratePixels(&input);
+  GenerateRawData(config.jparams, &input);
+  DestinationManager dest;
+  std::vector<uint8_t> compressed;
+  const auto try_catch_block = [&]() -> bool {
+    // NOTE(review): presumably makes this lambda return false when jpegli
+    // reports an error; confirm against the macro definition.
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = JCS_YCbCr;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    cinfo.raw_data_in = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    std::vector<std::vector<uint8_t>> raw_data = input.raw_data;
+    size_t max_lines = config.jparams.max_v_sample() * DCTSIZE;
+    std::vector<std::vector<JSAMPROW>> rowdata(cinfo.num_components);
+    std::vector<JSAMPARRAY> data(cinfo.num_components);
+    for (int c = 0; c < cinfo.num_components; ++c) {
+      rowdata[c].resize(config.jparams.v_samp(c) * DCTSIZE);
+      data[c] = &rowdata[c][0];
+    }
+    while (cinfo.next_scanline < cinfo.image_height) {
+      // Set up the row pointers of every component for the next batch; rows
+      // beyond the component's height are passed as nullptr.
+      for (int c = 0; c < cinfo.num_components; ++c) {
+        size_t cwidth = cinfo.comp_info[c].width_in_blocks * DCTSIZE;
+        size_t cheight = cinfo.comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = config.jparams.v_samp(c) * DCTSIZE;
+        size_t y0 = (cinfo.next_scanline / max_lines) * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              (y0 + i < cheight ? &raw_data[c][(y0 + i) * cwidth] : nullptr);
+        }
+      }
+      // A return value of 0 means nothing was consumed: drain the output,
+      // continue with a buffer of config.buffer_size bytes and retry.
+      while (jpegli_write_raw_data(&cinfo, &data[0], max_lines) == 0) {
+        dest.EmptyTo(&compressed, config.buffer_size);
+      }
+    }
+    // Switch to a buffer of kFinalBufferSize bytes before finishing, then
+    // collect whatever jpegli_finish_compress() wrote.
+    dest.EmptyTo(&compressed, kFinalBufferSize);
+    jpegli_finish_compress(&cinfo);
+    dest.EmptyTo(&compressed);
+    return true;
+  };
+  // Fail the test right away if compression reported an error, instead of
+  // decoding a partial stream (same handling as in the PixelData test).
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  DecompressParams dparams;
+  dparams.output_mode = RAW_DATA;
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), dparams, compressed, &output);
+  VerifyOutputImage(input, output, 3.5);
+}
+
+// Builds the cross product of test parameters: extra image rows x luma
+// vertical sampling factor x scanline batch size x output buffer size, for a
+// 1920 pixel wide image whose height is 1080 plus the extra rows.
+std::vector<TestConfig> GenerateTests() {
+  const size_t base_width = 1920;
+  const size_t base_height = 1080;
+  std::vector<TestConfig> configs;
+  for (int extra_rows : {0, 1, 8, 9}) {
+    for (int luma_v_samp : {1, 2}) {
+      for (int batch_lines : {1, 8, 117}) {
+        for (int out_buf_size : {1, 16, 16 << 10}) {
+          TestConfig cfg;
+          cfg.input.xsize = base_width;
+          cfg.input.ysize = base_height + extra_rows;
+          cfg.jparams.h_sampling = {1, 1, 1};
+          cfg.jparams.v_sampling = {luma_v_samp, 1, 1};
+          cfg.lines_batch_size = batch_lines;
+          cfg.buffer_size = out_buf_size;
+          configs.push_back(cfg);
+        }
+      }
+    }
+  }
+  return configs;
+}
+
+// Streams the configuration as: input image, compress params, then
+// "Lines<batch size>" and "BufSize<buffer size>".
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  os << "Lines" << c.lines_batch_size << "BufSize" << c.buffer_size;
+  return os;
+}
+
+// Produces the name of a test instance by streaming its TestConfig.
+std::string TestDescription(
+    const testing::TestParamInfo<OutputSuspensionTestParam::ParamType>& info) {
+  std::stringstream description;
+  description << info.param;
+  const std::string text = description.str();
+  return text;
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(OutputSuspensionTest, OutputSuspensionTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/quant.cc b/third_party/jpeg-xl/lib/jpegli/quant.cc
new file mode 100644
index 0000000000..3ab9bcf856
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/quant.cc
@@ -0,0 +1,748 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/quant.h"
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/adaptive_quantization.h"
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+
+namespace {
+
+// Global scale is chosen in a way that butteraugli 3-norm matches libjpeg
+// with the same quality setting. Fitted for quality 90 on jyrki31 corpus.
+constexpr float kGlobalScaleXYB = 1.43951668f;
+constexpr float kGlobalScaleYCbCr = 1.66986909f;
+
+static constexpr float kBaseQuantMatrixXYB[] = {
+    // c = 0
+    7.5629935265f,
+    19.8247814178f,
+    22.5724945068f,
+    20.6706695557f,
+    22.6864585876f,
+    23.5696277618f,
+    25.8129081726f,
+    36.3307571411f,
+    19.8247814178f,
+    21.5503177643f,
+    19.9372234344f,
+    20.5424213409f,
+    21.8645496368f,
+    23.9041385651f,
+    28.2844066620f,
+    32.6609764099f,
+    22.5724945068f,
+    19.9372234344f,
+    21.9017257690f,
+    19.1223449707f,
+    21.7515811920f,
+    24.6724700928f,
+    25.4249649048f,
+    32.6653823853f,
+    20.6706695557f,
+    20.5424213409f,
+    19.1223449707f,
+    20.1610221863f,
+    25.3719692230f,
+    25.9668903351f,
+    30.9804954529f,
+    31.3406009674f,
+    22.6864585876f,
+    21.8645496368f,
+    21.7515811920f,
+    25.3719692230f,
+    26.2431850433f,
+    40.5992202759f,
+    43.2624626160f,
+    63.3010940552f,
+    23.5696277618f,
+    23.9041385651f,
+    24.6724700928f,
+    25.9668903351f,
+    40.5992202759f,
+    48.3026771545f,
+    34.0964355469f,
+    61.9852142334f,
+    25.8129081726f,
+    28.2844066620f,
+    25.4249649048f,
+    30.9804954529f,
+    43.2624626160f,
+    34.0964355469f,
+    34.4937438965f,
+    66.9702758789f,
+    36.3307571411f,
+    32.6609764099f,
+    32.6653823853f,
+    31.3406009674f,
+    63.3010940552f,
+    61.9852142334f,
+    66.9702758789f,
+    39.9652709961f,
+    // c = 1
+    1.6262000799f,
+    3.2199242115f,
+    3.4903779030f,
+    3.9148359299f,
+    4.8337211609f,
+    4.9108843803f,
+    5.3137121201f,
+    6.1676793098f,
+    3.2199242115f,
+    3.4547898769f,
+    3.6036829948f,
+    4.2652835846f,
+    4.8368387222f,
+    4.8226222992f,
+    5.6120514870f,
+    6.3431472778f,
+    3.4903779030f,
+    3.6036829948f,
+    3.9044559002f,
+    4.3374395370f,
+    4.8435096741f,
+    5.4057979584f,
+    5.6066360474f,
+    6.1075134277f,
+    3.9148359299f,
+    4.2652835846f,
+    4.3374395370f,
+    4.6064834595f,
+    5.1751475334f,
+    5.4013924599f,
+    6.0399808884f,
+    6.7825231552f,
+    4.8337211609f,
+    4.8368387222f,
+    4.8435096741f,
+    5.1751475334f,
+    5.3748049736f,
+    6.1410837173f,
+    7.6529307365f,
+    7.5235214233f,
+    4.9108843803f,
+    4.8226222992f,
+    5.4057979584f,
+    5.4013924599f,
+    6.1410837173f,
+    6.3431472778f,
+    7.1083049774f,
+    7.6008300781f,
+    5.3137121201f,
+    5.6120514870f,
+    5.6066360474f,
+    6.0399808884f,
+    7.6529307365f,
+    7.1083049774f,
+    7.0943155289f,
+    7.0478363037f,
+    6.1676793098f,
+    6.3431472778f,
+    6.1075134277f,
+    6.7825231552f,
+    7.5235214233f,
+    7.6008300781f,
+    7.0478363037f,
+    6.9186143875f,
+    // c = 2
+    3.3038473129f,
+    10.0689258575f,
+    12.2785224915f,
+    14.6041173935f,
+    16.2107315063f,
+    19.2314529419f,
+    28.0129547119f,
+    55.6682891846f,
+    10.0689258575f,
+    11.4085016251f,
+    11.3871345520f,
+    15.4934167862f,
+    16.5364933014f,
+    14.9153423309f,
+    26.3748722076f,
+    40.8614425659f,
+    12.2785224915f,
+    11.3871345520f,
+    17.0886878967f,
+    13.9500350952f,
+    16.0003223419f,
+    28.5660629272f,
+    26.2124195099f,
+    30.1260128021f,
+    14.6041173935f,
+    15.4934167862f,
+    13.9500350952f,
+    21.1235027313f,
+    26.1579780579f,
+    25.5579223633f,
+    40.6859359741f,
+    33.8056335449f,
+    16.2107315063f,
+    16.5364933014f,
+    16.0003223419f,
+    26.1579780579f,
+    26.8042831421f,
+    26.1587715149f,
+    35.7343978882f,
+    43.6857032776f,
+    19.2314529419f,
+    14.9153423309f,
+    28.5660629272f,
+    25.5579223633f,
+    26.1587715149f,
+    34.5418128967f,
+    41.3197937012f,
+    48.7867660522f,
+    28.0129547119f,
+    26.3748722076f,
+    26.2124195099f,
+    40.6859359741f,
+    35.7343978882f,
+    41.3197937012f,
+    47.6329460144f,
+    55.3498458862f,
+    55.6682891846f,
+    40.8614425659f,
+    30.1260128021f,
+    33.8056335449f,
+    43.6857032776f,
+    48.7867660522f,
+    55.3498458862f,
+    63.6065597534f,
+};
+
+static const float kBaseQuantMatrixYCbCr[] = {
+    // c = 0
+    1.4076321125f,
+    2.6927082539f,
+    2.6927735806f,
+    2.9220938683f,
+    3.0870633125f,
+    3.4968640804f,
+    3.5730612278f,
+    3.5978596210f,
+    2.6927082539f,
+    2.6926636696f,
+    2.7195601463f,
+    2.9238407612f,
+    3.1882488728f,
+    3.0607142448f,
+    3.1882314682f,
+    3.8304426670f,
+    2.6927735806f,
+    2.7195601463f,
+    2.9532215595f,
+    3.5562388897f,
+    3.7088179588f,
+    3.0576279163f,
+    3.7443304062f,
+    4.2484717369f,
+    2.9220938683f,
+    2.9238407612f,
+    3.5562388897f,
+    3.0594384670f,
+    4.1780085564f,
+    4.9221563339f,
+    4.7842588425f,
+    4.6059336662f,
+    3.0870633125f,
+    3.1882488728f,
+    3.7088179588f,
+    4.1780085564f,
+    4.3475294113f,
+    5.5422372818f,
+    5.5741071701f,
+    5.4531836510f,
+    3.4968640804f,
+    3.0607142448f,
+    3.0576279163f,
+    4.9221563339f,
+    5.5422372818f,
+    5.4393601418f,
+    5.1039180756f,
+    6.0990614891f,
+    3.5730612278f,
+    3.1882314682f,
+    3.7443304062f,
+    4.7842588425f,
+    5.5741071701f,
+    5.1039180756f,
+    5.4144043922f,
+    5.4524297714f,
+    3.5978596210f,
+    3.8304426670f,
+    4.2484717369f,
+    4.6059336662f,
+    5.4531836510f,
+    6.0990614891f,
+    5.4524297714f,
+    4.3595433235f,
+    // c = 1
+    2.8152642250f,
+    10.4298934937f,
+    16.1451492310f,
+    15.3725156784f,
+    17.6543502808f,
+    19.1104965210f,
+    17.5021877289f,
+    29.5177459717f,
+    10.4298934937f,
+    15.7448558807f,
+    16.8441677094f,
+    15.3214502335f,
+    17.5918464661f,
+    16.8787574768f,
+    27.0867996216f,
+    21.3443832397f,
+    16.1451492310f,
+    16.8441677094f,
+    14.7525558472f,
+    18.0765247345f,
+    18.2206096649f,
+    23.2126445770f,
+    98.1291885376f,
+    23.6039886475f,
+    15.3725156784f,
+    15.3214502335f,
+    18.0765247345f,
+    17.2925109863f,
+    16.1435356140f,
+    24.0464611053f,
+    27.1577339172f,
+    35.3269882202f,
+    17.6543502808f,
+    17.5918464661f,
+    18.2206096649f,
+    16.1435356140f,
+    19.2819595337f,
+    16.2939300537f,
+    19.6862888336f,
+    51.0941123962f,
+    19.1104965210f,
+    16.8787574768f,
+    23.2126445770f,
+    24.0464611053f,
+    16.2939300537f,
+    32.3153648376f,
+    45.7272338867f,
+    64.6245880127f,
+    17.5021877289f,
+    27.0867996216f,
+    98.1291885376f,
+    27.1577339172f,
+    19.6862888336f,
+    45.7272338867f,
+    61.8331909180f,
+    85.0626754761f,
+    29.5177459717f,
+    21.3443832397f,
+    23.6039886475f,
+    35.3269882202f,
+    51.0941123962f,
+    64.6245880127f,
+    85.0626754761f,
+    112.7605514526f,
+    // c = 2
+    2.8152642250f,
+    5.4735932350f,
+    7.3637795448f,
+    6.5195322037f,
+    8.1501169205f,
+    8.7243938446f,
+    8.7219915390f,
+    9.3618907928f,
+    5.4735932350f,
+    7.1514792442f,
+    7.2054982185f,
+    8.1126995087f,
+    8.1497650146f,
+    7.1335659027f,
+    7.8453893661f,
+    8.3512821198f,
+    7.3637795448f,
+    7.2054982185f,
+    6.9224662781f,
+    8.0766754150f,
+    9.1168527603f,
+    7.3714752197f,
+    7.3646650314f,
+    8.6790895462f,
+    6.5195322037f,
+    8.1126995087f,
+    8.0766754150f,
+    7.8294739723f,
+    7.7385902405f,
+    7.8628563881f,
+    7.4404106140f,
+    8.4759435654f,
+    8.1501169205f,
+    8.1497650146f,
+    9.1168527603f,
+    7.7385902405f,
+    7.0960793495f,
+    8.9185447693f,
+    8.2047510147f,
+    7.8465061188f,
+    8.7243938446f,
+    7.1335659027f,
+    7.3714752197f,
+    7.8628563881f,
+    8.9185447693f,
+    8.6063842773f,
+    9.7156696320f,
+    64.6700744629f,
+    8.7219915390f,
+    7.8453893661f,
+    7.3646650314f,
+    7.4404106140f,
+    8.2047510147f,
+    9.7156696320f,
+    61.9934043884f,
+    83.2930450439f,
+    9.3618907928f,
+    8.3512821198f,
+    8.6790895462f,
+    8.4759435654f,
+    7.8465061188f,
+    64.6700744629f,
+    83.2930450439f,
+    113.0502548218f,
+};
+
+static const float k420GlobalScale = 1.2;
+static const float k420Rescale[64] = {
+    0.6386, 0.4213, 0.3994, 0.3333, 0.3143, 0.3367, 0.3612, 0.3794,  //
+    0.4213, 0.4026, 0.3309, 0.3344, 0.3059, 0.3118, 0.4069, 0.3595,  //
+    0.3994, 0.3309, 0.4080, 0.2531, 0.2645, 0.3630, 0.3502, 0.3231,  //
+    0.3333, 0.3344, 0.2531, 0.2960, 0.3153, 0.3476, 0.3430, 0.4004,  //
+    0.3143, 0.3059, 0.2645, 0.3153, 0.2733, 0.3296, 0.3338, 0.3418,  //
+    0.3367, 0.3118, 0.3630, 0.3476, 0.3296, 0.3144, 0.2262, 0.1326,  //
+    0.3612, 0.4069, 0.3502, 0.3430, 0.3338, 0.2262, 0.1000, 0.1000,  //
+    0.3794, 0.3595, 0.3231, 0.4004, 0.3418, 0.1326, 0.1000, 0.3366,  //
+};
+
+static const float kBaseQuantMatrixStd[] = {
+    // c = 0
+    16.0f, 11.0f, 10.0f, 16.0f, 24.0f, 40.0f, 51.0f, 61.0f,      //
+    12.0f, 12.0f, 14.0f, 19.0f, 26.0f, 58.0f, 60.0f, 55.0f,      //
+    14.0f, 13.0f, 16.0f, 24.0f, 40.0f, 57.0f, 69.0f, 56.0f,      //
+    14.0f, 17.0f, 22.0f, 29.0f, 51.0f, 87.0f, 80.0f, 62.0f,      //
+    18.0f, 22.0f, 37.0f, 56.0f, 68.0f, 109.0f, 103.0f, 77.0f,    //
+    24.0f, 35.0f, 55.0f, 64.0f, 81.0f, 104.0f, 113.0f, 92.0f,    //
+    49.0f, 64.0f, 78.0f, 87.0f, 103.0f, 121.0f, 120.0f, 101.0f,  //
+    72.0f, 92.0f, 95.0f, 98.0f, 112.0f, 100.0f, 103.0f, 99.0f,   //
+    // c = 1
+    17.0f, 18.0f, 24.0f, 47.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    18.0f, 21.0f, 26.0f, 66.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    24.0f, 26.0f, 56.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    47.0f, 66.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+};
+
+static const float kZeroBiasMulYCbCrLQ[] = {
+    // c = 0
+    0.6190f, 0.0568f, 0.3880f, 0.6190f, 0.6190f, 0.4490f, 0.4490f, 0.6187f,  //
+    0.0568f, 0.5829f, 0.6189f, 0.6190f, 0.6190f, 0.7190f, 0.6190f, 0.6189f,  //
+    0.3880f, 0.6189f, 0.6190f, 0.6190f, 0.6190f, 0.6190f, 0.6187f, 0.6100f,  //
+    0.6190f, 0.6190f, 0.6190f, 0.6190f, 0.5890f, 0.3839f, 0.7160f, 0.6190f,  //
+    0.6190f, 0.6190f, 0.6190f, 0.5890f, 0.6190f, 0.3880f, 0.5860f, 0.4790f,  //
+    0.4490f, 0.7190f, 0.6190f, 0.3839f, 0.3880f, 0.6190f, 0.6190f, 0.6190f,  //
+    0.4490f, 0.6190f, 0.6187f, 0.7160f, 0.5860f, 0.6190f, 0.6204f, 0.6190f,  //
+    0.6187f, 0.6189f, 0.6100f, 0.6190f, 0.4790f, 0.6190f, 0.6190f, 0.3480f,  //
+    // c = 1
+    0.9430f, 1.1640f, 0.9373f, 1.1319f, 0.8016f, 0.9136f, 1.1530f, 0.9430f,  //
+    1.1640f, 0.9188f, 0.9160f, 1.1980f, 1.1830f, 0.9758f, 0.9430f, 0.9430f,  //
+    0.9373f, 0.9160f, 0.8430f, 1.1720f, 0.7083f, 0.9430f, 0.9430f, 0.9430f,  //
+    1.1319f, 1.1980f, 1.1720f, 1.1490f, 0.8547f, 0.9430f, 0.9430f, 0.9430f,  //
+    0.8016f, 1.1830f, 0.7083f, 0.8547f, 0.9430f, 0.9430f, 0.9430f, 0.9430f,  //
+    0.9136f, 0.9758f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f,  //
+    1.1530f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9480f,  //
+    0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9480f, 0.9430f,  //
+    // c = 2
+    0.3060f, 1.3190f, 0.4308f, 0.4460f, 0.0661f, 0.0660f, 0.2660f, 0.2960f,  //
+    1.3190f, 0.3280f, 0.3093f, 0.0750f, 0.0505f, 0.1594f, 0.3060f, 0.2113f,  //
+    0.4308f, 0.3093f, 0.3060f, 0.1182f, 0.0500f, 0.3060f, 0.3915f, 0.2426f,  //
+    0.4460f, 0.0750f, 0.1182f, 0.0512f, 0.0500f, 0.2130f, 0.3930f, 0.1590f,  //
+    0.0661f, 0.0505f, 0.0500f, 0.0500f, 0.3055f, 0.3360f, 0.5148f, 0.5403f,  //
+    0.0660f, 0.1594f, 0.3060f, 0.2130f, 0.3360f, 0.5060f, 0.5874f, 0.3060f,  //
+    0.2660f, 0.3060f, 0.3915f, 0.3930f, 0.5148f, 0.5874f, 0.3060f, 0.3060f,  //
+    0.2960f, 0.2113f, 0.2426f, 0.1590f, 0.5403f, 0.3060f, 0.3060f, 0.3060f,  //
+};
+
+static const float kZeroBiasMulYCbCrHQ[] = {
+    // c = 0
+    0.7830f, 0.0044f, 0.2521f, 0.6547f, 0.8161f, 0.6130f, 0.8841f, 0.8155f,  //
+    0.0044f, 0.6831f, 0.6553f, 0.6295f, 0.7848f, 0.7843f, 0.8474f, 0.7836f,  //
+    0.2521f, 0.6553f, 0.7834f, 0.7829f, 0.8161f, 0.8072f, 0.7743f, 0.9242f,  //
+    0.6547f, 0.6295f, 0.7829f, 0.8654f, 0.7829f, 0.6986f, 0.7818f, 0.7726f,  //
+    0.8161f, 0.7848f, 0.8161f, 0.7829f, 0.7471f, 0.7827f, 0.7843f, 0.7653f,  //
+    0.6130f, 0.7843f, 0.8072f, 0.6986f, 0.7827f, 0.7848f, 0.9508f, 0.7653f,  //
+    0.8841f, 0.8474f, 0.7743f, 0.7818f, 0.7843f, 0.9508f, 0.7839f, 0.8437f,  //
+    0.8155f, 0.7836f, 0.9242f, 0.7726f, 0.7653f, 0.7653f, 0.8437f, 0.7819f,  //
+    // c = 1
+    1.0540f, 1.0816f, 1.0556f, 1.2876f, 1.1554f, 1.1567f, 1.8851f, 0.5488f,  //
+    1.0816f, 1.1537f, 1.1850f, 1.0712f, 1.1671f, 2.0719f, 1.0544f, 1.4764f,  //
+    1.0556f, 1.1850f, 1.2870f, 1.1981f, 1.8181f, 1.2618f, 1.0564f, 1.1191f,  //
+    1.2876f, 1.0712f, 1.1981f, 1.4753f, 2.0609f, 1.0564f, 1.2645f, 1.0564f,  //
+    1.1554f, 1.1671f, 1.8181f, 2.0609f, 0.7324f, 1.1163f, 0.8464f, 1.0564f,  //
+    1.1567f, 2.0719f, 1.2618f, 1.0564f, 1.1163f, 1.0040f, 1.0564f, 1.0564f,  //
+    1.8851f, 1.0544f, 1.0564f, 1.2645f, 0.8464f, 1.0564f, 1.0564f, 1.0564f,  //
+    0.5488f, 1.4764f, 1.1191f, 1.0564f, 1.0564f, 1.0564f, 1.0564f, 1.0564f,  //
+    // c = 2
+    0.6620f, 0.5392f, 0.6659f, 0.8968f, 0.6829f, 0.6328f, 0.5802f, 0.4836f,  //
+    0.5392f, 0.6746f, 0.6760f, 0.6102f, 0.6015f, 0.6958f, 0.7327f, 0.4897f,  //
+    0.6659f, 0.6760f, 0.6957f, 0.6543f, 0.4396f, 0.6330f, 0.7081f, 0.2583f,  //
+    0.8968f, 0.6102f, 0.6543f, 0.5913f, 0.6457f, 0.5828f, 0.5139f, 0.3565f,  //
+    0.6829f, 0.6015f, 0.4396f, 0.6457f, 0.5633f, 0.4263f, 0.6371f, 0.5949f,  //
+    0.6328f, 0.6958f, 0.6330f, 0.5828f, 0.4263f, 0.2847f, 0.2909f, 0.6629f,  //
+    0.5802f, 0.7327f, 0.7081f, 0.5139f, 0.6371f, 0.2909f, 0.6644f, 0.6644f,  //
+    0.4836f, 0.4897f, 0.2583f, 0.3565f, 0.5949f, 0.6629f, 0.6644f, 0.6644f,  //
+};
+
+static const float kZeroBiasOffsetYCbCr[] = {
+    0.59082f,
+    0.58146f,
+    0.57988f,
+};
+
+constexpr uint8_t kTransferFunctionPQ = 16;
+constexpr uint8_t kTransferFunctionHLG = 18;
+
+// Maps a distance value to a linear quality scale using a piecewise formula:
+//   distance <= 0.1        -> 1
+//   0.1 < distance <= 4.6  -> linear in distance (reaches 100 at 4.6)
+//   4.6 < distance <= 6.4  -> 5000 / (100 - (distance - 0.1) / 0.09)
+//   6.4 < distance < 25    -> 530000 / (3450 - 300 * sqrt(...))
+//   distance >= 25         -> 5000
+// Neighbouring branches agree at 4.6 (100) and at 6.4 (about 166.7).
+// NOTE(review): the denominator of the fourth branch approaches 0 as distance
+// approaches 25, so for distances from roughly 23.9 up to 25 the result is
+// larger than the 5000 returned at 25 and above. Confirm that callers clamp
+// the resulting values.
+float DistanceToLinearQuality(float distance) {
+  if (distance <= 0.1f) {
+    return 1.0f;
+  } else if (distance <= 4.6f) {
+    return (200.0f / 9.0f) * (distance - 0.1f);
+  } else if (distance <= 6.4f) {
+    return 5000.0f / (100.0f - (distance - 0.1f) / 0.09f);
+  } else if (distance < 25.0f) {
+    return 530000.0f /
+           (3450.0f -
+            300.0f * std::sqrt((848.0f * distance - 5330.0f) / 120.0f));
+  } else {
+    return 5000.0f;
+  }
+}
+
+constexpr float kExponent[DCTSIZE2] = {
+    1.00f, 0.51f, 0.67f, 0.74f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    0.51f, 0.66f, 0.69f, 0.87f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    0.67f, 0.69f, 0.84f, 0.83f, 0.96f, 1.00f, 1.00f, 1.00f,  //
+    0.74f, 0.87f, 0.83f, 1.00f, 1.00f, 0.91f, 0.91f, 1.00f,  //
+    1.00f, 1.00f, 0.96f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 0.91f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 0.91f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+};
+constexpr float kDist0 = 1.5f;  // distance where non-linearity kicks in.
+
+// Converts a distance to the quantization scale of coefficient `k`. Below
+// kDist0 the scale equals the distance; from kDist0 on it follows
+// kDist0^(1 - e) * distance^e with e = kExponent[k], but is never smaller
+// than half the distance.
+float DistanceToScale(float distance, int k) {
+  const bool in_linear_range = distance < kDist0;
+  if (!in_linear_range) {
+    const float exponent = kExponent[k];
+    const float factor = std::pow(kDist0, 1.0 - exponent);
+    return std::max<float>(0.5f * distance,
+                           factor * std::pow(distance, exponent));
+  }
+  return distance;
+}
+
+// Converts the quantization scale of coefficient `k` back to a distance.
+// Below kDist0 the distance equals the scale; from kDist0 on it follows
+// kDist0^(1 - e) * scale^e with e = 1 / kExponent[k], but is never larger
+// than twice the scale.
+float ScaleToDistance(float scale, int k) {
+  const bool in_linear_range = scale < kDist0;
+  if (!in_linear_range) {
+    const float inv_exponent = 1.0 / kExponent[k];
+    const float factor = std::pow(kDist0, 1.0 - inv_exponent);
+    return std::min<float>(2.0f * scale,
+                           factor * std::pow(scale, inv_exponent));
+  }
+  return scale;
+}
+
+// Estimates a distance from the quantization tables currently referenced by
+// the components of cinfo.
+//
+// Each table entry q is turned into an interval of distances, obtained by
+// mapping (q - 0.5) and (q + 0.5) through ScaleToDistance(); entries at the
+// limits (q <= 1 or q >= quant_max) leave the corresponding side open. A
+// running [dist_min, dist_max] range is narrowed with every interval that is
+// still compatible with it. The result is the midpoint of that range, or the
+// other bound when the lower bound is still 0 or the upper bound was never
+// lowered.
+float QuantValsToDistance(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  float global_scale = kGlobalScaleYCbCr;
+  if (m->cicp_transfer_function == kTransferFunctionPQ) {
+    global_scale *= .4f;
+  } else if (m->cicp_transfer_function == kTransferFunctionHLG) {
+    global_scale *= .5f;
+  }
+  const int quant_max = m->force_baseline ? 255 : 32767;
+  static const float kDistMax = 10000.0f;
+  float dist_min = 0.0f;
+  float dist_max = kDistMax;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    const uint16_t* quantval = cinfo->quant_tbl_ptrs[quant_idx]->quantval;
+    const float* base_qm = &kBaseQuantMatrixYCbCr[quant_idx * DCTSIZE2];
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      const float invq = 1.0f / base_qm[k] / global_scale;
+      const int qval = quantval[k];
+      // Lower end of the interval; open (0) for the smallest table value.
+      float lo = 0.0f;
+      if (qval > 1) {
+        lo = ScaleToDistance((qval - 0.5f) * invq, k);
+      }
+      // Upper end of the interval; open (kDistMax) for saturated values.
+      float hi = kDistMax;
+      if (qval < quant_max) {
+        hi = ScaleToDistance((qval + 0.5f) * invq, k);
+      }
+      // Only tighten a bound when it does not contradict the other one.
+      if (lo <= dist_max) {
+        dist_min = std::max(lo, dist_min);
+      }
+      if (hi >= dist_min) {
+        dist_max = std::min(dist_max, hi);
+      }
+    }
+  }
+  if (dist_min == 0) return dist_max;
+  if (dist_max == kDistMax) return dist_min;
+  return 0.5f * (dist_min + dist_max);
+}
+
+// Returns true when cinfo describes YCbCr data whose first component is
+// sampled 2x2 and whose second and third components are sampled 1x1.
+bool IsYUV420(j_compress_ptr cinfo) {
+  if (cinfo->jpeg_color_space != JCS_YCbCr) return false;
+  const jpeg_component_info* comps = cinfo->comp_info;
+  if (comps[0].h_samp_factor != 2 || comps[0].v_samp_factor != 2) {
+    return false;
+  }
+  for (int c = 1; c <= 2; ++c) {
+    if (comps[c].h_samp_factor != 1 || comps[c].v_samp_factor != 1) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+// Fills the quantization tables of cinfo from the per-table distances.
+//
+// The base matrices and scaling mode depend on the configuration:
+//  * xyb_mode with an RGB JPEG color space: the three XYB base matrices;
+//  * YCbCr without use_std_tables: the YCbCr base matrices, either with
+//    separate Cb and Cr tables (add_two_chroma_tables, which also points
+//    component 2 at table 2) or with the Cr matrix shared by both chroma
+//    components;
+//  * anything else: the two standard base matrices with the linear quality
+//    mapping of DistanceToLinearQuality().
+//
+// distances[i] is the distance for table i. Missing tables are allocated,
+// every value is clamped to [1, quant_max] (255 with force_baseline, 32767
+// otherwise) and sent_table is cleared on each table that is written.
+void SetQuantMatrices(j_compress_ptr cinfo, float distances[NUM_QUANT_TBLS],
+                      bool add_two_chroma_tables) {
+  jpeg_comp_master* m = cinfo->master;
+  const bool xyb = m->xyb_mode && cinfo->jpeg_color_space == JCS_RGB;
+  const bool is_yuv420 = IsYUV420(cinfo);
+
+  float global_scale;
+  bool non_linear_scaling = true;
+  const float* base_quant_matrix[NUM_QUANT_TBLS];
+  int num_base_tables;
+
+  if (xyb) {
+    global_scale = kGlobalScaleXYB;
+    num_base_tables = 3;
+    base_quant_matrix[0] = kBaseQuantMatrixXYB;
+    base_quant_matrix[1] = kBaseQuantMatrixXYB + DCTSIZE2;
+    base_quant_matrix[2] = kBaseQuantMatrixXYB + 2 * DCTSIZE2;
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr && !m->use_std_tables) {
+    global_scale = kGlobalScaleYCbCr;
+    if (m->cicp_transfer_function == kTransferFunctionPQ) {
+      global_scale *= .4f;
+    } else if (m->cicp_transfer_function == kTransferFunctionHLG) {
+      global_scale *= .5f;
+    }
+    if (is_yuv420) {
+      global_scale *= k420GlobalScale;
+    }
+    if (add_two_chroma_tables) {
+      cinfo->comp_info[2].quant_tbl_no = 2;
+      num_base_tables = 3;
+      base_quant_matrix[0] = kBaseQuantMatrixYCbCr;
+      base_quant_matrix[1] = kBaseQuantMatrixYCbCr + DCTSIZE2;
+      base_quant_matrix[2] = kBaseQuantMatrixYCbCr + 2 * DCTSIZE2;
+    } else {
+      num_base_tables = 2;
+      base_quant_matrix[0] = kBaseQuantMatrixYCbCr;
+      // Use the Cr table for both Cb and Cr.
+      base_quant_matrix[1] = kBaseQuantMatrixYCbCr + 2 * DCTSIZE2;
+    }
+  } else {
+    global_scale = 0.01f;
+    non_linear_scaling = false;
+    num_base_tables = 2;
+    base_quant_matrix[0] = kBaseQuantMatrixStd;
+    base_quant_matrix[1] = kBaseQuantMatrixStd + DCTSIZE2;
+  }
+
+  const int quant_max = m->force_baseline ? 255 : 32767;
+  const float quant_max_f = static_cast<float>(quant_max);
+  for (int quant_idx = 0; quant_idx < num_base_tables; ++quant_idx) {
+    const float* base_qm = base_quant_matrix[quant_idx];
+    JQUANT_TBL** qtable = &cinfo->quant_tbl_ptrs[quant_idx];
+    if (*qtable == nullptr) {
+      *qtable = jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      float scale = global_scale;
+      if (non_linear_scaling) {
+        scale *= DistanceToScale(distances[quant_idx], k);
+        if (is_yuv420 && quant_idx > 0) {
+          scale *= k420Rescale[k];
+        }
+      } else {
+        scale *= DistanceToLinearQuality(distances[quant_idx]);
+      }
+      // Clamp while the value is still a float: converting a float that is
+      // NaN or outside the range of int is undefined behavior, and very large
+      // distances can push the product beyond INT_MAX. With this ordering of
+      // std::min/std::max a NaN ends up as 1.
+      const float rounded = std::round(scale * base_qm[k]);
+      const float clamped = std::max(1.0f, std::min(rounded, quant_max_f));
+      (*qtable)->quantval[k] = static_cast<int>(clamped);
+    }
+    (*qtable)->sent_table = FALSE;
+  }
+}
+
+// Prepares the per-coefficient quantizer state stored in cinfo->master.
+//
+// quant_mul[c][k] is set to 8 / quantval for every component; a missing table
+// or a zero table entry is reported through JPEGLI_ERROR. With adaptive
+// quantization enabled, the zero-bias multipliers and offsets default to 0.5
+// and, for YCbCr data, are replaced by a blend of the LQ and HQ tables that
+// is driven by the distance estimated from the quantization tables.
+void InitQuantizer(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  // Compute quantization multipliers from the quant table values.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_idx];
+    if (quant_table == nullptr) {
+      JPEGLI_ERROR("Missing quantization table %d for component %d", quant_idx,
+                   c);
+    }
+    for (size_t k = 0; k < DCTSIZE2; k++) {
+      const int val = quant_table->quantval[k];
+      if (val == 0) {
+        JPEGLI_ERROR("Invalid quantval 0.");
+      }
+      m->quant_mul[c][k] = 8.0f / val;
+    }
+  }
+  if (!m->use_adaptive_quantization) {
+    return;
+  }
+  // Neutral defaults, kept for every color space other than YCbCr.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      m->zero_bias_mul[c][k] = 0.5f;
+      m->zero_bias_offset[c][k] = 0.5f;
+    }
+  }
+  if (cinfo->jpeg_color_space != JCS_YCbCr) {
+    return;
+  }
+  // Blend weight of the LQ table: 0 at kDistHQ and below, 1 at kDistLQ and
+  // above, linear in between.
+  static const float kDistHQ = 1.0f;
+  static const float kDistLQ = 3.0f;
+  const float distance = QuantValsToDistance(cinfo);
+  float weight_lq = (distance - kDistHQ) / (kDistLQ - kDistHQ);
+  weight_lq = std::max(0.0f, std::min(1.0f, weight_lq));
+  const float weight_hq = 1.0f - weight_lq;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      const float mul_lq = kZeroBiasMulYCbCrLQ[c * DCTSIZE2 + k];
+      const float mul_hq = kZeroBiasMulYCbCrHQ[c * DCTSIZE2 + k];
+      m->zero_bias_mul[c][k] = weight_lq * mul_lq + weight_hq * mul_hq;
+      m->zero_bias_offset[c][k] = kZeroBiasOffsetYCbCr[c];
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/quant.h b/third_party/jpeg-xl/lib/jpegli/quant.h
new file mode 100644
index 0000000000..44deb48d45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/quant.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_QUANT_H_
+#define LIB_JPEGLI_QUANT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+// Writes the quantization tables of cinfo, scaled according to distances[i]
+// for table i. When add_two_chroma_tables is set (and the custom YCbCr
+// matrices are in use) Cb and Cr get separate tables.
+void SetQuantMatrices(j_compress_ptr cinfo, float distances[NUM_QUANT_TBLS],
+                      bool add_two_chroma_tables);
+
+// Derives the quantization multipliers and, with adaptive quantization, the
+// zero-bias tables of cinfo->master from the current quantization tables.
+void InitQuantizer(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_QUANT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/render.cc b/third_party/jpeg-xl/lib/jpegli/render.cc
new file mode 100644
index 0000000000..026345552a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/render.cc
@@ -0,0 +1,802 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/render.h"
+
+#include <string.h>
+
+#include <array>
+#include <atomic>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hwy/aligned_allocator.h>
+
+#include "lib/jpegli/color_quantize.h"
+#include "lib/jpegli/color_transform.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/idct.h"
+#include "lib/jpegli/upsample.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#if JXL_MEMORY_SANITIZER
+#include "sanitizer/msan_interface.h"
+#endif
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/render.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+using hwy::HWY_NAMESPACE::Vec;
+// Full-width vector tags for float and int32_t lanes, with one shared
+// instance of each; GatherBlockStats() uses them.
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+constexpr D d;
+constexpr DI di;
+
+// Accumulates per-coefficient statistics over coeffs_size coefficients.
+// For each coefficient position k (index modulo DCTSIZE2), nonzeros[k] is
+// incremented for every non-zero value and sumabs[k] grows by the absolute
+// value. Coefficients are consumed one full int32 vector at a time.
+void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
+                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
+                      int32_t* JXL_RESTRICT sumabs) {
+  const Rebind<int16_t, DI> di16;
+  const auto zero = Zero(di);
+  const auto one = Set(di, 1);
+  for (size_t pos = 0; pos < coeffs_size; pos += Lanes(d)) {
+    const size_t k = pos % DCTSIZE2;
+    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + pos));
+    const auto magnitude = Abs(coeff);
+    const auto increment = IfThenElseZero(Gt(magnitude, zero), one);
+    Store(Add(increment, Load(di, nonzeros + k)), di, nonzeros + k);
+    Store(Add(magnitude, Load(di, sumabs + k)), di, sumabs + k);
+  }
+}
+
+// Adds 128 / 255 to every sample of `row`, in place. Whole vectors of at most
+// 8 lanes are processed, so the row may be touched up to the next multiple of
+// the vector length past xsize.
+void DecenterRow(float* row, size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  const size_t step = Lanes(df);
+  const auto offset = Set(df, 128.0f / 255);
+  for (size_t x = 0; x < xsize; x += step) {
+    float* chunk = row + x;
+    Store(Add(Load(df, chunk), offset), df, chunk);
+  }
+}
+
+// Adds the dither pattern of component c to the first xsize samples of `row`.
+// The pattern row is chosen by y and the pattern column by x, both wrapped
+// with dither_mask_. Does nothing when the component has no pattern.
+void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y,
+               size_t xsize) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->dither_[c]) {
+    return;
+  }
+  const auto mask = m->dither_mask_;
+  const float* pattern_row = &m->dither_[c][(y & mask) * m->dither_size_];
+  for (size_t x = 0; x != xsize; ++x) {
+    row[x] += pattern_row[x & mask];
+  }
+}
+
+// Converts `len` samples per channel, starting at column x0 of every input
+// row, to unsigned integers of type T and writes them to `output` with the
+// channels interleaved. Each value is multiplied by `multiplier`, clamped to
+// [0, multiplier] and rounded with NearestInt. 1 to 4 channels are handled;
+// any other channel count writes nothing.
+//
+// Whole vectors are processed, so when len is not a multiple of Lanes(d) the
+// last iteration reads past x0 + len in the inputs and writes past
+// num_channels * len in the output.
+template <typename T>
+void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len,
+                      size_t num_channels, float multiplier, T* output) {
+  const HWY_CAPPED(float, 8) d;
+  auto zero = Zero(d);
+  auto mul = Set(d, multiplier);
+  const Rebind<T, decltype(d)> du;
+#if JXL_MEMORY_SANITIZER
+  // The vector loads below touch the input padding after `len`; mark it as
+  // initialized so MSan does not report those reads.
+  const size_t padding = hwy::RoundUpTo(len, Lanes(d)) - len;
+  for (size_t c = 0; c < num_channels; ++c) {
+    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
+  }
+#endif
+  if (num_channels == 1) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+    }
+  } else if (num_channels == 2) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+    }
+  } else if (num_channels == 3) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
+      StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)),
+                        DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+    }
+  } else if (num_channels == 4) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
+      auto v3 = Clamp(zero, Mul(LoadU(d, &input[3][x0 + i]), mul), mul);
+      StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)),
+                        DemoteTo(du, NearestInt(v2)),
+                        DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+    }
+  }
+#if JXL_MEMORY_SANITIZER
+  // Output elements written beyond the requested length were derived from
+  // padding; poison them again so that later reads of them are flagged.
+  __msan_poison(output + num_channels * len,
+                sizeof(output[0]) * num_channels * padding);
+#endif
+}
+
+// Copies `len` float samples per channel, starting at column x0 of every
+// input row, to `output` with the channels interleaved. A single channel is
+// copied with memcpy; 2 to 4 channels are interleaved one vector at a time.
+// Other channel counts write nothing.
+void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
+                   size_t num_channels, float* output) {
+  const HWY_CAPPED(float, 8) d;
+  switch (num_channels) {
+    case 1:
+      memcpy(output, input[0] + x0, len * sizeof(output[0]));
+      break;
+    case 2:
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        const size_t src = x0 + i;
+        StoreInterleaved2(LoadU(d, &input[0][src]), LoadU(d, &input[1][src]),
+                          d, &output[2 * i]);
+      }
+      break;
+    case 3:
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        const size_t src = x0 + i;
+        StoreInterleaved3(LoadU(d, &input[0][src]), LoadU(d, &input[1][src]),
+                          LoadU(d, &input[2][src]), d, &output[3 * i]);
+      }
+      break;
+    case 4:
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        const size_t src = x0 + i;
+        StoreInterleaved4(LoadU(d, &input[0][src]), LoadU(d, &input[1][src]),
+                          LoadU(d, &input[2][src]), LoadU(d, &input[3][src]),
+                          d, &output[4 * i]);
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+// Error-diffusion weights used by the JDITHER_FS path of WriteToOutput().
+// MR is applied to the next pixel of the current row; BL, BM and BR are
+// applied to the previous, same and next column of the following row.
+static constexpr float kFSWeightMR = 7.0f / 16.0f;
+static constexpr float kFSWeightBL = 3.0f / 16.0f;
+static constexpr float kFSWeightBM = 5.0f / 16.0f;
+static constexpr float kFSWeightBR = 1.0f / 16.0f;
+
+// Compresses the magnitude of a diffused error while keeping its sign:
+// magnitudes up to 16 pass through, magnitudes in (16, 48] are mapped to
+// 0.5 * |error| + 8, and anything larger is limited to 32.
+float LimitError(float error) {
+  const float magnitude = std::abs(error);
+  float limited = magnitude;
+  if (magnitude > 48.0f) {
+    limited = 32.0f;
+  } else if (magnitude > 16.0f) {
+    limited = 0.5f * magnitude + 8.0f;
+  }
+  return (error > 0.0f) ? limited : -limited;
+}
+
+// Converts one row of rendered float samples into the caller's output row.
+//
+// rows[c] is the float row of channel c; the `len` output pixels start at
+// column `xoffset` of those rows. Depending on the decoder state the row is
+//  * color-quantized to palette indices (quantize_colors with quant_pass_
+//    equal to 1), optionally with ordered or Floyd-Steinberg dithering;
+//  * stored as interleaved uint8 or uint16 samples, or as interleaved floats,
+//    with an optional byte swap for the 16-bit and float formats.
+// All conversions go through m->output_scratch_ before reaching `output`.
+void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
+                   size_t xoffset, size_t len, size_t num_channels,
+                   uint8_t* JXL_RESTRICT output) {
+  jpeg_decomp_master* m = cinfo->master;
+  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
+  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
+    float* error_row[kMaxComponents];
+    float* next_error_row[kMaxComponents];
+    if (cinfo->dither_mode == JDITHER_ORDERED) {
+      for (size_t c = 0; c < num_channels; ++c) {
+        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
+                  cinfo->output_width);
+      }
+    } else if (cinfo->dither_mode == JDITHER_FS) {
+      // Two error rows per channel are used alternately: the one for the
+      // current scanline and the one collecting errors for the next scanline.
+      for (size_t c = 0; c < num_channels; ++c) {
+        if (cinfo->output_scanline % 2 == 0) {
+          error_row[c] = m->error_row_[c];
+          next_error_row[c] = m->error_row_[c + kMaxComponents];
+        } else {
+          error_row[c] = m->error_row_[c + kMaxComponents];
+          next_error_row[c] = m->error_row_[c];
+        }
+        memset(next_error_row[c], 0, cinfo->output_width * sizeof(float));
+      }
+    }
+    const float mul = 255.0f;
+    if (cinfo->dither_mode != JDITHER_FS) {
+      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
+    }
+    for (size_t i = 0; i < len; ++i) {
+      uint8_t* pixel = &scratch_space[num_channels * i];
+      if (cinfo->dither_mode == JDITHER_FS) {
+        for (size_t c = 0; c < num_channels; ++c) {
+          // Output pixel i lives at column xoffset + i of the input rows,
+          // exactly as in the StoreUnsignedRow() and DitherRow() paths; the
+          // error rows are indexed by output column.
+          float val =
+              rows[c][xoffset + i] * mul + LimitError(error_row[c][i]);
+          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
+        }
+      }
+      int index = LookupColorIndex(cinfo, pixel);
+      output[i] = index;
+      if (cinfo->dither_mode == JDITHER_FS) {
+        // Neighbor columns are clamped to the row, so border pixels fold the
+        // out-of-range share back onto themselves.
+        size_t prev_i = i > 0 ? i - 1 : 0;
+        size_t next_i = i + 1 < len ? i + 1 : len - 1;
+        for (size_t c = 0; c < num_channels; ++c) {
+          float error = pixel[c] - cinfo->colormap[c][index];
+          error_row[c][next_i] += kFSWeightMR * error;
+          next_error_row[c][prev_i] += kFSWeightBL * error;
+          next_error_row[c][i] += kFSWeightBM * error;
+          next_error_row[c][next_i] += kFSWeightBR * error;
+        }
+      }
+    }
+  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
+    const float mul = 255.0;
+    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
+    memcpy(output, scratch_space, len * num_channels);
+  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
+    const float mul = 65535.0;
+    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
+    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
+    if (m->swap_endianness_) {
+      const HWY_CAPPED(uint16_t, 8) du;
+      size_t output_len = len * num_channels;
+      for (size_t j = 0; j < output_len; j += Lanes(du)) {
+        auto v = LoadU(du, tmp + j);
+        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+        StoreU(vswap, du, tmp + j);
+      }
+    }
+    memcpy(output, tmp, len * num_channels * 2);
+  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
+    float* tmp = reinterpret_cast<float*>(scratch_space);
+    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
+    if (m->swap_endianness_) {
+      size_t output_len = len * num_channels;
+      for (size_t j = 0; j < output_len; ++j) {
+        tmp[j] = BSwapFloat(tmp[j]);
+      }
+    }
+    memcpy(output, tmp, len * num_channels * 4);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jpegli {
+
+// Register the per-target implementations defined above so that the wrappers
+// below can select one through HWY_DYNAMIC_DISPATCH.
+HWY_EXPORT(GatherBlockStats);
+HWY_EXPORT(WriteToOutput);
+HWY_EXPORT(DecenterRow);
+
+// Dispatching entry point: forwards to the GatherBlockStats implementation
+// selected by HWY_DYNAMIC_DISPATCH.
+void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
+                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
+                      int32_t* JXL_RESTRICT sumabs) {
+  HWY_DYNAMIC_DISPATCH(GatherBlockStats)
+  (coeffs, coeffs_size, nonzeros, sumabs);
+}
+
+// Dispatching entry point: forwards to the WriteToOutput implementation
+// selected by HWY_DYNAMIC_DISPATCH.
+void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
+                   size_t xoffset, size_t len, size_t num_channels,
+                   uint8_t* JXL_RESTRICT output) {
+  HWY_DYNAMIC_DISPATCH(WriteToOutput)
+  (cinfo, rows, xoffset, len, num_channels, output);
+}
+
+// Dispatching entry point: forwards to the DecenterRow implementation
+// selected by HWY_DYNAMIC_DISPATCH.
+void DecenterRow(float* row, size_t xsize) {
+  HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
+}
+
+// Padding for horizontal chroma upsampling. PrepareForOutput() adds both
+// amounts to the length of the upsampling scratch row.
+constexpr size_t kPaddingLeft = 64;
+constexpr size_t kPaddingRight = 64;
+
+// Dequantization biases are only used for components that are sampled at the
+// maximum horizontal and vertical factors of the image.
+bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) {
+  const auto& comp = cinfo->comp_info[ci];
+  if (comp.h_samp_factor != cinfo->max_h_samp_factor) {
+    return false;
+  }
+  return comp.v_samp_factor == cinfo->max_v_samp_factor;
+}
+
+// Computes the dequantization bias of every AC coefficient position from the
+// block statistics gathered so far (the DC entry, k == 0, is not touched).
+// Positions that never held a non-zero value get a bias of 0.5.
+//
+// Method and notation follow:
+// J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression"
+// Proceedings International Conference on Information Technology: Coding and
+// Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179.
+void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros,
+                                   const int* sumabs, float* biases) {
+  for (size_t k = 1; k < DCTSIZE2; ++k) {
+    if (nonzeros[k] != 0) {
+      // N, N1, N0 and S of the article.
+      const float total = num_blocks;
+      const float nonzero = nonzeros[k];
+      const float zero = num_blocks - nonzero;
+      const float abs_sum = sumabs[k];
+      // gamma from eq. 11; term_a and term_b only group sub-expressions.
+      const float term_a = 4.0 * abs_sum + 2.0 * total;
+      const float term_b = 4.0 * abs_sum - 2.0 * nonzero;
+      const float gamma =
+          (-1.0 * zero + std::sqrt(zero * zero * 1.0 + term_a * term_b)) /
+          term_a;
+      const float gamma_sq = gamma * gamma;
+      // Bias from eq. 5. The quantization multiplier Q factors out, so the
+      // bias applies directly to the quantized coefficient.
+      biases[k] = 0.5 * (((1.0 + gamma_sq) / (1.0 - gamma_sq)) +
+                         1.0 / std::log(gamma));
+    } else {
+      biases[k] = 0.5f;
+    }
+  }
+}
+
+// Positions, inside an 8x8 block, of the SAVED_COEFS coefficients that block
+// smoothing inspects and may predict; entry 0 is the DC coefficient.
+constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8,  16, 9,
+                                                2, 3, 10, 17, 24};
+
+// Returns true when the quantizer at every position listed in Q_POS is
+// non-zero.
+bool is_nonzero_quantizers(const JQUANT_TBL* qtable) {
+  for (const int pos : Q_POS) {
+    if (qtable->quantval[pos] == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Decides whether block smoothing can be applied during decompression and
+// latches the coefficient precision information that PredictSmooth() reads.
+//
+// Smoothing is ruled out for non-progressive images, when no coef_bits are
+// available, and as soon as a component lacks a quantization table, has a
+// zero quantizer at one of the Q_POS positions or has a negative coef_bits
+// entry for DC. Otherwise the result is true if any of the inspected AC
+// entries of coef_bits is non-zero.
+bool do_smoothing(j_decompress_ptr cinfo) {
+  if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) {
+    return false;
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  auto coef_bits_latch = m->coef_bits_latch;
+  auto prev_coef_bits_latch = m->prev_coef_bits_latch;
+  // Entries of the previous scan are only meaningful after the first scan.
+  const bool have_prev_scan = cinfo->input_scan_number > 1;
+
+  bool smoothing_useful = false;
+  for (int ci = 0; ci < cinfo->num_components; ci++) {
+    const JQUANT_TBL* qtable = cinfo->comp_info[ci].quant_table;
+    const int* cur_bits = cinfo->coef_bits[ci];
+    const int* prev_bits = cinfo->coef_bits[ci + cinfo->num_components];
+
+    // Return early if conditions for smoothing are not met
+    const bool usable = qtable != nullptr && is_nonzero_quantizers(qtable) &&
+                        cur_bits[0] >= 0;
+    if (!usable) {
+      return false;
+    }
+
+    coef_bits_latch[ci][0] = cur_bits[0];
+    for (int coefi = 1; coefi < SAVED_COEFS; coefi++) {
+      prev_coef_bits_latch[ci][coefi] = have_prev_scan ? prev_bits[coefi] : -1;
+      if (cur_bits[coefi] != 0) {
+        smoothing_useful = true;
+      }
+      coef_bits_latch[ci][coefi] = cur_bits[coefi];
+    }
+  }
+
+  return smoothing_useful;
+}
+
+// Builds, in cinfo->master->smoothing_scratch_, a smoothed copy of block
+// (bx, iy) of `component`.
+//
+// The block is copied verbatim and then selected low-frequency coefficients
+// (positions Q_POS[1..]) that are zero and whose coef_bits entry is non-zero
+// are replaced by predictions computed from the DC values of the 5x5 block
+// neighborhood; neighbor indices are clamped at the image border. When none
+// of the inspected AC entries of coef_bits is known (all are -1), all
+// SAVED_COEFS positions are considered and the DC value itself is replaced by
+// a smoothed one; otherwise only the first six positions are considered.
+//
+// `blocks` is indexed relative to the current iMCU row, so it must also give
+// access to the neighboring block rows that are read here.
+void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component,
+                   size_t bx, int iy) {
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  int16_t* scratch = cinfo->master->smoothing_scratch_;
+  // Fixed-size and zero-initialized; this function runs once per block, so a
+  // heap-allocated container would cost an allocation for every block.
+  std::array<int, SAVED_COEFS> Q_VAL{};
+  int* coef_bits;
+
+  std::array<std::array<int, 5>, 5> dc_values;
+  auto& compinfo = cinfo->comp_info[component];
+  const size_t by0 = imcu_row * compinfo.v_samp_factor;
+  const size_t by = by0 + iy;
+
+  int prev_iy = by > 0 ? iy - 1 : 0;
+  int prev_prev_iy = by > 1 ? iy - 2 : prev_iy;
+  int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy;
+  int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy;
+
+  const int16_t* cur_row = blocks[iy][bx];
+  const int16_t* prev_row = blocks[prev_iy][bx];
+  const int16_t* prev_prev_row = blocks[prev_prev_iy][bx];
+  const int16_t* next_row = blocks[next_iy][bx];
+  const int16_t* next_next_row = blocks[next_next_iy][bx];
+
+  // Offsets, in coefficients, from block bx to its horizontal neighbors.
+  int prev_block_ind = bx ? -DCTSIZE2 : 0;
+  int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind;
+  int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0;
+  int next_next_block_ind =
+      bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind;
+
+  std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row,
+                                            next_row, next_next_row};
+  std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0,
+                                   next_block_ind, next_next_block_ind};
+
+  memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0]));
+
+  for (int r = 0; r < 5; ++r) {
+    for (int c = 0; c < 5; ++c) {
+      dc_values[r][c] = row_ptrs[r][block_inds[c]];
+    }
+  }
+  // Get the correct coef_bits: In case of an incomplete scan, we use the
+  // prev coefficients.
+  if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) {
+    coef_bits = cinfo->master->prev_coef_bits_latch[component];
+  } else {
+    coef_bits = cinfo->master->coef_bits_latch[component];
+  }
+
+  bool change_dc = true;
+  for (int i = 1; i < SAVED_COEFS; i++) {
+    if (coef_bits[i] != -1) {
+      change_dc = false;
+      break;
+    }
+  }
+
+  // NOTE(review): do_smoothing() validates comp_info[].quant_table, whereas
+  // the quantizers used here come from cinfo->quant_tbl_ptrs; confirm that
+  // both always refer to the same table.
+  JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no];
+  for (size_t i = 0; i < 6; ++i) {
+    Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
+  }
+  if (change_dc) {
+    for (size_t i = 6; i < SAVED_COEFS; ++i) {
+      Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
+    }
+  }
+  auto calculate_dct_value = [&](int coef_index) {
+    int64_t num = 0;
+    int pred;
+    int Al;
+    // we use the symmetry of the smoothing matrices by transposing the 5x5 dc
+    // matrix in that case.
+    bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 ||
+                        coef_index == 9;
+    auto dc = [&](int i, int j) {
+      return swap_indices ? dc_values[j][i] : dc_values[i][j];
+    };
+    Al = coef_bits[coef_index];
+    switch (coef_index) {
+      case 0:
+        // set the DC
+        num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) -
+               2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) +
+               6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) +
+               152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) +
+               6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) -
+               2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) -
+               2 * dc(4, 4));
+        // special case: for the DC the dequantization is different
+        Al = 0;
+        break;
+      case 1:
+      case 2:
+        // set Q01 or Q10
+        num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) -
+                            3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) +
+                            3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) -
+                            38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) +
+                            13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) -
+                            dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4))
+                         : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) +
+                            7 * dc(2, 4)));
+        break;
+      case 3:
+      case 5:
+        // set Q02 or Q20
+        num = (change_dc
+                   ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) -
+                         5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) +
+                         2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2)
+                   : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) +
+                      13 * dc(3, 2) - dc(4, 2)));
+        break;
+      case 4:
+        // set Q11
+        num =
+            (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) -
+                             9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4)
+                       : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) -
+                          dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) -
+                          dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3)));
+        break;
+      case 6:
+      case 9:
+        // set Q03 or Q30
+        num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) -
+               dc(3, 3));
+        break;
+      case 7:
+      case 8:
+        // set Q12 and Q21
+        num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) -
+               dc(3, 3));
+        break;
+    }
+    num = Q_VAL[0] * num;
+    // Rounded division by 256 * Q, done on the magnitude; the magnitude is
+    // limited to (1 << Al) - 1 when Al is positive.
+    if (num >= 0) {
+      pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8);
+      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
+    } else {
+      pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8);
+      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
+      pred = -pred;
+    }
+    return static_cast<int16_t>(pred);
+  };
+
+  int loop_end = change_dc ? SAVED_COEFS : 6;
+  for (int i = 1; i < loop_end; ++i) {
+    if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) {
+      scratch[Q_POS[i]] = calculate_dct_value(i);
+    }
+  }
+  if (change_dc) {
+    scratch[0] = calculate_dct_value(0);
+  }
+}
+
+// Sets up the decoder state needed to produce output rows: allocates the row
+// buffers and scratch areas, decides whether block smoothing is applied,
+// resets the coefficient statistics and the output position, fills the
+// dequantization multipliers and selects the inverse and color transforms.
+void PrepareForOutput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  size_t iMCU_width = cinfo->max_h_samp_factor * m->min_scaled_dct_size;
+  size_t output_stride = m->iMCU_cols_ * iMCU_width;
+  // Per-component buffers for the IDCT output; each holds three iMCU rows
+  // worth of component rows.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const auto& comp = cinfo->comp_info[c];
+    size_t cheight = comp.v_samp_factor * m->scaled_dct_size[c];
+    m->raw_height_[c] = cinfo->total_iMCU_rows * cheight;
+    m->raw_output_[c].Allocate(cinfo, 3 * cheight, output_stride);
+  }
+  // The render buffers must cover both the coded and the output channels.
+  int num_all_components =
+      std::max(cinfo->out_color_components, cinfo->num_components);
+  for (int c = 0; c < num_all_components; ++c) {
+    m->render_output_[c].Allocate(cinfo, cinfo->max_v_samp_factor,
+                                  output_stride);
+  }
+  m->idct_scratch_ = Allocate<float>(cinfo, 5 * DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  m->upsample_scratch_ = Allocate<float>(
+      cinfo, output_stride + kPaddingLeft + kPaddingRight, JPOOL_IMAGE_ALIGNED);
+  // Scratch row for WriteToOutput(), sized for the widest output pixel format
+  // with the row length rounded up to HWY_ALIGNMENT.
+  size_t bytes_per_sample = jpegli_bytes_per_sample(m->output_data_type_);
+  size_t bytes_per_pixel = cinfo->out_color_components * bytes_per_sample;
+  size_t scratch_stride = RoundUpTo(output_stride, HWY_ALIGNMENT);
+  m->output_scratch_ = Allocate<uint8_t>(
+      cinfo, bytes_per_pixel * scratch_stride, JPOOL_IMAGE_ALIGNED);
+  m->smoothing_scratch_ =
+      Allocate<int16_t>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  // Smoothing needs both suitable image data and the caller's opt-in.
+  bool smoothing = do_smoothing(cinfo);
+  m->apply_smoothing = smoothing && cinfo->do_block_smoothing;
+  // Statistics and biases hold one entry per coefficient position and
+  // component, and start out zeroed.
+  size_t coeffs_per_block = cinfo->num_components * DCTSIZE2;
+  m->nonzeros_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  m->sumabs_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0]));
+  memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0]));
+  memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_));
+  m->biases_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0]));
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  // Dequantization multipliers are the table values scaled by 1 / (8 * 255).
+  const float kDequantScale = 1.0f / (8 * 255);
+  // The table is allocated (and zeroed) only once; an existing one is reused.
+  if (m->dequant_ == nullptr) {
+    m->dequant_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+    memset(m->dequant_, 0, coeffs_per_block * sizeof(float));
+  }
+  for (int c = 0; c < cinfo->num_components; c++) {
+    const auto& comp = cinfo->comp_info[c];
+    JQUANT_TBL* table = comp.quant_table;
+    // Components without a table keep whatever multipliers are already there.
+    if (table == nullptr) continue;
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale;
+    }
+  }
+  ChooseInverseTransform(cinfo);
+  ChooseColorTransform(cinfo);
+}
+
+// Decodes the iMCU row selected by cinfo->output_iMCU_row: for every component
+// it fetches the coefficient blocks, optionally updates the statistics used
+// for the dequantization biases, and runs the per-component inverse transform
+// into m->raw_output_[c]. cinfo->output_iMCU_row is not advanced here.
+void DecodeCurrentiMCURow(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  JBLOCKARRAY ba[kMaxComponents];
+  // First pass: obtain the block rows of this iMCU row for each component.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info* comp = &cinfo->comp_info[c];
+    int by0 = imcu_row * comp->v_samp_factor;
+    int block_rows_left = comp->height_in_blocks - by0;
+    // The last iMCU row may hold fewer than v_samp_factor block rows.
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    // In streaming mode the coefficient array is always accessed at row 0.
+    int offset = m->streaming_mode_ ? 0 : by0;
+    ba[c] = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
+        max_block_rows, false);
+  }
+  // Second pass: statistics update and inverse transform per component.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    // Offset of component c in the per-coefficient tables (nonzeros_, sumabs_,
+    // biases_, dequant_), each of which has DCTSIZE2 entries per component.
+    size_t k0 = c * DCTSIZE2;
+    auto& compinfo = cinfo->comp_info[c];
+    size_t block_row = imcu_row * compinfo.v_samp_factor;
+    if (ShouldApplyDequantBiases(cinfo, c)) {
+      // Update statistics for this iMCU row.
+      for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
+        size_t by = block_row + iy;
+        // Skip block rows that lie below the bottom of the component.
+        if (by >= compinfo.height_in_blocks) {
+          continue;
+        }
+        int16_t* JXL_RESTRICT coeffs = &ba[c][iy][0][0];
+        size_t num = compinfo.width_in_blocks * DCTSIZE2;
+        GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]);
+        m->num_processed_blocks_[c] += compinfo.width_in_blocks;
+      }
+      if (imcu_row % 4 == 3) {
+        // Re-compute optimal biases every few iMCU-rows.
+        ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c],
+                                      &m->nonzeros_[k0], &m->sumabs_[k0],
+                                      &m->biases_[k0]);
+      }
+    }
+    RowBuffer<float>* raw_out = &m->raw_output_[c];
+    for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
+      size_t by = block_row + iy;
+      if (by >= compinfo.height_in_blocks) {
+        continue;
+      }
+      // Each block produces dctsize x dctsize output samples.
+      size_t dctsize = m->scaled_dct_size[c];
+      int16_t* JXL_RESTRICT row_in = &ba[c][iy][0][0];
+      float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize);
+      for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) {
+        if (m->apply_smoothing) {
+          // With smoothing, the transform input is m->smoothing_scratch_
+          // instead of the stored block.
+          // NOTE(review): presumably PredictSmooth fills smoothing_scratch_
+          // for block (bx, iy) -- confirm against its definition.
+          PredictSmooth(cinfo, ba[c], c, bx, iy);
+          (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0],
+                                     &m->biases_[k0], m->idct_scratch_,
+                                     &row_out[bx * dctsize], raw_out->stride(),
+                                     dctsize);
+        } else {
+          (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0],
+                                     &m->biases_[k0], m->idct_scratch_,
+                                     &row_out[bx * dctsize], raw_out->stride(),
+                                     dctsize);
+        }
+      }
+      if (m->streaming_mode_) {
+        // Clear the coefficients of this block row once they were consumed.
+        memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK));
+      }
+    }
+  }
+}
+
+// Raw-data output step: decodes the current iMCU row and stores the samples of
+// every component, without upsampling or color conversion, into data[c].
+// Row i of data[c] receives row i of the component's part of this iMCU row.
+// Advances cinfo->output_iMCU_row and cinfo->output_scanline, and counts a
+// finished output pass once output_scanline reaches output_height.
+void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) {
+  jpegli::DecodeCurrentiMCURow(cinfo);
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const auto& compinfo = cinfo->comp_info[c];
+    const size_t row_width = compinfo.width_in_blocks * DCTSIZE;
+    const size_t total_rows = compinfo.height_in_blocks * DCTSIZE;
+    const size_t rows_per_imcu = compinfo.v_samp_factor * DCTSIZE;
+    const size_t first_row =
+        cinfo->output_iMCU_row * compinfo.v_samp_factor * DCTSIZE;
+    // The last iMCU row is clipped to the component height.
+    const size_t end_row = std::min(first_row + rows_per_imcu, total_rows);
+    for (size_t i = 0; first_row + i < end_row; ++i) {
+      float* sample_row = m->raw_output_[c].Row(first_row + i);
+      DecenterRow(sample_row, row_width);
+      float* rows[1] = {sample_row};
+      uint8_t* dst = data[c][i];
+      WriteToOutput(cinfo, rows, 0, row_width, 1, dst);
+    }
+  }
+  ++cinfo->output_iMCU_row;
+  cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE;
+  const bool pass_finished = cinfo->output_scanline >= cinfo->output_height;
+  if (pass_finished) {
+    ++m->output_passes_done_;
+  }
+}
+
+// Performs one step of scanline output. Depending on the decoder state a call
+// does exactly one of the following:
+//  * decodes the next iMCU row, horizontally upsamples the subsampled
+//    components in place and advances cinfo->output_iMCU_row; or
+//  * vertically upsamples, color-transforms and emits already decoded rows,
+//    advancing cinfo->output_scanline and *num_output_rows.
+// At most max_output_rows - *num_output_rows rows are emitted per call. If
+// scanlines is null, rows are still processed and counted but not written.
+void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
+                   JSAMPARRAY scanlines, size_t max_output_rows) {
+  jpeg_decomp_master* m = cinfo->master;
+  const int vfactor = cinfo->max_v_samp_factor;
+  const int hfactor = cinfo->max_h_samp_factor;
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  const size_t imcu_height = vfactor * m->min_scaled_dct_size;
+  const size_t imcu_width = hfactor * m->min_scaled_dct_size;
+  const size_t output_width = m->iMCU_cols_ * imcu_width;
+  // Emit rows if every iMCU row has been decoded, or if there are unemitted
+  // rows above the most recently decoded iMCU row.
+  // NOTE(review): presumably the one-iMCU-row lag exists so that vertical
+  // upsampling can read the row below -- confirm.
+  if (imcu_row == cinfo->total_iMCU_rows ||
+      (imcu_row > 1 && cinfo->output_scanline < (imcu_row - 1) * imcu_height)) {
+    // We are ready to output some scanlines.
+    size_t ybegin = cinfo->output_scanline;
+    size_t yend =
+        (imcu_row == cinfo->total_iMCU_rows ? cinfo->output_height
+                                            : (imcu_row - 1) * imcu_height);
+    // Do not emit more rows than the caller still has room for.
+    yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows);
+    // Rows are rendered in groups of vfactor; [yb, ye) is [ybegin, yend)
+    // widened to group boundaries.
+    size_t yb = (ybegin / vfactor) * vfactor;
+    size_t ye = DivCeil(yend, vfactor) * vfactor;
+    for (size_t y = yb; y < ye; y += vfactor) {
+      // Fill render_output_[c] rows 0..vfactor-1 for this group.
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        RowBuffer<float>* raw_out = &m->raw_output_[c];
+        RowBuffer<float>* render_out = &m->render_output_[c];
+        int line_groups = vfactor / m->v_factor[c];
+        size_t yc = y / m->v_factor[c];
+        for (int dy = 0; dy < line_groups; ++dy) {
+          if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
+            size_t ymid = yc + dy;
+            const float* JXL_RESTRICT row_mid = raw_out->Row(ymid);
+            // At the top and bottom edges the middle row stands in for the
+            // missing neighbor.
+            const float* JXL_RESTRICT row_top =
+                ymid == 0 ? row_mid : raw_out->Row(ymid - 1);
+            const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c]
+                                                    ? row_mid
+                                                    : raw_out->Row(ymid + 1);
+            Upsample2Vertical(row_top, row_mid, row_bot,
+                              render_out->Row(2 * dy),
+                              render_out->Row(2 * dy + 1), output_width);
+          } else {
+            // Plain replication: each raw row is copied v_factor[c] times.
+            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
+              size_t ymid = yc + dy;
+              memcpy(render_out->Row(m->v_factor[c] * dy + yix),
+                     raw_out->Row(ymid), raw_out->xsize() * sizeof(float));
+            }
+          }
+        }
+      }
+      for (int yix = 0; yix < vfactor; ++yix) {
+        // Skip rows of the group that fall outside the requested range.
+        if (y + yix < ybegin || y + yix >= yend) continue;
+        float* rows[kMaxComponents];
+        int num_all_components =
+            std::max(cinfo->out_color_components, cinfo->num_components);
+        for (int c = 0; c < num_all_components; ++c) {
+          rows[c] = m->render_output_[c].Row(yix);
+        }
+        (*m->color_transform)(rows, output_width);
+        for (int c = 0; c < cinfo->out_color_components; ++c) {
+          // Undo the centering of the sample values around zero.
+          DecenterRow(rows[c], output_width);
+        }
+        if (scanlines) {
+          uint8_t* output = scanlines[*num_output_rows];
+          WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width,
+                        cinfo->out_color_components, output);
+        }
+        JXL_ASSERT(cinfo->output_scanline == y + yix);
+        ++cinfo->output_scanline;
+        ++(*num_output_rows);
+        if (cinfo->output_scanline == cinfo->output_height) {
+          ++m->output_passes_done_;
+        }
+      }
+    }
+  } else {
+    // Nothing to emit yet: decode one more iMCU row and upsample horizontally.
+    DecodeCurrentiMCURow(cinfo);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      // Components at full horizontal resolution need no upsampling.
+      if (m->h_factor[c] == 1) continue;
+      const auto& compinfo = cinfo->comp_info[c];
+      RowBuffer<float>* raw_out = &m->raw_output_[c];
+      size_t cheight = compinfo.v_samp_factor * m->scaled_dct_size[c];
+      size_t y0 = imcu_row * cheight;
+      if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) {
+        for (size_t iy = 0; iy < cheight; ++iy) {
+          float* JXL_RESTRICT row = raw_out->Row(y0 + iy);
+          Upsample2Horizontal(row, m->upsample_scratch_, output_width);
+        }
+      } else {
+        for (size_t iy = 0; iy < cheight; ++iy) {
+          float* JXL_RESTRICT row = raw_out->Row(y0 + iy);
+          float* JXL_RESTRICT tmp = m->upsample_scratch_;
+          // TODO(szabadka) SIMDify this.
+          // Sample replication through a scratch row, then copy back in place.
+          for (size_t x = 0; x < output_width; ++x) {
+            tmp[x] = row[x / m->h_factor[c]];
+          }
+          memcpy(row, tmp, output_width * sizeof(tmp[0]));
+        }
+      }
+    }
+    ++cinfo->output_iMCU_row;
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/render.h b/third_party/jpeg-xl/lib/jpegli/render.h
new file mode 100644
index 0000000000..93b80d975a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/render.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_RENDER_H_
+#define LIB_JPEGLI_RENDER_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include <vector>
+
+namespace jpegli {
+
+// Sets up the decoder state in cinfo->master for producing output; among other
+// things it resets cinfo->output_iMCU_row and cinfo->output_scanline to zero.
+void PrepareForOutput(j_decompress_ptr cinfo);
+
+// Performs one step of scanline output: either decodes one more iMCU row or
+// writes already decoded rows to `scanlines`. *num_output_rows is incremented
+// for every row produced and serves as the index of the next row in
+// `scanlines`; it is not pushed beyond max_output_rows. `scanlines` may be
+// null, in which case rows are counted but not written.
+void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
+                   JSAMPARRAY scanlines, size_t max_output_rows);
+
+// Decodes the current iMCU row and writes the samples of each component c to
+// the rows of data[c], then advances to the next iMCU row.
+void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_RENDER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/simd.cc b/third_party/jpeg-xl/lib/jpegli/simd.cc
new file mode 100644
index 0000000000..5e84939342
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/simd.cc
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/simd.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/simd.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// Returns HWY_LANES(uint8_t) for the Highway target this namespace is compiled
+// for; VectorSize() below exposes the dispatched value.
+size_t GetVectorSize() { return HWY_LANES(uint8_t); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+namespace {
+
+HWY_EXPORT(GetVectorSize);  // Local function.
+
+}  // namespace
+
+// Returns the value of GetVectorSize() for the dynamically dispatched target.
+size_t VectorSize() {
+  // Dispatch only on the first call; later calls reuse the stored value.
+  static const size_t kNumBytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+  return kNumBytes;
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/simd.h b/third_party/jpeg-xl/lib/jpegli/simd.h
new file mode 100644
index 0000000000..aec772e2d4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/simd.h
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_SIMD_H_
+#define LIB_JPEGLI_SIMD_H_
+
+#include <stddef.h>
+
+namespace jpegli {
+
+// Returns SIMD vector size in bytes.
+size_t VectorSize();
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_SIMD_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/source_manager.cc b/third_party/jpeg-xl/lib/jpegli/source_manager.cc
new file mode 100644
index 0000000000..0b8e0a5c8c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/source_manager.cc
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+// No-op init_source callbacks. jpegli_mem_src() and jpegli_stdio_src() compare
+// cinfo->src->init_source against these functions to detect whether a
+// different kind of source manager is already installed.
+void init_mem_source(j_decompress_ptr cinfo) {}
+void init_stdio_source(j_decompress_ptr cinfo) {}
+
+// skip_input_data callback shared by the memory and stdio source managers.
+// Discards num_bytes bytes of input, refilling the buffer through
+// fill_input_buffer as often as needed. Non-positive counts are ignored. The
+// return value of fill_input_buffer is not inspected.
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+  if (num_bytes > 0) {
+    jpeg_source_mgr* src = cinfo->src;
+    for (;;) {
+      const long available = static_cast<long>(src->bytes_in_buffer);
+      if (num_bytes <= available) break;
+      // Consume the whole buffer and fetch the next one.
+      num_bytes -= src->bytes_in_buffer;
+      (*src->fill_input_buffer)(cinfo);
+    }
+    // The remainder fits into the current buffer.
+    src->next_input_byte += num_bytes;
+    src->bytes_in_buffer -= num_bytes;
+  }
+}
+
+// No-op term_source callback installed by both source managers in this file.
+void term_source(j_decompress_ptr cinfo) {}
+
+// fill_input_buffer callback for exhausted input: points the source at a
+// static two-byte sequence 0xff 0xd9 (a fake EOI marker) and returns TRUE.
+boolean EmitFakeEoiMarker(j_decompress_ptr cinfo) {
+  static constexpr uint8_t kEoi[] = {0xff, 0xd9};
+  jpeg_source_mgr* src = cinfo->src;
+  src->bytes_in_buffer = sizeof(kEoi);
+  src->next_input_byte = kEoi;
+  return TRUE;
+}
+
+// Number of bytes requested from the file per fill_input_buffer call (64 KiB).
+constexpr size_t kStdioBufferSize = 64 << 10;
+
+// Source manager that reads compressed data from a FILE*. `pub` must stay the
+// first member because cinfo->src is cast to and from this type.
+struct StdioSourceManager {
+  jpeg_source_mgr pub;
+  FILE* f;
+  uint8_t* buffer;
+
+  // Reads the next chunk of the file into `buffer`. When fread() delivers no
+  // bytes, a fake EOI marker is supplied instead.
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto* self = reinterpret_cast<StdioSourceManager*>(cinfo->src);
+    const size_t got = fread(self->buffer, 1, kStdioBufferSize, self->f);
+    if (got > 0) {
+      self->pub.next_input_byte = self->buffer;
+      self->pub.bytes_in_buffer = got;
+      return TRUE;
+    }
+    return EmitFakeEoiMarker(cinfo);
+  }
+};
+
+}  // namespace jpegli
+
+// Makes the decoder read compressed data from the memory range
+// [inbuffer, inbuffer + insize). A source manager struct is allocated on the
+// first call and reused afterwards; it is an error if a source manager of a
+// different kind is already installed.
+void jpegli_mem_src(j_decompress_ptr cinfo, const unsigned char* inbuffer,
+                    unsigned long insize) {
+  jpeg_source_mgr* src = cinfo->src;
+  if (src == nullptr) {
+    src = jpegli::Allocate<jpeg_source_mgr>(cinfo, 1);
+    cinfo->src = src;
+  } else if (src->init_source != jpegli::init_mem_source) {
+    JPEGLI_ERROR("jpegli_mem_src: a different source manager was already set");
+  }
+  // Callbacks.
+  cinfo->src->init_source = jpegli::init_mem_source;
+  cinfo->src->fill_input_buffer = jpegli::EmitFakeEoiMarker;
+  cinfo->src->skip_input_data = jpegli::skip_input_data;
+  cinfo->src->resync_to_restart = jpegli_resync_to_restart;
+  cinfo->src->term_source = jpegli::term_source;
+  // The whole input is available up front.
+  cinfo->src->next_input_byte = inbuffer;
+  cinfo->src->bytes_in_buffer = insize;
+}
+
+// Makes the decoder read compressed data from `infile`.
+//
+// The source manager and its read buffer are allocated on the first call only.
+// Later calls on the same decompress object (for example to decode another
+// file) reuse both instead of allocating a new kStdioBufferSize buffer each
+// time. It is an error if a source manager of a different kind is already
+// installed.
+void jpegli_stdio_src(j_decompress_ptr cinfo, FILE* infile) {
+  if (cinfo->src && cinfo->src->init_source != jpegli::init_stdio_source) {
+    JPEGLI_ERROR(
+        "jpegli_stdio_src: a different source manager was already set");
+  }
+  if (!cinfo->src) {
+    auto* new_src = jpegli::Allocate<jpegli::StdioSourceManager>(cinfo, 1);
+    new_src->buffer =
+        jpegli::Allocate<uint8_t>(cinfo, jpegli::kStdioBufferSize);
+    cinfo->src = reinterpret_cast<jpeg_source_mgr*>(new_src);
+  }
+  auto src = reinterpret_cast<jpegli::StdioSourceManager*>(cinfo->src);
+  src->f = infile;
+  // Start with an empty buffer so that the first read triggers
+  // fill_input_buffer.
+  src->pub.next_input_byte = src->buffer;
+  src->pub.bytes_in_buffer = 0;
+  src->pub.init_source = jpegli::init_stdio_source;
+  src->pub.fill_input_buffer = jpegli::StdioSourceManager::fill_input_buffer;
+  src->pub.skip_input_data = jpegli::skip_input_data;
+  src->pub.resync_to_restart = jpegli_resync_to_restart;
+  src->pub.term_source = jpegli::term_source;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc b/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc
new file mode 100644
index 0000000000..c8d1fbc053
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+// Decodes the image behind cinfo's already installed source manager into
+// `output`, one scanline per jpegli_read_scanlines() call, and finishes the
+// decompression.
+void ReadOutputImage(j_decompress_ptr cinfo, TestImage* output) {
+  jpegli_read_header(cinfo, /*require_image=*/TRUE);
+  jpegli_start_decompress(cinfo);
+  // Output dimensions are known after jpegli_start_decompress().
+  output->xsize = cinfo->output_width;
+  output->ysize = cinfo->output_height;
+  output->components = cinfo->num_components;
+  output->AllocatePixels();
+  const size_t bytes_per_row = cinfo->output_width * cinfo->num_components;
+  for (; cinfo->output_scanline < cinfo->output_height;) {
+    const size_t row_offset = cinfo->output_scanline * bytes_per_row;
+    JSAMPROW row = &output->pixels[row_offset];
+    jpegli_read_scanlines(cinfo, &row, 1);
+  }
+  jpegli_finish_decompress(cinfo);
+}
+
+// Parameters of one source manager test case.
+struct TestConfig {
+  // Name of the test data file, passed to GetTestDataPath()/ReadTestData().
+  std::string fn;
+  // Short label for the file, used when printing the test name.
+  std::string fn_desc;
+  // Decompression parameters. The tests in this file read only size_factor,
+  // the fraction of the compressed file that is kept before decoding.
+  DecompressParams dparams;
+};
+
+// Test fixture parameterized by TestConfig.
+class SourceManagerTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+// Decodes a (possibly truncated) JPEG file through jpegli_stdio_src() and
+// checks the result against libjpeg decoding the same bytes.
+//
+// For size_factor != 1 the truncated stream is written to a temporary file.
+// The test no longer returns right after creating that file, which previously
+// skipped the decoding and the verification for every truncated case. The
+// reference decode now uses the same truncated bytes, and the temporary file
+// is closed once decoding is done.
+TEST_P(SourceManagerTestParam, TestStdioSourceManager) {
+  TestConfig config = GetParam();
+  std::vector<uint8_t> compressed = ReadTestData(config.fn.c_str());
+  jxl::FileWrapper testfile(GetTestDataPath(config.fn), "rb");
+  FILE* src = nullptr;
+  // Owned by this test; stays null when the original file is read directly.
+  FILE* truncated_file = nullptr;
+  if (config.dparams.size_factor != 1.0) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+    truncated_file = tmpfile();
+    ASSERT_TRUE(truncated_file != nullptr);
+    const size_t num_written =
+        fwrite(compressed.data(), 1, compressed.size(), truncated_file);
+    if (num_written != compressed.size()) {
+      fclose(truncated_file);
+      FAIL() << "could not write the truncated test file";
+    }
+    rewind(truncated_file);
+    src = truncated_file;
+  } else {
+    src = testfile;
+  }
+  ASSERT_TRUE(src != nullptr);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_stdio_src(&cinfo, src);
+    ReadOutputImage(&cinfo, &output0);
+    return true;
+  };
+  const bool decoded = try_catch_block();
+  // The decoder is done reading; release the temporary file before asserting
+  // so that it is not leaked when decoding failed.
+  if (truncated_file != nullptr) {
+    fclose(truncated_file);
+  }
+  ASSERT_TRUE(decoded);
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output1);
+  VerifyOutputImage(output1, output0, 1.0f);
+}
+
+// Decodes a (possibly truncated) JPEG stream through jpegli_mem_src() and
+// checks the result against libjpeg decoding the same bytes.
+TEST_P(SourceManagerTestParam, TestMemSourceManager) {
+  TestConfig config = GetParam();
+  std::vector<uint8_t> compressed = ReadTestData(config.fn.c_str());
+  // Keep only the leading size_factor fraction of the compressed data.
+  if (config.dparams.size_factor < 1.0f) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  // The decoding runs inside a lambda that returns bool.
+  // NOTE(review): presumably ERROR_HANDLER_SETUP makes the lambda return false
+  // when jpegli reports an error -- confirm against the macro definition.
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, compressed.data(), compressed.size());
+    ReadOutputImage(&cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  // Reference decode of the same bytes.
+  TestImage output1;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output1);
+  VerifyOutputImage(output1, output0, 1.0f);
+}
+
+// Builds the test matrix: every test file combined with every truncation
+// factor, in file-major order.
+std::vector<TestConfig> GenerateTests() {
+  const std::vector<std::pair<std::string, std::string>> kFiles({
+      {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+      {"jxl/flower/flower.png.im_q85_420.jpg", "Q85YUV420"},
+      {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+  });
+  const float kSizeFactors[] = {0.1f, 0.33f, 0.5f, 0.75f};
+  std::vector<TestConfig> configs;
+  for (const auto& file : kFiles) {
+    for (float factor : kSizeFactors) {
+      TestConfig cfg;
+      cfg.fn = file.first;
+      cfg.fn_desc = file.second;
+      cfg.dparams.size_factor = factor;
+      configs.push_back(cfg);
+    }
+  }
+  return configs;
+}
+
+// Prints the file label, followed by "Partial<percent>p" for truncated inputs.
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.fn_desc;
+  if (!(c.dparams.size_factor < 1.0f)) {
+    return os;
+  }
+  const int percent = static_cast<int>(c.dparams.size_factor * 100);
+  os << "Partial" << percent << "p";
+  return os;
+}
+
+// Produces the test name suffix by streaming the test parameter into a string.
+std::string TestDescription(
+    const testing::TestParamInfo<SourceManagerTestParam::ParamType>& info) {
+  std::stringstream description;
+  description << info.param;
+  return description.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(SourceManagerTest, SourceManagerTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/streaming_test.cc b/third_party/jpeg-xl/lib/jpegli/streaming_test.cc
new file mode 100644
index 0000000000..4a8981ee6a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/streaming_test.cc
@@ -0,0 +1,233 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+
+namespace jpegli {
+namespace {
+
+// Minimal suspending source manager. It never produces data by itself:
+// fill_input_buffer() always returns FALSE and skip_input_data() does nothing.
+// `buffer` and the pub fields are filled from outside; in this file that is
+// done by DestinationManager::empty_output_buffer().
+struct SourceManager {
+  jpeg_source_mgr pub;
+  std::vector<uint8_t> buffer;
+
+  SourceManager() {
+    // Callbacks.
+    pub.init_source = init_source;
+    pub.fill_input_buffer = fill_input_buffer;
+    pub.skip_input_data = skip_input_data;
+    pub.resync_to_restart = jpegli_resync_to_restart;
+    pub.term_source = term_source;
+    // No input is available initially.
+    pub.bytes_in_buffer = 0;
+    pub.next_input_byte = nullptr;
+  }
+
+  static void init_source(j_decompress_ptr cinfo) {}
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) { return FALSE; }
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {}
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+// A destination manager that empties its output buffer into a SourceManager's
+// input buffer. The buffer size is kept short because empty_output_buffer() is
+// called only when the output buffer is full, and we want to update the decoder
+// input frequently to demonstrate that streaming works.
+static constexpr size_t kOutputBufferSize = 1024;
+struct DestinationManager {
+  // Must stay the first member: cinfo->dest is cast to and from this type.
+  jpeg_destination_mgr pub;
+  // Output buffer the compressor writes into.
+  std::vector<uint8_t> buffer;
+  // Source manager that receives the compressed bytes.
+  SourceManager* dest;
+
+  DestinationManager(SourceManager* src)
+      : buffer(kOutputBufferSize), dest(src) {
+    pub.next_output_byte = buffer.data();
+    pub.free_in_buffer = buffer.size();
+    pub.init_destination = init_destination;
+    pub.empty_output_buffer = empty_output_buffer;
+    pub.term_destination = term_destination;
+  }
+
+  static void init_destination(j_compress_ptr cinfo) {}
+
+  // Appends the bytes written to our buffer so far to the source manager's
+  // buffer, keeping the bytes the source has not consumed yet, then resets our
+  // buffer to empty. Note that in this function `src` is the destination
+  // manager (data comes from it) and `dst` is the source manager (data goes to
+  // it).
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto us = reinterpret_cast<DestinationManager*>(cinfo->dest);
+    jpeg_destination_mgr* src = &us->pub;
+    jpeg_source_mgr* dst = &us->dest->pub;
+    std::vector<uint8_t>& src_buf = us->buffer;
+    std::vector<uint8_t>& dst_buf = us->dest->buffer;
+    // Move the unconsumed tail to the front of dst_buf. This has to happen
+    // before the resize() below, because next_input_byte points into dst_buf.
+    if (dst->bytes_in_buffer > 0 && dst->bytes_in_buffer < dst_buf.size()) {
+      memmove(dst_buf.data(), dst->next_input_byte, dst->bytes_in_buffer);
+    }
+    // Number of bytes the compressor has written into our buffer.
+    size_t src_len = src_buf.size() - src->free_in_buffer;
+    dst_buf.resize(dst->bytes_in_buffer + src_len);
+    memcpy(&dst_buf[dst->bytes_in_buffer], src_buf.data(), src_len);
+    dst->next_input_byte = dst_buf.data();
+    dst->bytes_in_buffer = dst_buf.size();
+    // Our buffer is empty again.
+    src->next_output_byte = src_buf.data();
+    src->free_in_buffer = src_buf.size();
+    return true;
+  }
+
+  // Flushes whatever is left in the output buffer.
+  static void term_destination(j_compress_ptr cinfo) {
+    empty_output_buffer(cinfo);
+  }
+};
+
+// Parameters of one streaming test case.
+struct TestConfig {
+  // Input image; its pixels are generated inside the test.
+  TestImage input;
+  // Compression parameters; the test reads v_sampling[0] from them.
+  CompressParams jparams;
+};
+
+// Test fixture parameterized by TestConfig.
+class StreamingTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+// Connects a compressor's output to a decompressor's input, feeds the image to
+// the compressor one iMCU row at a time and reads decoded scanlines back while
+// compression is still in progress, comparing each batch of decoded rows with
+// the input.
+TEST_P(StreamingTestParam, TestStreaming) {
+  jpeg_decompress_struct dinfo = {};
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  TestImage& input = config.input;
+  TestImage output;
+  GeneratePixels(&input);
+  const auto try_catch_block = [&]() {
+    ERROR_HANDLER_SETUP(jpegli);
+    // Both objects share the compressor's error manager and client data.
+    dinfo.err = cinfo.err;
+    dinfo.client_data = cinfo.client_data;
+    // Create a pair of compressor and decompressor objects, where the
+    // compressor's output is connected to the decompressor's input.
+    jpegli_create_decompress(&dinfo);
+    jpegli_create_compress(&cinfo);
+    SourceManager src;
+    dinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    DestinationManager dest(&src);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = input.color_space;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    size_t iMCU_height = 8 * cinfo.max_v_samp_factor;
+    std::vector<uint8_t> row_bytes(iMCU_height * stride);
+    // yin: input rows fed to the compressor; yout: rows read from the decoder.
+    size_t yin = 0;
+    size_t yout = 0;
+    while (yin < cinfo.image_height) {
+      // Feed one iMCU row at a time to the compressor.
+      size_t lines_in = std::min(iMCU_height, cinfo.image_height - yin);
+      memcpy(&row_bytes[0], &input.pixels[yin * stride], lines_in * stride);
+      std::vector<JSAMPROW> rows_in(lines_in);
+      for (size_t i = 0; i < lines_in; ++i) {
+        rows_in[i] = &row_bytes[i * stride];
+      }
+      EXPECT_EQ(lines_in,
+                jpegli_write_scanlines(&cinfo, &rows_in[0], lines_in));
+      yin += lines_in;
+      if (yin == cinfo.image_height) {
+        jpegli_finish_compress(&cinfo);
+      }
+
+      // After the first iMCU row, we don't yet expect any output because the
+      // compressor delays processing to have context rows after the iMCU row.
+      if (yin < std::min<size_t>(2 * iMCU_height, cinfo.image_height)) {
+        continue;
+      }
+
+      // After two iMCU rows, the compressor has started emitting compressed
+      // data. We check here that at least the scan header was output, because
+      // we expect that the compressor's output buffer was filled at least once
+      // while emitting the first compressed iMCU row.
+      if (yin == std::min<size_t>(2 * iMCU_height, cinfo.image_height)) {
+        EXPECT_EQ(JPEG_REACHED_SOS,
+                  jpegli_read_header(&dinfo, /*require_image=*/TRUE));
+        output.xsize = dinfo.image_width;
+        output.ysize = dinfo.image_height;
+        output.components = dinfo.num_components;
+        EXPECT_EQ(output.xsize, input.xsize);
+        EXPECT_EQ(output.ysize, input.ysize);
+        EXPECT_EQ(output.components, input.components);
+        EXPECT_TRUE(jpegli_start_decompress(&dinfo));
+        output.pixels.resize(output.ysize * stride);
+        if (yin < cinfo.image_height) {
+          continue;
+        }
+      }
+
+      // After six iMCU rows, the compressor has emitted five iMCU rows of
+      // compressed data, of which we expect four full iMCU row of compressed
+      // data to be in the decoder's input buffer, but since the decoder also
+      // needs context rows for upsampling and smoothing, we don't expect any
+      // output to be ready yet.
+      // NOTE(review): the row counts in the comments around this check (six
+      // above, five below) do not match the 7 * iMCU_height threshold in the
+      // code -- confirm which one is intended.
+      if (yin < 7 * iMCU_height && yin < cinfo.image_height) {
+        continue;
+      }
+
+      // After five iMCU rows, we expect the decoder to have rendered the output
+      // with four iMCU rows of delay.
+      // TODO(szabadka) Reduce the processing delay in the decoder if possible.
+      // Once all input was fed, read every remaining row at once.
+      size_t lines_out =
+          (yin == cinfo.image_height ? cinfo.image_height - yout : iMCU_height);
+      std::vector<JSAMPROW> rows_out(lines_out);
+      for (size_t i = 0; i < lines_out; ++i) {
+        rows_out[i] =
+            reinterpret_cast<JSAMPLE*>(&output.pixels[(yout + i) * stride]);
+      }
+      EXPECT_EQ(lines_out,
+                jpegli_read_scanlines(&dinfo, &rows_out[0], lines_out));
+      VerifyOutputImage(input, output, yout, lines_out, 3.8f);
+      yout += lines_out;
+
+      if (yout == cinfo.image_height) {
+        EXPECT_TRUE(jpegli_finish_decompress(&dinfo));
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&dinfo);
+  jpegli_destroy_compress(&cinfo);
+}
+
+// Builds the test matrix: a 1920-wide image whose height is 1080 plus a small
+// number of extra rows, each combined with vertical sampling factor 1 and 2
+// for the first component.
+std::vector<TestConfig> GenerateTests() {
+  const size_t kBaseWidth = 1920;
+  const size_t kBaseHeight = 1080;
+  const int kExtraRows[] = {0, 1, 8, 9};
+  const int kVSampling[] = {1, 2};
+  std::vector<TestConfig> configs;
+  for (int extra_rows : kExtraRows) {
+    for (int v_sampling : kVSampling) {
+      TestConfig cfg;
+      cfg.input.xsize = kBaseWidth;
+      cfg.input.ysize = kBaseHeight + extra_rows;
+      cfg.jparams.h_sampling = {1, 1, 1};
+      cfg.jparams.v_sampling = {v_sampling, 1, 1};
+      configs.push_back(cfg);
+    }
+  }
+  return configs;
+}
+
+// Prints the input image description followed by the compression parameters.
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  return os << c.input << c.jparams;
+}
+
+// Produces the test name suffix by streaming the test parameter into a string.
+std::string TestDescription(
+    const testing::TestParamInfo<StreamingTestParam::ParamType>& info) {
+  std::stringstream description;
+  description << info.param;
+  return description.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(StreamingTest, StreamingTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/test_utils.cc b/third_party/jpeg-xl/lib/jpegli/test_utils.cc
new file mode 100644
index 0000000000..1ae5483d4a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/test_utils.cc
@@ -0,0 +1,1240 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/test_utils.h"
+
+#include <cmath>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+#if !defined(TEST_DATA_PATH)
+#include "tools/cpp/runfiles/runfiles.h"
+#endif
+
+namespace jpegli {
+
+#if defined(TEST_DATA_PATH)
+// Returns `filename` prefixed with the compile-time TEST_DATA_PATH directory.
+std::string GetTestDataPath(const std::string& filename) {
+  std::string full_path(TEST_DATA_PATH "/");
+  full_path += filename;
+  return full_path;
+}
+#else
+using bazel::tools::cpp::runfiles::Runfiles;
+// Runfiles handle created once during static initialization. Create() yields
+// a null pointer when the runfiles location cannot be determined.
+const std::unique_ptr<Runfiles> kRunfiles(Runfiles::Create(""));
+// Resolves `filename`, given relative to the package's testdata directory, to
+// an on-disk path through the Bazel runfiles lookup.
+std::string GetTestDataPath(const std::string& filename) {
+  // Abort with a diagnostic instead of dereferencing a null handle.
+  JXL_CHECK(kRunfiles != nullptr);
+  std::string root(JPEGXL_ROOT_PACKAGE "/testdata/");
+  return kRunfiles->Rlocation(root + filename);
+}
+#endif
+
+// Loads the test data file `filename` into memory. The resolved path is
+// logged to stderr, the byte count to stdout; an unreadable file aborts via
+// JXL_CHECK.
+std::vector<uint8_t> ReadTestData(const std::string& filename) {
+  const std::string resolved = GetTestDataPath(filename);
+  fprintf(stderr, "ReadTestData %s\n", resolved.c_str());
+  std::vector<uint8_t> contents;
+  JXL_CHECK(jxl::ReadFile(resolved, &contents));
+  const int num_bytes = static_cast<int>(contents.size());
+  printf("Test data %s is %d bytes long.\n", filename.c_str(), num_bytes);
+  return contents;
+}
+
+// Fills basic_table and quantval (DCTSIZE2 entries each).
+//
+// table_type 0 gives the ramp 1..DCTSIZE2; any other value gives a flat table
+// holding that value everywhere. quantval is basic_table scaled by
+// scale_factor percent (rounded), clamped to [1, 65535] and, unless add_raw is
+// set, further capped at 255 (force_baseline) or 32767.
+void CustomQuantTable::Generate() {
+  basic_table.resize(DCTSIZE2);
+  quantval.resize(DCTSIZE2);
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    basic_table[k] = (table_type == 0) ? k + 1 : table_type;
+  }
+  const unsigned int cap = force_baseline ? 255U : 32767U;
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    unsigned int q = (basic_table[k] * scale_factor + 50U) / 100U;
+    q = std::min(std::max(q, 1U), 65535U);
+    if (!add_raw) {
+      q = std::min(q, cap);
+    }
+    quantval[k] = q;
+  }
+}
+
+// Parses a binary PGM ("P5") or PPM ("P6") header at the parser's current
+// position.
+//
+// On success stores the image dimensions, the channel count (1 for P5, 3 for
+// P6) and the bit depth implied by maxval (which must be 2^bits - 1 for some
+// bits in 1..16), sets *pos to the first raster byte and returns true.
+// On failure prints a diagnostic to stderr and returns false.
+bool PNMParser::ParseHeader(const uint8_t** pos, size_t* xsize, size_t* ysize,
+                            size_t* num_channels, size_t* bitdepth) {
+  // Make sure both magic bytes exist before looking at them.
+  if (end_ - pos_ < 2 || pos_[0] != 'P' ||
+      (pos_[1] != '5' && pos_[1] != '6')) {
+    fprintf(stderr, "Invalid PNM header.\n");
+    return false;
+  }
+  *num_channels = (pos_[1] == '5' ? 1 : 3);
+  pos_ += 2;
+
+  size_t maxval;
+  if (!SkipWhitespace() || !ParseUnsigned(xsize) || !SkipWhitespace() ||
+      !ParseUnsigned(ysize) || !SkipWhitespace() || !ParseUnsigned(&maxval)) {
+    return false;
+  }
+  // Exactly ONE whitespace byte separates maxval from the raster. Skipping a
+  // whole run here would also swallow leading sample bytes whose value
+  // happens to be a whitespace code (e.g. 0x0A or 0x20).
+  if (pos_ == end_ || !IsWhitespace(*pos_)) {
+    fprintf(stderr, "Expected whitespace.\n");
+    return false;
+  }
+  ++pos_;
+  if (maxval == 0 || maxval >= 65536) {
+    fprintf(stderr, "Invalid maxval value.\n");
+    return false;
+  }
+  bool found_bitdepth = false;
+  for (int bits = 1; bits <= 16; ++bits) {
+    if (maxval == (1u << bits) - 1) {
+      *bitdepth = bits;
+      found_bitdepth = true;
+      break;
+    }
+  }
+  if (!found_bitdepth) {
+    fprintf(stderr, "Invalid maxval value.\n");
+    return false;
+  }
+
+  *pos = pos_;
+  return true;
+}
+
+// Reads a run of ASCII decimal digits at the current position into *number
+// and advances past it. Prints a diagnostic and returns false when the
+// current byte is not a digit or the input is exhausted.
+bool PNMParser::ParseUnsigned(size_t* number) {
+  const auto is_digit = [](uint8_t ch) { return ch >= '0' && ch <= '9'; };
+  if (pos_ == end_ || !is_digit(*pos_)) {
+    fprintf(stderr, "Expected unsigned number.\n");
+    return false;
+  }
+  size_t value = 0;
+  for (; pos_ < end_ && is_digit(*pos_); ++pos_) {
+    value = value * 10 + (*pos_ - '0');
+  }
+  *number = value;
+  return true;
+}
+
+// Consumes one or more whitespace bytes. Prints a diagnostic and returns
+// false when the current byte is not whitespace or the input is exhausted.
+bool PNMParser::SkipWhitespace() {
+  const bool at_whitespace = pos_ != end_ && IsWhitespace(*pos_);
+  if (!at_whitespace) {
+    fprintf(stderr, "Expected whitespace.\n");
+    return false;
+  }
+  do {
+    ++pos_;
+  } while (pos_ < end_ && IsWhitespace(*pos_));
+  return true;
+}
+
+// Decodes an in-memory binary PGM/PPM file.
+//
+// On success fills in the header fields, copies every byte that follows the
+// header into *pixels and returns true. Returns false, after printing a
+// diagnostic to stderr, when the header is invalid or the file holds fewer
+// raster bytes than the header announces.
+bool ReadPNM(const std::vector<uint8_t>& data, size_t* xsize, size_t* ysize,
+             size_t* num_channels, size_t* bitdepth,
+             std::vector<uint8_t>* pixels) {
+  if (data.size() < 2) {
+    fprintf(stderr, "PNM file too small.\n");
+    return false;
+  }
+  PNMParser parser(data.data(), data.size());
+  const uint8_t* pos = nullptr;
+  if (!parser.ParseHeader(&pos, xsize, ysize, num_channels, bitdepth)) {
+    return false;
+  }
+  const uint8_t* const end = data.data() + data.size();
+  const size_t available = end - pos;
+  // Samples wider than 8 bits occupy two bytes each.
+  const size_t bytes_per_sample = (*bitdepth > 8) ? 2 : 1;
+  const size_t expected = *xsize * *ysize * *num_channels * bytes_per_sample;
+  if (available < expected) {
+    // Callers index the raster using the header dimensions, so a short
+    // raster must be rejected here rather than read out of bounds later.
+    fprintf(stderr, "PNM file truncated.\n");
+    return false;
+  }
+  // assign() is well-defined for an empty raster, unlike &(*pixels)[0].
+  pixels->assign(pos, end);
+  return true;
+}
+
+// Maps a color space to the label used in test names; color spaces without a
+// label map to the empty string.
+std::string ColorSpaceName(J_COLOR_SPACE colorspace) {
+  const char* label = "";
+  if (colorspace == JCS_UNKNOWN) {
+    label = "UNKNOWN";
+  } else if (colorspace == JCS_GRAYSCALE) {
+    label = "GRAYSCALE";
+  } else if (colorspace == JCS_RGB) {
+    label = "RGB";
+  } else if (colorspace == JCS_YCbCr) {
+    label = "YCbCr";
+  } else if (colorspace == JCS_CMYK) {
+    label = "CMYK";
+  } else if (colorspace == JCS_YCCK) {
+    label = "YCCK";
+  }
+  return label;
+}
+
+// Label for a sample type / byte order pair as used in test names. UINT8
+// always yields the empty string; otherwise the type tag (if any) is followed
+// by "LE" or "BE" for an explicit byte order.
+std::string IOMethodName(JpegliDataType data_type,
+                         JpegliEndianness endianness) {
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    return std::string();
+  }
+  const char* type_tag = "";
+  if (data_type == JPEGLI_TYPE_UINT16) {
+    type_tag = "UINT16";
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    type_tag = "FLOAT";
+  }
+  const char* order_tag = "";
+  if (endianness == JPEGLI_LITTLE_ENDIAN) {
+    order_tag = "LE";
+  } else if (endianness == JPEGLI_BIG_ENDIAN) {
+    order_tag = "BE";
+  }
+  return std::string(type_tag) + order_tag;
+}
+
+// Builds the "SAMP<h>x<v>_<h>x<v>..." tag for the sampling factors, dropping
+// trailing 1x1 components (the first component is always kept). Returns the
+// empty string when no sampling factors are set.
+std::string SamplingId(const CompressParams& jparams) {
+  JXL_CHECK(jparams.h_sampling.size() == jparams.v_sampling.size());
+  const auto& hs = jparams.h_sampling;
+  const auto& vs = jparams.v_sampling;
+  if (hs.empty()) {
+    return std::string();
+  }
+  size_t count = hs.size();
+  while (count > 1 && hs[count - 1] == 1 && vs[count - 1] == 1) {
+    --count;
+  }
+  std::stringstream os;
+  os << "SAMP" << hs[0] << "x" << vs[0];
+  for (size_t i = 1; i < count; ++i) {
+    os << "_" << hs[i] << "x" << vs[i];
+  }
+  return os.str();
+}
+
+// Streams "<xsize>x<ysize>", the I/O method label, the input color space
+// (when it is not RGB) and, for JCS_UNKNOWN, the component count.
+std::ostream& operator<<(std::ostream& os, const TestImage& input) {
+  os << input.xsize << "x" << input.ysize
+     << IOMethodName(input.data_type, input.endianness);
+  const bool is_rgb = (input.color_space == JCS_RGB);
+  if (!is_rgb) {
+    os << "InputColor" << ColorSpaceName(input.color_space);
+    if (input.color_space == JCS_UNKNOWN) {
+      os << input.components;
+    }
+  }
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const CompressParams& jparams) {
+  os << "Q" << jparams.quality;  // Tags are concatenated with no separator.
+  os << SamplingId(jparams);
+  if (jparams.set_jpeg_colorspace) {
+    os << "JpegColor" << ColorSpaceName(jparams.jpeg_color_space);
+  }
+  if (!jparams.comp_ids.empty()) {
+    os << "CID";
+    for (size_t i = 0; i < jparams.comp_ids.size(); ++i) {
+      os << jparams.comp_ids[i];
+    }
+  }
+  if (!jparams.quant_indexes.empty()) {
+    os << "QIDX";
+    for (size_t i = 0; i < jparams.quant_indexes.size(); ++i) {
+      os << jparams.quant_indexes[i];
+    }
+    for (const auto& table : jparams.quant_tables) {  // Only with QIDX.
+      os << "TABLE" << table.slot_idx << "T" << table.table_type << "F"
+         << table.scale_factor
+         << (table.add_raw          ? "R"
+             : table.force_baseline ? "B"
+                                    : "");
+    }
+  }
+  if (jparams.progressive_mode >= 0) {
+    os << "P" << jparams.progressive_mode;
+  } else if (jparams.simple_progression) {
+    os << "Psimple";
+  }
+  if (jparams.optimize_coding == 1) {
+    JXL_CHECK(jparams.progressive_mode <= 0 && !jparams.simple_progression);
+    os << "OptimizedCode";
+  } else if (jparams.optimize_coding == 0) {
+    JXL_CHECK(jparams.progressive_mode <= 0 && !jparams.simple_progression);
+    os << "FixedCode";
+    if (jparams.use_flat_dc_luma_code) {
+      os << "FlatDCLuma";
+    } else if (jparams.omit_standard_tables) {
+      os << "OmitDHT";
+    }
+  }
+  if (!jparams.use_adaptive_quantization) {
+    os << "NoAQ";
+  }
+  if (jparams.restart_interval > 0) {
+    os << "R" << jparams.restart_interval;
+  }
+  if (jparams.restart_in_rows > 0) {
+    os << "RR" << jparams.restart_in_rows;
+  }
+  if (jparams.xyb_mode) {
+    os << "XYB";
+  } else if (jparams.libjpeg_mode) {
+    os << "Libjpeg";
+  }
+  if (jparams.override_JFIF >= 0) {  // Negative means "not overridden".
+    os << (jparams.override_JFIF ? "AddJFIF" : "NoJFIF");
+  }
+  if (jparams.override_Adobe >= 0) {
+    os << (jparams.override_Adobe ? "AddAdobe" : "NoAdobe");
+  }
+  if (jparams.add_marker) {
+    os << "AddMarker";
+  }
+  if (!jparams.icc.empty()) {
+    os << "ICCSize" << jparams.icc.size();
+  }
+  if (jparams.smoothing_factor != 0) {
+    os << "SF" << jparams.smoothing_factor;
+  }
+  return os;
+}
+
+// Sets *channels to the component count implied by `colorspace`. For
+// JCS_UNKNOWN the caller-supplied value is kept and only checked to be at
+// most 4; any other color space aborts.
+void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels) {
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      *channels = 1;
+      break;
+    case JCS_RGB:
+    case JCS_YCbCr:
+      *channels = 3;
+      break;
+    case JCS_CMYK:
+    case JCS_YCCK:
+      *channels = 4;
+      break;
+    case JCS_UNKNOWN:
+      JXL_CHECK(*channels <= 4);
+      break;
+    default:
+      JXL_ABORT();
+  }
+}
+
+// Converts one RGB triple to YCbCr using the weights below; both chroma
+// outputs carry a +0.5 offset.
+void RGBToYCbCr(float r, float g, float b, float* y, float* cb, float* cr) {
+  const float luma = 0.299f * r + 0.587f * g + 0.114f * b;
+  const float chroma_b = -0.168736f * r - 0.331264f * g + 0.5f * b + 0.5f;
+  const float chroma_r = 0.5f * r - 0.418688f * g - 0.081312f * b + 0.5f;
+  *y = luma;
+  *cb = chroma_b;
+  *cr = chroma_r;
+}
+
+// Converts one 8-bit RGB pixel to `colorspace` and stores it at `out` in the
+// requested sample format.
+//
+//   input_rgb        three 8-bit samples (R, G, B).
+//   out              destination for num_channels samples of data_type.
+//   colorspace       target color space; JCS_UNKNOWN copies the RGB samples
+//                    and repeats B for any channel beyond the third.
+//   num_channels     number of samples written to `out`.
+//   data_type        UINT8 stores the bytes as is, UINT16 replicates each
+//                    byte into both halves, FLOAT stores byte / 255.
+//   swap_endianness  byte-swap every multi-byte sample before storing it.
+//
+// Aborts on a color space that is not handled below.
+void ConvertPixel(const uint8_t* input_rgb, uint8_t* out,
+                  J_COLOR_SPACE colorspace, size_t num_channels,
+                  JpegliDataType data_type = JPEGLI_TYPE_UINT8,
+                  bool swap_endianness = false) {
+  const float kMul = 255.0f;
+  float r = input_rgb[0] / kMul;
+  float g = input_rgb[1] / kMul;
+  float b = input_rgb[2] / kMul;
+  uint8_t out8[MAX_COMPONENTS];
+  if (colorspace == JCS_GRAYSCALE) {
+    const float Y = 0.299f * r + 0.587f * g + 0.114f * b;
+    out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+  } else if (colorspace == JCS_RGB || colorspace == JCS_UNKNOWN) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      out8[c] = input_rgb[std::min<size_t>(2, c)];
+    }
+  } else if (colorspace == JCS_YCbCr) {
+    float Y, Cb, Cr;
+    RGBToYCbCr(r, g, b, &Y, &Cb, &Cr);
+    out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+    out8[1] = static_cast<uint8_t>(std::round(Cb * kMul));
+    out8[2] = static_cast<uint8_t>(std::round(Cr * kMul));
+  } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK) {
+    const float K = 1.0f - std::max(r, std::max(g, b));
+    if (K < 1.0f) {
+      const float scaleK = 1.0f / (1.0f - K);
+      r *= scaleK;
+      g *= scaleK;
+      b *= scaleK;
+    } else {
+      // Pure black: 1 / (1 - K) would be infinite and 0 * inf is NaN, whose
+      // conversion to uint8_t is undefined. Use the limit that neutral colors
+      // approach as they darken, i.e. fully scaled components.
+      r = g = b = 1.0f;
+    }
+    if (colorspace == JCS_CMYK) {
+      out8[0] = static_cast<uint8_t>(std::round((1.0f - r) * kMul));
+      out8[1] = static_cast<uint8_t>(std::round((1.0f - g) * kMul));
+      out8[2] = static_cast<uint8_t>(std::round((1.0f - b) * kMul));
+    } else if (colorspace == JCS_YCCK) {
+      float Y, Cb, Cr;
+      RGBToYCbCr(r, g, b, &Y, &Cb, &Cr);
+      out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+      out8[1] = static_cast<uint8_t>(std::round(Cb * kMul));
+      out8[2] = static_cast<uint8_t>(std::round(Cr * kMul));
+    }
+    out8[3] = static_cast<uint8_t>(std::round(K * kMul));
+  } else {
+    JXL_ABORT("Colorspace %d not supported", colorspace);
+  }
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    memcpy(out, out8, num_channels);
+  } else if (data_type == JPEGLI_TYPE_UINT16) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      uint16_t val = (out8[c] << 8) + out8[c];
+      val |= 0x40;  // Make little-endian and big-endian asymmetric
+      if (swap_endianness) {
+        val = JXL_BSWAP16(val);
+      }
+      memcpy(&out[sizeof(val) * c], &val, sizeof(val));
+    }
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      float val = out8[c] / 255.0f;
+      if (swap_endianness) {
+        val = BSwapFloat(val);
+      }
+      memcpy(&out[sizeof(val) * c], &val, sizeof(val));
+    }
+  }
+}
+
+// Collapses a three-samples-per-pixel UINT8 image to one sample per pixel, in
+// place. RGB pixels go through ConvertPixel, YCbCr pixels keep their first
+// sample. No-op when the image is already grayscale.
+void ConvertToGrayscale(TestImage* img) {
+  if (img->color_space == JCS_GRAYSCALE) return;
+  JXL_CHECK(img->data_type == JPEGLI_TYPE_UINT8);
+  const bool from_rgb = (img->color_space == JCS_RGB);
+  const bool from_ycbcr = (img->color_space == JCS_YCbCr);
+  const size_t num_bytes = img->pixels.size();
+  // The write index never overtakes the read index, so converting in place
+  // is safe.
+  for (size_t src = 0, dst = 0; src < num_bytes; src += 3, ++dst) {
+    if (from_rgb) {
+      ConvertPixel(&img->pixels[src], &img->pixels[dst], JCS_GRAYSCALE, 1);
+    } else if (from_ycbcr) {
+      img->pixels[dst] = img->pixels[src];
+    }
+  }
+  img->pixels.resize(num_bytes / 3);
+  img->color_space = JCS_GRAYSCALE;
+  img->components = 1;
+}
+
+// Fills img->pixels with a centered crop of the flower test image, converted
+// to the image's color space, sample type and byte order. A zero xsize/ysize
+// is replaced by the source dimension; img->components is set from the color
+// space.
+void GeneratePixels(TestImage* img) {
+  const std::vector<uint8_t> imgdata = ReadTestData("jxl/flower/flower.pnm");
+  size_t xsize, ysize, channels, bitdepth;
+  std::vector<uint8_t> pixels;
+  JXL_CHECK(ReadPNM(imgdata, &xsize, &ysize, &channels, &bitdepth, &pixels));
+  if (img->xsize == 0) img->xsize = xsize;
+  if (img->ysize == 0) img->ysize = ysize;
+  JXL_CHECK(img->xsize <= xsize);
+  JXL_CHECK(img->ysize <= ysize);
+  JXL_CHECK(3 == channels);
+  JXL_CHECK(8 == bitdepth);
+  SetNumChannels(img->color_space, &img->components);
+
+  // Source and destination geometry, in bytes.
+  const size_t src_pixel_bytes = channels;
+  const size_t src_row_bytes = xsize * src_pixel_bytes;
+  const size_t dst_pixel_bytes =
+      jpegli_bytes_per_sample(img->data_type) * img->components;
+  const size_t dst_row_bytes = img->xsize * dst_pixel_bytes;
+  const size_t crop_x = (xsize - img->xsize) / 2;
+  const size_t crop_y = (ysize - img->ysize) / 2;
+
+  // Swap whenever the requested byte order differs from the host's.
+  const bool want_le = (img->endianness == JPEGLI_LITTLE_ENDIAN);
+  const bool want_be = (img->endianness == JPEGLI_BIG_ENDIAN);
+  const bool swap_endianness = IsLittleEndian() ? want_be : want_le;
+
+  img->pixels.resize(img->ysize * dst_row_bytes);
+  for (size_t row = 0; row < img->ysize; ++row) {
+    const size_t src_row_start = (crop_y + row) * src_row_bytes;
+    const size_t dst_row_start = row * dst_row_bytes;
+    for (size_t col = 0; col < img->xsize; ++col) {
+      const size_t src_idx = src_row_start + (crop_x + col) * src_pixel_bytes;
+      const size_t dst_idx = dst_row_start + col * dst_pixel_bytes;
+      ConvertPixel(&pixels[src_idx], &img->pixels[dst_idx], img->color_space,
+                   img->components, img->data_type, swap_endianness);
+    }
+  }
+}
+
+void GenerateRawData(const CompressParams& jparams, TestImage* img) {
+  for (size_t c = 0; c < img->components; ++c) {  // One plane per component.
+    size_t xsize = jparams.comp_width(*img, c);
+    size_t ysize = jparams.comp_height(*img, c);
+    size_t factor_y = jparams.max_v_sample() / jparams.v_samp(c);
+    size_t factor_x = jparams.max_h_sample() / jparams.h_samp(c);
+    size_t factor = factor_x * factor_y;  // Pixels averaged per sample.
+    std::vector<uint8_t> plane(ysize * xsize);
+    size_t bytes_per_pixel = img->components;  // Indexes one byte per sample.
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        int result = 0;
+        for (size_t iy = 0; iy < factor_y; ++iy) {
+          size_t yy = std::min(y * factor_y + iy, img->ysize - 1);  // Clamp.
+          for (size_t ix = 0; ix < factor_x; ++ix) {
+            size_t xx = std::min(x * factor_x + ix, img->xsize - 1);  // Clamp.
+            size_t pixel_ix = (yy * img->xsize + xx) * bytes_per_pixel + c;
+            result += img->pixels[pixel_ix];
+          }
+        }
+        result = static_cast<uint8_t>((result + factor / 2) / factor);  // Mean.
+        plane[y * xsize + x] = result;
+      }
+    }
+    img->raw_data.emplace_back(std::move(plane));  // Appended in order of c.
+  }
+}
+
+// Appends one synthetic coefficient plane per component to img->coeffs.
+// Coefficient k of block (bx, by) is (bx - by) / (k + 1), using integer
+// division; blocks are stored in row-major order.
+void GenerateCoeffs(const CompressParams& jparams, TestImage* img) {
+  for (size_t c = 0; c < img->components; ++c) {
+    const int blocks_wide = jparams.comp_width(*img, c) / DCTSIZE;
+    const int blocks_high = jparams.comp_height(*img, c) / DCTSIZE;
+    std::vector<JCOEF> plane(blocks_high * blocks_wide * DCTSIZE2);
+    JCOEF* coef = plane.data();
+    for (int by = 0; by < blocks_high; ++by) {
+      for (int bx = 0; bx < blocks_wide; ++bx) {
+        const int diff = bx - by;
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          *coef++ = diff / (k + 1);
+        }
+      }
+    }
+    img->coeffs.emplace_back(std::move(plane));
+  }
+}
+
+void EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      j_compress_ptr cinfo) {  // Configure cinfo, then encode.
+  cinfo->image_width = input.xsize;
+  cinfo->image_height = input.ysize;
+  cinfo->input_components = input.components;
+  if (jparams.xyb_mode) {
+    jpegli_set_xyb_mode(cinfo);
+  }
+  if (jparams.libjpeg_mode) {
+    jpegli_enable_adaptive_quantization(cinfo, FALSE);
+    jpegli_use_standard_quant_tables(cinfo);
+    jpegli_set_progressive_level(cinfo, 0);
+  }
+  jpegli_set_defaults(cinfo);
+  cinfo->in_color_space = input.color_space;
+  jpegli_default_colorspace(cinfo);
+  if (jparams.override_JFIF >= 0) {  // Negative means "leave the default".
+    cinfo->write_JFIF_header = jparams.override_JFIF;
+  }
+  if (jparams.override_Adobe >= 0) {
+    cinfo->write_Adobe_marker = jparams.override_Adobe;
+  }
+  if (jparams.set_jpeg_colorspace) {
+    jpegli_set_colorspace(cinfo, jparams.jpeg_color_space);
+  }
+  if (!jparams.comp_ids.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].component_id = jparams.comp_ids[c];
+    }
+  }
+  if (!jparams.h_sampling.empty()) {  // v_sampling is indexed alongside.
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].h_samp_factor = jparams.h_sampling[c];
+      cinfo->comp_info[c].v_samp_factor = jparams.v_sampling[c];
+    }
+  }
+  jpegli_set_quality(cinfo, jparams.quality, TRUE);
+  if (!jparams.quant_indexes.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].quant_tbl_no = jparams.quant_indexes[c];
+    }
+    for (const auto& table : jparams.quant_tables) {
+      if (table.add_raw) {  // Install quantval directly, bypassing scaling.
+        cinfo->quant_tbl_ptrs[table.slot_idx] =
+            jpegli_alloc_quant_table((j_common_ptr)cinfo);
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          cinfo->quant_tbl_ptrs[table.slot_idx]->quantval[k] =
+              table.quantval[k];
+        }
+        cinfo->quant_tbl_ptrs[table.slot_idx]->sent_table = FALSE;
+      } else {
+        jpegli_add_quant_table(cinfo, table.slot_idx, &table.basic_table[0],
+                               table.scale_factor, table.force_baseline);
+      }
+    }
+  }
+  if (jparams.simple_progression) {
+    jpegli_simple_progression(cinfo);
+    JXL_CHECK(jparams.progressive_mode == -1);  // The two are exclusive.
+  }
+  if (jparams.progressive_mode > 2) {  // Modes 3 and up index kTestScript.
+    const ScanScript& script = kTestScript[jparams.progressive_mode - 3];
+    cinfo->scan_info = script.scans;
+    cinfo->num_scans = script.num_scans;
+  } else if (jparams.progressive_mode >= 0) {
+    jpegli_set_progressive_level(cinfo, jparams.progressive_mode);
+  }
+  jpegli_set_input_format(cinfo, input.data_type, input.endianness);
+  jpegli_enable_adaptive_quantization(cinfo, jparams.use_adaptive_quantization);
+  cinfo->restart_interval = jparams.restart_interval;
+  cinfo->restart_in_rows = jparams.restart_in_rows;
+  cinfo->smoothing_factor = jparams.smoothing_factor;
+  if (jparams.optimize_coding == 1) {  // Other values keep the default.
+    cinfo->optimize_coding = TRUE;
+  } else if (jparams.optimize_coding == 0) {
+    cinfo->optimize_coding = FALSE;
+  }
+  cinfo->raw_data_in = !input.raw_data.empty();
+  if (jparams.optimize_coding == 0 && jparams.use_flat_dc_luma_code) {
+    JHUFF_TBL* tbl = cinfo->dc_huff_tbl_ptrs[0];
+    memset(tbl, 0, sizeof(*tbl));
+    tbl->bits[4] = 15;  // All 15 symbols get codes of the same length.
+    for (int i = 0; i < 15; ++i) tbl->huffval[i] = i;
+  }
+  if (input.coeffs.empty()) {  // Pixel or raw-data input.
+    bool write_all_tables = TRUE;
+    if (jparams.optimize_coding == 0 && !jparams.use_flat_dc_luma_code &&
+        jparams.omit_standard_tables) {
+      write_all_tables = FALSE;
+      cinfo->dc_huff_tbl_ptrs[0]->sent_table = TRUE;  // Flag as already sent.
+      cinfo->dc_huff_tbl_ptrs[1]->sent_table = TRUE;
+      cinfo->ac_huff_tbl_ptrs[0]->sent_table = TRUE;
+      cinfo->ac_huff_tbl_ptrs[1]->sent_table = TRUE;
+    }
+    jpegli_start_compress(cinfo, write_all_tables);
+    if (jparams.add_marker) {
+      jpegli_write_marker(cinfo, kSpecialMarker0, kMarkerData,
+                          sizeof(kMarkerData));
+      jpegli_write_m_header(cinfo, kSpecialMarker1, sizeof(kMarkerData));
+      for (size_t p = 0; p < sizeof(kMarkerData); ++p) {
+        jpegli_write_m_byte(cinfo, kMarkerData[p]);
+      }
+      for (size_t i = 0; i < kMarkerSequenceLen; ++i) {
+        jpegli_write_marker(cinfo, kMarkerSequence[i], kMarkerData,
+                            ((i + 2) % sizeof(kMarkerData)));  // Varying size.
+      }
+    }
+    if (!jparams.icc.empty()) {
+      jpegli_write_icc_profile(cinfo, jparams.icc.data(), jparams.icc.size());
+    }
+  }
+  if (cinfo->raw_data_in) {
+    // Need to copy because jpeg API requires non-const pointers.
+    std::vector<std::vector<uint8_t>> raw_data = input.raw_data;
+    size_t max_lines = jparams.max_v_sample() * DCTSIZE;
+    std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+    std::vector<JSAMPARRAY> data(cinfo->num_components);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      rowdata[c].resize(jparams.v_samp(c) * DCTSIZE);
+      data[c] = &rowdata[c][0];
+    }
+    while (cinfo->next_scanline < cinfo->image_height) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t cwidth = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t cheight = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = jparams.v_samp(c) * DCTSIZE;
+        size_t y0 = (cinfo->next_scanline / max_lines) * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =  // Rows past the plane's end are passed as nullptr.
+              (y0 + i < cheight ? &raw_data[c][(y0 + i) * cwidth] : nullptr);
+        }
+      }
+      size_t num_lines = jpegli_write_raw_data(cinfo, &data[0], max_lines);
+      JXL_CHECK(num_lines == max_lines);
+    }
+  } else if (!input.coeffs.empty()) {  // Coefficient input.
+    j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+    jvirt_barray_ptr* coef_arrays = reinterpret_cast<jvirt_barray_ptr*>((
+        *cinfo->mem->alloc_small)(
+        comptr, JPOOL_IMAGE, cinfo->num_components * sizeof(jvirt_barray_ptr)));
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize_blocks = jparams.comp_width(input, c) / DCTSIZE;
+      size_t ysize_blocks = jparams.comp_height(input, c) / DCTSIZE;
+      coef_arrays[c] = (*cinfo->mem->request_virt_barray)(
+          comptr, JPOOL_IMAGE, FALSE, xsize_blocks, ysize_blocks,
+          cinfo->comp_info[c].v_samp_factor);
+    }
+    jpegli_write_coefficients(cinfo, coef_arrays);
+    if (jparams.add_marker) {
+      jpegli_write_marker(cinfo, kSpecialMarker0, kMarkerData,
+                          sizeof(kMarkerData));
+      jpegli_write_m_header(cinfo, kSpecialMarker1, sizeof(kMarkerData));
+      for (size_t p = 0; p < sizeof(kMarkerData); ++p) {
+        jpegli_write_m_byte(cinfo, kMarkerData[p]);
+      }
+    }
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      for (size_t by = 0; by < comp->height_in_blocks; ++by) {
+        JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+            comptr, coef_arrays[c], by, 1, true);
+        size_t stride = comp->width_in_blocks * sizeof(JBLOCK);
+        size_t offset = by * comp->width_in_blocks * DCTSIZE2;
+        memcpy(ba[0], &input.coeffs[c][offset], stride);  // One block row.
+      }
+    }
+  } else {  // Interleaved pixel input, one scanline per call.
+    size_t stride = cinfo->image_width * cinfo->input_components *
+                    jpegli_bytes_per_sample(input.data_type);
+    std::vector<uint8_t> row_bytes(stride);
+    for (size_t y = 0; y < cinfo->image_height; ++y) {
+      memcpy(&row_bytes[0], &input.pixels[y * stride], stride);
+      JSAMPROW row[] = {row_bytes.data()};
+      jpegli_write_scanlines(cinfo, row, 1);
+    }
+  }
+  jpegli_finish_compress(cinfo);
+}
+
+// Encodes `input` into an in-memory JPEG. On success stores the encoded
+// bytes in *compressed and returns true; otherwise returns false and leaves
+// *compressed untouched. The encoder object and its output buffer are
+// released in both cases.
+bool EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      std::vector<uint8_t>* compressed) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    return true;
+  };
+  const bool encoded_ok = try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  if (encoded_ok) {
+    compressed->assign(buffer, buffer + buffer_size);
+  }
+  // free() accepts a null pointer, so no guard is needed.
+  std::free(buffer);
+  return encoded_ok;
+}
+
+// Applies the color-quantization settings configured for `scan_number`.
+//
+// Uses the first entry of dparams.scan_params whose max_scan_number is not
+// below scan_number. Does nothing when there is no such entry or when
+// dparams.quantize_colors is off. `is_jpegli` selects jpegli_new_colormap()
+// over jpeg_new_colormap() when an external colormap replaces one that was
+// already installed.
+void SetScanDecompressParams(const DecompressParams& dparams,
+                             j_decompress_ptr cinfo, int scan_number,
+                             bool is_jpegli) {
+  const ScanDecompressParams* sparams = nullptr;
+  for (const auto& sp : dparams.scan_params) {
+    if (scan_number <= sp.max_scan_number) {
+      sparams = &sp;
+      break;
+    }
+  }
+  if (sparams == nullptr) {
+    return;
+  }
+  if (dparams.quantize_colors) {
+    cinfo->dither_mode = sparams->dither_mode;
+    if (sparams->color_quant_mode == CQUANT_1PASS) {
+      cinfo->two_pass_quantize = FALSE;
+      cinfo->colormap = nullptr;
+    } else if (sparams->color_quant_mode == CQUANT_2PASS) {
+      // Compare, do not assign: `=` here would silently force RGB output and
+      // make the check always pass.
+      JXL_CHECK(cinfo->out_color_space == JCS_RGB);
+      cinfo->two_pass_quantize = TRUE;
+      cinfo->colormap = nullptr;
+    } else if (sparams->color_quant_mode == CQUANT_EXTERNAL) {
+      JXL_CHECK(cinfo->out_color_space == JCS_RGB);
+      cinfo->two_pass_quantize = FALSE;
+      // Remember whether a colormap was already installed before replacing it.
+      bool have_colormap = cinfo->colormap != nullptr;
+      cinfo->actual_number_of_colors = kTestColorMapNumColors;
+      cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+          reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE,
+          cinfo->actual_number_of_colors, 3);
+      jxl::msan::UnpoisonMemory(cinfo->colormap, 3 * sizeof(JSAMPROW));
+      // kTestColorMap entries are packed as 0xRRGGBB; split them into the
+      // three colormap rows.
+      for (int i = 0; i < kTestColorMapNumColors; ++i) {
+        cinfo->colormap[0][i] = (kTestColorMap[i] >> 16) & 0xff;
+        cinfo->colormap[1][i] = (kTestColorMap[i] >> 8) & 0xff;
+        cinfo->colormap[2][i] = (kTestColorMap[i] >> 0) & 0xff;
+      }
+      if (have_colormap) {
+        if (is_jpegli) {
+          jpegli_new_colormap(cinfo);
+        } else {
+          jpeg_new_colormap(cinfo);
+        }
+      }
+    } else if (sparams->color_quant_mode == CQUANT_REUSE) {
+      JXL_CHECK(cinfo->out_color_space == JCS_RGB);
+      JXL_CHECK(cinfo->colormap);
+    }
+  }
+}
+
+void SetDecompressParams(const DecompressParams& dparams,
+                         j_decompress_ptr cinfo, bool is_jpegli) {
+  cinfo->do_block_smoothing = dparams.do_block_smoothing;  // Copy settings.
+  cinfo->do_fancy_upsampling = dparams.do_fancy_upsampling;
+  if (dparams.output_mode == RAW_DATA) {
+    cinfo->raw_data_out = TRUE;
+  }
+  if (dparams.set_out_color_space) {
+    cinfo->out_color_space = dparams.out_color_space;
+    if (dparams.out_color_space == JCS_UNKNOWN) {
+      cinfo->jpeg_color_space = JCS_UNKNOWN;  // Override the file's space too.
+    }
+  }
+  cinfo->scale_num = dparams.scale_num;
+  cinfo->scale_denom = dparams.scale_denom;
+  cinfo->quantize_colors = dparams.quantize_colors;
+  cinfo->desired_number_of_colors = dparams.desired_number_of_colors;
+  if (!dparams.scan_params.empty()) {
+    if (cinfo->buffered_image) {
+      for (const auto& sparams : dparams.scan_params) {  // Enable all used.
+        if (sparams.color_quant_mode == CQUANT_1PASS) {
+          cinfo->enable_1pass_quant = TRUE;
+        } else if (sparams.color_quant_mode == CQUANT_2PASS) {
+          cinfo->enable_2pass_quant = TRUE;
+        } else if (sparams.color_quant_mode == CQUANT_EXTERNAL) {
+          cinfo->enable_external_quant = TRUE;
+        }
+      }
+      SetScanDecompressParams(dparams, cinfo, 1, is_jpegli);  // First scan.
+    } else {
+      SetScanDecompressParams(dparams, cinfo, kLastScan, is_jpegli);
+    }
+  }
+  if (is_jpegli) {  // The output format is only set on the jpegli path.
+    jpegli_set_output_format(cinfo, dparams.data_type, dparams.endianness);
+  }
+}
+
+// Aborts unless the decoder's saved-marker list holds a marker of type
+// `marker_type` whose payload equals kMarkerData. The whole list is walked so
+// that every entry gets unpoisoned for MSan.
+void CheckMarkerPresent(j_decompress_ptr cinfo, uint8_t marker_type) {
+  bool marker_found = false;
+  jpeg_saved_marker_ptr cur = cinfo->marker_list;
+  while (cur != nullptr) {
+    jxl::msan::UnpoisonMemory(cur, sizeof(*cur));
+    jxl::msan::UnpoisonMemory(cur->data, cur->data_length);
+    const bool same_type = (cur->marker == marker_type);
+    const bool same_size = (cur->data_length == sizeof(kMarkerData));
+    if (same_type && same_size &&
+        memcmp(cur->data, kMarkerData, sizeof(kMarkerData)) == 0) {
+      marker_found = true;
+    }
+    cur = cur->next;
+  }
+  JXL_CHECK(marker_found);
+}
+
+void VerifyHeader(const CompressParams& jparams, j_decompress_ptr cinfo) {
+  if (jparams.set_jpeg_colorspace) {  // Check only what jparams overrides.
+    JXL_CHECK(cinfo->jpeg_color_space == jparams.jpeg_color_space);
+  }
+  if (jparams.override_JFIF >= 0) {
+    JXL_CHECK(cinfo->saw_JFIF_marker == jparams.override_JFIF);
+  }
+  if (jparams.override_Adobe >= 0) {
+    JXL_CHECK(cinfo->saw_Adobe_marker == jparams.override_Adobe);
+  }
+  if (jparams.add_marker) {
+    CheckMarkerPresent(cinfo, kSpecialMarker0);
+    CheckMarkerPresent(cinfo, kSpecialMarker1);
+  }
+  jxl::msan::UnpoisonMemory(
+      cinfo->comp_info, cinfo->num_components * sizeof(cinfo->comp_info[0]));
+  int max_h_samp_factor = 1;  // Recomputed from the components below.
+  int max_v_samp_factor = 1;
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    if (!jparams.comp_ids.empty()) {
+      JXL_CHECK(comp->component_id == jparams.comp_ids[i]);
+    }
+    if (!jparams.h_sampling.empty()) {
+      JXL_CHECK(comp->h_samp_factor == jparams.h_sampling[i]);
+    }
+    if (!jparams.v_sampling.empty()) {
+      JXL_CHECK(comp->v_samp_factor == jparams.v_sampling[i]);
+    }
+    if (!jparams.quant_indexes.empty()) {
+      JXL_CHECK(comp->quant_tbl_no == jparams.quant_indexes[i]);
+    }
+    max_h_samp_factor = std::max(max_h_samp_factor, comp->h_samp_factor);
+    max_v_samp_factor = std::max(max_v_samp_factor, comp->v_samp_factor);
+  }
+  JXL_CHECK(max_h_samp_factor == cinfo->max_h_samp_factor);
+  JXL_CHECK(max_v_samp_factor == cinfo->max_v_samp_factor);
+  int referenced_tables[NUM_QUANT_TBLS] = {};  // 1 if a component uses it.
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    JXL_CHECK(comp->width_in_blocks ==
+              DivCeil(cinfo->image_width * comp->h_samp_factor,
+                      max_h_samp_factor * DCTSIZE));
+    JXL_CHECK(comp->height_in_blocks ==
+              DivCeil(cinfo->image_height * comp->v_samp_factor,
+                      max_v_samp_factor * DCTSIZE));
+    referenced_tables[comp->quant_tbl_no] = 1;
+  }
+  for (const auto& table : jparams.quant_tables) {
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[table.slot_idx];
+    if (!referenced_tables[table.slot_idx]) {
+      JXL_CHECK(quant_table == nullptr);  // Unreferenced slots stay empty.
+      continue;
+    }
+    JXL_CHECK(quant_table != nullptr);
+    jxl::msan::UnpoisonMemory(quant_table, sizeof(*quant_table));
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      JXL_CHECK(quant_table->quantval[k] == table.quantval[k]);
+    }
+  }
+}
+
+void VerifyScanHeader(const CompressParams& jparams, j_decompress_ptr cinfo) {
+  JXL_CHECK(cinfo->input_scan_number > 0);
+  if (cinfo->progressive_mode) {
+    JXL_CHECK(cinfo->Ss != 0 || cinfo->Se != 63);  // Not the full 0..63 band.
+  } else {
+    JXL_CHECK(cinfo->Ss == 0 && cinfo->Se == 63);
+  }
+  if (jparams.progressive_mode > 2) {  // Custom script: compare scan fields.
+    JXL_CHECK(jparams.progressive_mode < 3 + kNumTestScripts);
+    const ScanScript& script = kTestScript[jparams.progressive_mode - 3];
+    JXL_CHECK(cinfo->input_scan_number <= script.num_scans);
+    const jpeg_scan_info& scan = script.scans[cinfo->input_scan_number - 1];
+    JXL_CHECK(cinfo->comps_in_scan == scan.comps_in_scan);
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      JXL_CHECK(cinfo->cur_comp_info[i]->component_index ==
+                scan.component_index[i]);
+    }
+    JXL_CHECK(cinfo->Ss == scan.Ss);
+    JXL_CHECK(cinfo->Se == scan.Se);
+    JXL_CHECK(cinfo->Ah == scan.Ah);
+    JXL_CHECK(cinfo->Al == scan.Al);
+  }
+  if (jparams.restart_interval > 0) {  // Takes precedence over rows.
+    JXL_CHECK(cinfo->restart_interval == jparams.restart_interval);
+  } else if (jparams.restart_in_rows > 0) {
+    JXL_CHECK(cinfo->restart_interval ==
+              jparams.restart_in_rows * cinfo->MCUs_per_row);
+  }
+  if (jparams.progressive_mode == 0 && jparams.optimize_coding == 0) {
+    if (cinfo->jpeg_color_space == JCS_RGB) {  // All components use slot 0.
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 0);
+    } else if (cinfo->jpeg_color_space == JCS_YCbCr) {  // Cb, Cr use slot 1.
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 1);
+    } else if (cinfo->jpeg_color_space == JCS_CMYK) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[3].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[3].ac_tbl_no == 0);
+    } else if (cinfo->jpeg_color_space == JCS_YCCK) {  // K shares slot 0.
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[3].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[3].ac_tbl_no == 0);
+    }
+    if (jparams.use_flat_dc_luma_code) {
+      JHUFF_TBL* tbl = cinfo->dc_huff_tbl_ptrs[0];
+      jxl::msan::UnpoisonMemory(tbl, sizeof(*tbl));
+      for (int i = 0; i < 15; ++i) {
+        JXL_CHECK(tbl->huffval[i] == i);  // Symbols 0..14 in order.
+      }
+    }
+  }
+}
+
+// Expands a row of colormap indices in place: the first xsize bytes of row are
+// indices, and on return row holds xsize * components interleaved samples
+// looked up in colormap[c][index]. The caller must provide room for the
+// expanded row. Every index is checked against num_colors.
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  // Expand into a scratch buffer first, since the output overlaps the indices
+  // that are still to be read.
+  std::vector<uint8_t> expanded(xsize * components);
+  uint8_t* dst = expanded.data();
+  const uint8_t* const indices_end = row + xsize;
+  for (const uint8_t* src = row; src != indices_end; ++src) {
+    const uint8_t index = *src;
+    JXL_CHECK(index < num_colors);
+    for (int c = 0; c < components; ++c) {
+      *dst++ = colormap[c][index];
+    }
+  }
+  memcpy(row, expanded.data(), expanded.size());
+}
+
+// Runs one output pass of the decoder behind cinfo and stores the result in
+// *output: interleaved samples in output->pixels, or, when cinfo->raw_data_out
+// is set, one plane per component in output->raw_data. With
+// dparams.crop_output a window starting at one third of the width/height and
+// one third of the image in size is requested; lines outside of it are
+// skipped.
+void ReadOutputPass(j_decompress_ptr cinfo, const DecompressParams& dparams,
+                    TestImage* output) {
+  JDIMENSION crop_x0 = 0;
+  JDIMENSION crop_y0 = 0;
+  JDIMENSION crop_xsize = cinfo->output_width;
+  JDIMENSION crop_ysize = cinfo->output_height;
+  if (dparams.crop_output) {
+    crop_xsize = cinfo->output_width / 3;
+    crop_x0 = crop_xsize;
+    crop_ysize = cinfo->output_height / 3;
+    crop_y0 = crop_ysize;
+    // Offset and width are passed by pointer; afterwards the width has to
+    // agree with what the library reports as output_width.
+    jpeg_crop_scanline(cinfo, &crop_x0, &crop_xsize);
+    JXL_CHECK(crop_xsize == cinfo->output_width);
+  }
+  output->xsize = crop_xsize;
+  output->ysize = crop_ysize;
+  output->components = cinfo->out_color_components;
+  if (cinfo->quantize_colors) {
+    // Unpoison the colormap's row pointers and then each row for msan.
+    jxl::msan::UnpoisonMemory(cinfo->colormap, cinfo->out_color_components *
+                                                   sizeof(cinfo->colormap[0]));
+    for (int c = 0; c < cinfo->out_color_components; ++c) {
+      jxl::msan::UnpoisonMemory(
+          cinfo->colormap[c],
+          cinfo->actual_number_of_colors * sizeof(cinfo->colormap[c][0]));
+    }
+  }
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    // One plane per component, sized in whole DCT blocks.
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      const jpeg_component_info& comp = cinfo->comp_info[c];
+      const size_t plane_xsize = comp.width_in_blocks * DCTSIZE;
+      const size_t plane_ysize = comp.height_in_blocks * DCTSIZE;
+      output->raw_data.emplace_back(plane_ysize * plane_xsize);
+    }
+    const size_t imcu_lines = cinfo->max_v_samp_factor * DCTSIZE;
+    while (cinfo->output_scanline < cinfo->output_height) {
+      JXL_CHECK(cinfo->output_scanline == cinfo->output_iMCU_row * imcu_lines);
+      std::vector<std::vector<JSAMPROW>> row_ptrs(cinfo->num_components);
+      std::vector<JSAMPARRAY> planes(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        const jpeg_component_info& comp = cinfo->comp_info[c];
+        const size_t plane_xsize = comp.width_in_blocks * DCTSIZE;
+        const size_t plane_ysize = comp.height_in_blocks * DCTSIZE;
+        const size_t comp_lines = comp.v_samp_factor * DCTSIZE;
+        const size_t first_line = cinfo->output_iMCU_row * comp_lines;
+        std::vector<JSAMPROW>& rows = row_ptrs[c];
+        rows.resize(comp_lines);
+        for (size_t i = 0; i < comp_lines; ++i) {
+          const size_t line = first_line + i;
+          // Lines past the end of the plane get no destination.
+          rows[i] = line < plane_ysize
+                        ? &output->raw_data[c][line * plane_xsize]
+                        : nullptr;
+        }
+        planes[c] = rows.data();
+      }
+      JXL_CHECK(imcu_lines ==
+                jpeg_read_raw_data(cinfo, planes.data(), imcu_lines));
+    }
+  } else {
+    const size_t row_bytes = output->xsize * output->components;
+    output->pixels.resize(output->ysize * row_bytes);
+    output->color_space = cinfo->out_color_space;
+    if (crop_y0 > 0) {
+      jpeg_skip_scanlines(cinfo, crop_y0);
+    }
+    for (size_t y = 0; y < output->ysize; ++y) {
+      JSAMPLE* line = reinterpret_cast<JSAMPLE*>(&output->pixels[y * row_bytes]);
+      JSAMPROW rows[] = {line};
+      JXL_CHECK(1 == jpeg_read_scanlines(cinfo, rows, 1));
+      jxl::msan::UnpoisonMemory(
+          line, sizeof(JSAMPLE) * cinfo->output_components * output->xsize);
+      if (cinfo->quantize_colors) {
+        // The scanline holds colormap indices; expand them to samples.
+        UnmapColors(line, cinfo->output_width, cinfo->out_color_components,
+                    cinfo->colormap, cinfo->actual_number_of_colors);
+      }
+    }
+    // Skip whatever lies below the cropped window.
+    if (cinfo->output_scanline < cinfo->output_height) {
+      jpeg_skip_scanlines(cinfo, cinfo->output_height - cinfo->output_scanline);
+    }
+  }
+  JXL_CHECK(cinfo->total_iMCU_rows ==
+            DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+// Copies the coefficient blocks of every component from coef_arrays into
+// output->coeffs (one vector per component, block rows stored back to back)
+// and fills in the image dimensions, component count and color space from
+// cinfo.
+void CopyCoefficients(j_decompress_ptr cinfo, jvirt_barray_ptr* coef_arrays,
+                      TestImage* output) {
+  output->xsize = cinfo->image_width;
+  output->ysize = cinfo->image_height;
+  output->components = cinfo->num_components;
+  output->color_space = cinfo->out_color_space;
+  j_common_ptr common = reinterpret_cast<j_common_ptr>(cinfo);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info& comp = cinfo->comp_info[c];
+    std::vector<JCOEF> plane(comp.width_in_blocks * comp.height_in_blocks *
+                             DCTSIZE2);
+    // Bytes in one row of blocks.
+    const size_t row_bytes = comp.width_in_blocks * sizeof(JBLOCK);
+    for (size_t by = 0; by < comp.height_in_blocks; ++by) {
+      // Request one block row at a time from the virtual array.
+      JBLOCKARRAY block_rows = (*cinfo->mem->access_virt_barray)(
+          common, coef_arrays[c], by, 1, true);
+      const size_t first_coeff = by * comp.width_in_blocks * DCTSIZE2;
+      memcpy(&plane[first_coeff], block_rows[0], row_bytes);
+    }
+    output->coeffs.emplace_back(std::move(plane));
+  }
+}
+
+// Verifies that an image encoded with libjpegli can be decoded with libjpeg,
+// and checks that the jpeg coding metadata matches jparams.
+// Decoding runs in buffered-image mode with one output pass per input scan;
+// each pass is appended to *output_progression. With dparams.skip_scans the
+// even-numbered scans are consumed without producing an output pass.
+void DecodeAllScansWithLibjpeg(const CompressParams& jparams,
+                               const DecompressParams& dparams,
+                               const std::vector<uint8_t>& compressed,
+                               std::vector<TestImage>* output_progression) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    // After this point a libjpeg error makes the lambda return false.
+    ERROR_HANDLER_SETUP(jpeg);
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, compressed.data(), compressed.size());
+    if (jparams.add_marker) {
+      for (int marker : {kSpecialMarker0, kSpecialMarker1}) {
+        jpeg_save_markers(&cinfo, marker, 0xffff);
+      }
+    }
+    const int header_status =
+        jpeg_read_header(&cinfo, /*require_image=*/TRUE);
+    JXL_CHECK(JPEG_REACHED_SOS == header_status);
+    cinfo.buffered_image = TRUE;
+    SetDecompressParams(dparams, &cinfo, /*is_jpegli=*/false);
+    VerifyHeader(jparams, &cinfo);
+    JXL_CHECK(jpeg_start_decompress(&cinfo));
+    // In buffered image mode, starting decompression must not consume the
+    // whole input.
+    JXL_CHECK(!jpeg_input_complete(&cinfo));
+    JXL_CHECK(cinfo.output_scan_number == 0);
+    // SOS markers seen so far; reading the header already consumed the first.
+    int num_sos_seen = 1;
+    while (!jpeg_input_complete(&cinfo)) {
+      JXL_CHECK(cinfo.input_scan_number == num_sos_seen);
+      const bool skip_this_scan =
+          dparams.skip_scans && (cinfo.input_scan_number % 2) != 1;
+      if (skip_this_scan) {
+        // Consume input until the next scan starts or the image ends.
+        int status;
+        do {
+          status = jpeg_consume_input(&cinfo);
+        } while (status != JPEG_REACHED_SOS && status != JPEG_REACHED_EOI);
+        if (status == JPEG_REACHED_SOS) ++num_sos_seen;
+        continue;
+      }
+      SetScanDecompressParams(dparams, &cinfo, cinfo.input_scan_number,
+                              /*is_jpegli=*/false);
+      JXL_CHECK(jpeg_start_output(&cinfo, cinfo.input_scan_number));
+      // Starting an output pass sets output_scan_number and leaves
+      // input_scan_number alone.
+      JXL_CHECK(cinfo.output_scan_number == cinfo.input_scan_number);
+      JXL_CHECK(cinfo.input_scan_number == num_sos_seen);
+      VerifyScanHeader(jparams, &cinfo);
+      TestImage pass_image;
+      ReadOutputPass(&cinfo, dparams, &pass_image);
+      output_progression->emplace_back(std::move(pass_image));
+      // Reading scanlines or raw data leaves both scan numbers unchanged;
+      // this is only asserted for non-progressive input.
+      if (!cinfo.progressive_mode) {
+        JXL_CHECK(cinfo.input_scan_number == num_sos_seen);
+        JXL_CHECK(cinfo.output_scan_number == cinfo.input_scan_number);
+      }
+      JXL_CHECK(jpeg_finish_output(&cinfo));
+      // Finishing the pass reads up to the next SOS marker or EOI.
+      ++num_sos_seen;
+      if (dparams.output_mode == COEFFICIENTS) {
+        jvirt_barray_ptr* coef_arrays = jpeg_read_coefficients(&cinfo);
+        JXL_CHECK(coef_arrays != nullptr);
+        CopyCoefficients(&cinfo, coef_arrays, &output_progression->back());
+      }
+    }
+    JXL_CHECK(jpeg_finish_decompress(&cinfo));
+    return true;
+  };
+  JXL_CHECK(try_catch_block());
+  jpeg_destroy_decompress(&cinfo);
+}
+
+// Decodes the image whose source is already attached to cinfo, using libjpeg,
+// and stores the result in *output. Along the way the header, the ICC profile
+// (when jparams.icc is set) and the scan header are checked against jparams.
+// With dparams.output_mode == COEFFICIENTS the coefficients are copied instead
+// of decoding to samples. The caller owns cinfo and its error handler.
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams, j_decompress_ptr cinfo,
+                       TestImage* output) {
+  if (jparams.add_marker) {
+    jpeg_save_markers(cinfo, kSpecialMarker0, 0xffff);
+    jpeg_save_markers(cinfo, kSpecialMarker1, 0xffff);
+  }
+  if (!jparams.icc.empty()) {
+    // Keep the APP2 segments so the ICC profile can be read back below.
+    jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xffff);
+  }
+  JXL_CHECK(JPEG_REACHED_SOS ==
+            jpeg_read_header(cinfo, /*require_image=*/TRUE));
+  if (!jparams.icc.empty()) {
+    uint8_t* icc_data = nullptr;
+    unsigned int icc_len = 0;
+    JXL_CHECK(jpeg_read_icc_profile(cinfo, &icc_data, &icc_len));
+    JXL_CHECK(icc_data);
+    // The lengths must agree before the bytes are compared: otherwise a
+    // truncated profile would pass and a longer one would read past the end
+    // of jparams.icc.
+    JXL_CHECK(icc_len == jparams.icc.size());
+    jxl::msan::UnpoisonMemory(icc_data, icc_len);
+    JXL_CHECK(0 == memcmp(jparams.icc.data(), icc_data, icc_len));
+    // The profile buffer is released with free() by the caller.
+    free(icc_data);
+  }
+  SetDecompressParams(dparams, cinfo, /*is_jpegli=*/false);
+  VerifyHeader(jparams, cinfo);
+  if (dparams.output_mode == COEFFICIENTS) {
+    jvirt_barray_ptr* coef_arrays = jpeg_read_coefficients(cinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    CopyCoefficients(cinfo, coef_arrays, output);
+  } else {
+    JXL_CHECK(jpeg_start_decompress(cinfo));
+    VerifyScanHeader(jparams, cinfo);
+    ReadOutputPass(cinfo, dparams, output);
+  }
+  JXL_CHECK(jpeg_finish_decompress(cinfo));
+}
+
+// Convenience overload: decodes the in-memory JPEG `compressed` with libjpeg
+// into *output. It creates its own decompress object, installs an error
+// handler, aborts via JXL_CHECK if libjpeg reports an error, and destroys the
+// decompress object afterwards.
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams,
+                       const std::vector<uint8_t>& compressed,
+                       TestImage* output) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    // After this point a libjpeg error makes the lambda return false.
+    ERROR_HANDLER_SETUP(jpeg);
+    jpeg_create_decompress(&cinfo);
+    const uint8_t* const src = compressed.data();
+    const size_t src_len = compressed.size();
+    jpeg_mem_src(&cinfo, src, src_len);
+    DecodeWithLibjpeg(jparams, dparams, &cinfo, output);
+    return true;
+  };
+  JXL_CHECK(try_catch_block());
+  jpeg_destroy_decompress(&cinfo);
+}
+
+// Writes image.pixels to the file fn with a binary PNM header ("P5" for one
+// component, "P6" for three). The sample bytes are written exactly as stored
+// in image.pixels; no byte-order or type conversion is applied.
+void DumpImage(const TestImage& image, const std::string fn) {
+  JXL_CHECK(image.components == 1 || image.components == 3);
+  jxl::FileWrapper f(fn.c_str(), "wb");
+  // The wrapper converts to FILE*; stop with a readable check instead of
+  // handing a null stream to fprintf() when the file could not be opened.
+  FILE* fp = f;
+  JXL_CHECK(fp != nullptr);
+  size_t bytes_per_sample = jpegli_bytes_per_sample(image.data_type);
+  // Shift in 64 bits: a 32-bit shift by 32 (4-byte samples) is undefined.
+  uint32_t maxval =
+      static_cast<uint32_t>((uint64_t{1} << (8 * bytes_per_sample)) - 1);
+  char type = image.components == 1 ? '5' : '6';
+  fprintf(fp, "P%c\n%" PRIuS " %" PRIuS "\n%u\n", type, image.xsize,
+          image.ysize, maxval);
+  fwrite(image.pixels.data(), 1, image.pixels.size(), fp);
+}
+
+// Returns the root-mean-square difference between the samples of input and
+// output, scaled by 255. When both images have pixels, num_lines lines starting
+// at start_line are compared; otherwise the raw_data planes are compared in
+// full. Samples are decoded according to each image's own data_type and
+// endianness. If max_diff is non-null it receives the largest absolute
+// difference, also scaled by 255.
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   size_t start_line, size_t num_lines, double* max_diff) {
+  size_t stride = input.xsize * input.components;
+  // Index of the first compared sample, counted in samples (not bytes).
+  size_t start_sample = start_line * stride;
+  auto get_sample = [&](const TestImage& im, const std::vector<uint8_t>& data,
+                        size_t idx) -> double {
+    size_t bytes_per_sample = jpegli_bytes_per_sample(im.data_type);
+    bool is_little_endian =
+        (im.endianness == JPEGLI_LITTLE_ENDIAN ||
+         (im.endianness == JPEGLI_NATIVE_ENDIAN && IsLittleEndian()));
+    // Convert the sample index to a byte offset only after adding the start
+    // sample, so that multi-byte samples start on the requested line.
+    size_t offset = (start_sample + idx) * bytes_per_sample;
+    // The whole sample has to be inside the buffer, not just its first byte.
+    JXL_CHECK(offset + bytes_per_sample <= data.size());
+    const uint8_t* p = data.data() + offset;
+    if (im.data_type == JPEGLI_TYPE_UINT8) {
+      static const double mul8 = 1.0 / 255.0;
+      return p[0] * mul8;
+    } else if (im.data_type == JPEGLI_TYPE_UINT16) {
+      static const double mul16 = 1.0 / 65535.0;
+      return (is_little_endian ? LoadLE16(p) : LoadBE16(p)) * mul16;
+    } else if (im.data_type == JPEGLI_TYPE_FLOAT) {
+      return (is_little_endian ? LoadLEFloat(p) : LoadBEFloat(p));
+    }
+    return 0.0;
+  };
+  double diff2 = 0.0;
+  size_t num_samples = 0;
+  if (max_diff) *max_diff = 0.0;
+  // Folds one pair of samples into the running sum and maximum.
+  auto accumulate = [&](double sample_orig, double sample_output) {
+    double diff = sample_orig - sample_output;
+    if (max_diff) *max_diff = std::max(*max_diff, 255.0 * std::abs(diff));
+    diff2 += diff * diff;
+  };
+  if (!input.pixels.empty() && !output.pixels.empty()) {
+    num_samples = num_lines * stride;
+    for (size_t i = 0; i < num_samples; ++i) {
+      accumulate(get_sample(input, input.pixels, i),
+                 get_sample(output, output.pixels, i));
+    }
+  } else {
+    JXL_CHECK(!input.raw_data.empty());
+    JXL_CHECK(!output.raw_data.empty());
+    for (size_t c = 0; c < input.raw_data.size(); ++c) {
+      JXL_CHECK(c < output.raw_data.size());
+      num_samples += input.raw_data[c].size();
+      for (size_t i = 0; i < input.raw_data[c].size(); ++i) {
+        accumulate(get_sample(input, input.raw_data[c], i),
+                   get_sample(output, output.raw_data[c], i));
+      }
+    }
+  }
+  return std::sqrt(diff2 / num_samples) * 255.0;
+}
+
+// Whole-image variant: compares output.ysize lines starting at line 0.
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   double* max_diff) {
+  const size_t all_lines = output.ysize;
+  return DistanceRms(input, output, /*start_line=*/0, all_lines, max_diff);
+}
+
+// Compares num_lines lines of input and output starting at start_line, prints
+// the measured RMS and maximum difference next to their limits, and aborts via
+// JXL_CHECK if either limit is exceeded.
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       size_t start_line, size_t num_lines, double max_rms,
+                       double max_diff) {
+  double max_d = 0.0;
+  const double rms =
+      DistanceRms(input, output, start_line, num_lines, &max_d);
+  printf("rms: %f, max_rms: %f, max_d: %f,  max_diff: %f\n", rms, max_rms,
+         max_d, max_diff);
+  JXL_CHECK(rms <= max_rms);
+  JXL_CHECK(max_d <= max_diff);
+}
+
+// Checks that output matches input: dimensions, component count and color
+// space have to be equal. If input carries coefficients they must be
+// identical in output; otherwise the samples are compared over the whole image
+// against the max_rms / max_diff limits.
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       double max_rms, double max_diff) {
+  JXL_CHECK(output.xsize == input.xsize);
+  JXL_CHECK(output.ysize == input.ysize);
+  JXL_CHECK(output.components == input.components);
+  JXL_CHECK(output.color_space == input.color_space);
+  if (!input.coeffs.empty()) {
+    JXL_CHECK(input.coeffs.size() == input.components);
+    JXL_CHECK(output.coeffs.size() == input.components);
+    for (size_t c = 0; c < input.components; ++c) {
+      JXL_CHECK(output.coeffs[c].size() == input.coeffs[c].size());
+      // size() counts JCOEF elements while memcmp() counts bytes; scale the
+      // length so that every coefficient is compared.
+      JXL_CHECK(0 == memcmp(input.coeffs[c].data(), output.coeffs[c].data(),
+                            input.coeffs[c].size() * sizeof(JCOEF)));
+    }
+  } else {
+    VerifyOutputImage(input, output, 0, output.ysize, max_rms, max_diff);
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/test_utils.h b/third_party/jpeg-xl/lib/jpegli/test_utils.h
new file mode 100644
index 0000000000..f300b5de9e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/test_utils.h
@@ -0,0 +1,318 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TEST_UTILS_H_
+#define LIB_JPEGLI_TEST_UTILS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <setjmp.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+// We define this here as well to make sure that the *_api_test.cc tests only
+// use the public API and therefore we don't include any *_internal.h headers.
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;  // a / b rounded up (for a >= 0, b > 0, no overflow); the result is converted to T1
+}
+
+#define ERROR_HANDLER_SETUP(flavor) /* uses a struct variable named cinfo */ \
+  jpeg_error_mgr jerr;                                             \
+  jmp_buf env;                                                     \
+  cinfo.err = flavor##_std_error(&jerr);                           \
+  if (setjmp(env)) { /* reached through longjmp() in error_exit */  \
+    return false;                                                  \
+  }                                                                \
+  cinfo.client_data = reinterpret_cast<void*>(&env);               \
+  cinfo.err->error_exit = [](j_common_ptr cinfo) {                 \
+    (*cinfo->err->output_message)(cinfo);                          \
+    jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data); \
+    flavor##_destroy(cinfo);                                       \
+    longjmp(*env, 1);                                              \
+  };  // Declares jerr and env in the enclosing scope; on a library error the message is printed, the object destroyed, and the enclosing function returns false.
+
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
+
+static constexpr int kSpecialMarker0 = 0xe5;  // marker code handed to jpeg_save_markers() when jparams.add_marker is set
+static constexpr int kSpecialMarker1 = 0xe9;  // second marker code saved under the same condition
+static constexpr uint8_t kMarkerData[] = {0, 1, 255, 0, 17};  // NOTE(review): presumably the payload written into test markers - confirm in the encoder tests
+static constexpr uint8_t kMarkerSequence[] = {0xe6, 0xe8, 0xe7,
+                                              0xe6, 0xe7, 0xe8};  // NOTE(review): presumably the order in which extra markers are emitted - confirm
+static constexpr size_t kMarkerSequenceLen = ARRAY_SIZE(kMarkerSequence);  // number of entries in kMarkerSequence
+
+static constexpr jpeg_scan_info kScript1[] = {  // entries are checked as {comps_in_scan, component_index[], Ss, Se, Ah, Al}; one full-range scan per component
+    {1, {0}, 0, 63, 0, 0},
+    {1, {1}, 0, 63, 0, 0},
+    {1, {2}, 0, 63, 0, 0},
+};
+
+static constexpr jpeg_scan_info kScript2[] = {  // one scan with coefficient 0 of all three components, then coefficients 1..63 per component
+    {3, {0, 1, 2}, 0, 0, 0, 0},
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+static constexpr jpeg_scan_info kScript3[] = {  // coefficient 0 per component, then coefficients 1..63 per component
+    {1, {0}, 0, 0, 0, 0},  {1, {1}, 0, 0, 0, 0},  {1, {2}, 0, 0, 0, 0},
+    {1, {0}, 1, 63, 0, 0}, {1, {1}, 1, 63, 0, 0}, {1, {2}, 1, 63, 0, 0},
+};
+static constexpr jpeg_scan_info kScript4[] = {  // like kScript2, but coefficients 1..63 are sent twice: first with Al = 1, then with Ah = 1, Al = 0
+    {3, {0, 1, 2}, 0, 0, 0, 0}, {1, {0}, 1, 63, 0, 1}, {1, {1}, 1, 63, 0, 1},
+    {1, {2}, 1, 63, 0, 1},      {1, {0}, 1, 63, 1, 0}, {1, {1}, 1, 63, 1, 0},
+    {1, {2}, 1, 63, 1, 0},
+};
+
+struct ScanScript {  // a scan script: an array of scan descriptions plus its length
+  int num_scans;  // number of entries in scans
+  const jpeg_scan_info* scans;  // not owned; points at one of the kScriptN arrays
+};
+
+static constexpr ScanScript kTestScript[] = {  // indexed by CompressParams::progressive_mode - 3
+    {ARRAY_SIZE(kScript1), kScript1},
+    {ARRAY_SIZE(kScript2), kScript2},
+    {ARRAY_SIZE(kScript3), kScript3},
+    {ARRAY_SIZE(kScript4), kScript4},
+};
+static constexpr int kNumTestScripts = ARRAY_SIZE(kTestScript);  // number of scripts in kTestScript
+
+static constexpr int kLastScan = 0xffff;  // NOTE(review): presumably a "no limit" value for ScanDecompressParams::max_scan_number - confirm
+
+static uint32_t kTestColorMap[] = {  // NOTE(review): presumably one 0xRRGGBB color per entry for the external colormap tests - confirm at the use site
+    0x000000, 0xff0000, 0x00ff00, 0x0000ff, 0xffff00, 0x00ffff,
+    0xff00ff, 0xffffff, 0x6251fc, 0x45d9c7, 0xa7f059, 0xd9a945,
+    0xfa4e44, 0xceaffc, 0xbad7db, 0xc1f0b1, 0xdbca9a, 0xfacac5,
+    0xf201ff, 0x0063db, 0x00f01c, 0xdbb204, 0xf12f0c, 0x7ba1dc};
+static constexpr int kTestColorMapNumColors = ARRAY_SIZE(kTestColorMap);  // number of entries in kTestColorMap
+
+std::string IOMethodName(JpegliDataType data_type, JpegliEndianness endianness);
+
+std::string ColorSpaceName(J_COLOR_SPACE colorspace);
+
+enum JpegIOMode {  // which representation of the image a test reads or writes
+  PIXELS,  // interleaved samples (TestImage::pixels)
+  RAW_DATA,  // per-component planes (TestImage::raw_data)
+  COEFFICIENTS,  // coefficient blocks (TestImage::coeffs)
+};
+
+struct CustomQuantTable {  // description of one quantization table used by a test
+  int slot_idx = 0;  // index into cinfo->quant_tbl_ptrs
+  uint16_t table_type = 0;  // NOTE(review): meaning is defined by Generate(), which is not shown here
+  int scale_factor = 100;  // NOTE(review): presumably a percentage applied to basic_table - confirm in Generate()
+  bool add_raw = false;
+  bool force_baseline = true;
+  std::vector<unsigned int> basic_table;
+  std::vector<unsigned int> quantval;  // values VerifyHeader() expects in the decoded table, DCTSIZE2 entries
+  void Generate();  // defined in test_utils.cc
+};
+
+struct TestImage {  // an image in any of the three representations the tests use (pixels, raw planes, coefficients)
+  size_t xsize = 2268;  // width in pixels
+  size_t ysize = 1512;  // height in pixels
+  J_COLOR_SPACE color_space = JCS_RGB;
+  size_t components = 3;  // samples per pixel
+  JpegliDataType data_type = JPEGLI_TYPE_UINT8;  // sample type of pixels
+  JpegliEndianness endianness = JPEGLI_NATIVE_ENDIAN;  // byte order of multi-byte samples
+  std::vector<uint8_t> pixels;  // interleaved samples, row by row
+  std::vector<std::vector<uint8_t>> raw_data;  // one plane per component
+  std::vector<std::vector<JCOEF>> coeffs;  // one coefficient array per component
+  void AllocatePixels() {  // sizes pixels for the current dimensions and sample type
+    pixels.resize(ysize * xsize * components *
+                  jpegli_bytes_per_sample(data_type));
+  }
+  void Clear() {  // drops all image data but keeps the dimensions and type settings
+    pixels.clear();
+    raw_data.clear();
+    coeffs.clear();
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const TestImage& input);
+
+struct CompressParams {  // encoder settings for a test; also the expectations the decode-side verifiers check against
+  int quality = 90;
+  bool set_jpeg_colorspace = false;
+  J_COLOR_SPACE jpeg_color_space = JCS_UNKNOWN;
+  std::vector<int> quant_indexes;
+  std::vector<CustomQuantTable> quant_tables;  // compared against the decoded tables in VerifyHeader()
+  std::vector<int> h_sampling;  // per-component horizontal sampling factors; empty means 1 everywhere
+  std::vector<int> v_sampling;  // per-component vertical sampling factors; empty means 1 everywhere
+  std::vector<int> comp_ids;
+  int override_JFIF = -1;
+  int override_Adobe = -1;
+  bool add_marker = false;  // when set, decoders save kSpecialMarker0 / kSpecialMarker1
+  bool simple_progression = false;
+  // -1 is library default
+  // 0, 1, 2 is set through jpegli_set_progressive_level()
+  // 2 + N is kScriptN
+  int progressive_mode = -1;
+  unsigned int restart_interval = 0;  // if > 0, the expected cinfo->restart_interval
+  int restart_in_rows = 0;  // otherwise, if > 0, expected interval is restart_in_rows * MCUs_per_row
+  int smoothing_factor = 0;
+  int optimize_coding = -1;
+  bool use_flat_dc_luma_code = false;  // when set, VerifyScanHeader() expects huffval[i] == i in DC table 0
+  bool omit_standard_tables = false;
+  bool xyb_mode = false;
+  bool libjpeg_mode = false;
+  bool use_adaptive_quantization = true;
+  std::vector<uint8_t> icc;  // ICC profile; if non-empty the decoded profile is compared against it
+
+  int h_samp(int c) const { return h_sampling.empty() ? 1 : h_sampling[c]; }  // horizontal factor of component c
+  int v_samp(int c) const { return v_sampling.empty() ? 1 : v_sampling[c]; }  // vertical factor of component c
+  int max_h_sample() const {  // largest horizontal factor, 1 if none are set
+    auto it = std::max_element(h_sampling.begin(), h_sampling.end());
+    return it == h_sampling.end() ? 1 : *it;
+  }
+  int max_v_sample() const {  // largest vertical factor, 1 if none are set
+    auto it = std::max_element(v_sampling.begin(), v_sampling.end());
+    return it == v_sampling.end() ? 1 : *it;
+  }
+  int comp_width(const TestImage& input, int c) const {  // width of component c, rounded up to whole DCT blocks
+    return DivCeil(input.xsize * h_samp(c), max_h_sample() * DCTSIZE) * DCTSIZE;
+  }
+  int comp_height(const TestImage& input, int c) const {  // height of component c, rounded up to whole DCT blocks
+    return DivCeil(input.ysize * v_samp(c), max_v_sample() * DCTSIZE) * DCTSIZE;
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const CompressParams& jparams);
+
+void VerifyHeader(const CompressParams& jparams, j_decompress_ptr cinfo);
+void VerifyScanHeader(const CompressParams& jparams, j_decompress_ptr cinfo);
+
+enum ColorQuantMode {  // how the colormap for quantized output is obtained; NOTE(review): per-value meaning inferred from the names - confirm in SetScanDecompressParams()
+  CQUANT_1PASS,
+  CQUANT_2PASS,
+  CQUANT_EXTERNAL,
+  CQUANT_REUSE,
+};
+
+struct ScanDecompressParams {  // per-scan decoder settings; see DecompressParams::scan_params
+  int max_scan_number;  // NOTE(review): presumably the last scan these settings apply to - confirm in SetScanDecompressParams()
+  J_DITHER_MODE dither_mode;
+  ColorQuantMode color_quant_mode;
+};
+
+struct DecompressParams {  // decoder settings for a test
+  float size_factor = 1.0f;
+  size_t chunk_size = 65536;
+  size_t max_output_lines = 16;
+  JpegIOMode output_mode = PIXELS;  // COEFFICIENTS makes the decode helpers copy coefficients
+  JpegliDataType data_type = JPEGLI_TYPE_UINT8;
+  JpegliEndianness endianness = JPEGLI_NATIVE_ENDIAN;
+  bool set_out_color_space = false;
+  J_COLOR_SPACE out_color_space = JCS_UNKNOWN;
+  bool crop_output = false;  // ReadOutputPass() then reads only a window of one third of the image
+  bool do_block_smoothing = false;
+  bool do_fancy_upsampling = true;
+  bool skip_scans = false;  // DecodeAllScansWithLibjpeg() then skips the even-numbered scans
+  int scale_num = 1;
+  int scale_denom = 1;
+  bool quantize_colors = false;
+  int desired_number_of_colors = 256;
+  std::vector<ScanDecompressParams> scan_params;  // per-scan settings, applied by SetScanDecompressParams()
+};
+
+void SetDecompressParams(const DecompressParams& dparams,
+                         j_decompress_ptr cinfo, bool is_jpegli);
+
+void SetScanDecompressParams(const DecompressParams& dparams,
+                             j_decompress_ptr cinfo, int scan_number,
+                             bool is_jpegli);
+
+void CopyCoefficients(j_decompress_ptr cinfo, jvirt_barray_ptr* coef_arrays,
+                      TestImage* output);
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors);
+
+std::string GetTestDataPath(const std::string& filename);
+std::vector<uint8_t> ReadTestData(const std::string& filename);
+
+class PNMParser {  // parser for the header of an in-memory PNM file
+ public:
+  explicit PNMParser(const uint8_t* data, const size_t len)
+      : pos_(data), end_(data + len) {}  // data is not copied and has to outlive the parser
+
+  // Sets "pos" to the first non-header byte/pixel on success.
+  bool ParseHeader(const uint8_t** pos, size_t* xsize, size_t* ysize,
+                   size_t* num_channels, size_t* bitdepth);
+
+ private:
+  static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; }  // CR or LF
+  static bool IsWhitespace(const uint8_t c) {  // line break, tab or space
+    return IsLineBreak(c) || c == '\t' || c == ' ';
+  }
+
+  bool ParseUnsigned(size_t* number);  // defined in test_utils.cc
+
+  bool SkipWhitespace();  // defined in test_utils.cc
+
+  const uint8_t* pos_;  // current read position
+  const uint8_t* const end_;  // one past the last input byte
+};
+
+bool ReadPNM(const std::vector<uint8_t>& data, size_t* xsize, size_t* ysize,
+             size_t* num_channels, size_t* bitdepth,
+             std::vector<uint8_t>* pixels);
+
+void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels);
+
+void ConvertToGrayscale(TestImage* img);
+
+void GeneratePixels(TestImage* img);
+
+void GenerateRawData(const CompressParams& jparams, TestImage* img);
+
+void GenerateCoeffs(const CompressParams& jparams, TestImage* img);
+
+void EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      j_compress_ptr cinfo);
+
+bool EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      std::vector<uint8_t>* compressed);
+
+// Verifies that an image encoded with libjpegli can be decoded with libjpeg,
+// and checks that the jpeg coding metadata matches jparams.
+void DecodeAllScansWithLibjpeg(const CompressParams& jparams,
+                               const DecompressParams& dparams,
+                               const std::vector<uint8_t>& compressed,
+                               std::vector<TestImage>* output_progression);
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams, j_decompress_ptr cinfo,
+                       TestImage* output);
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams,
+                       const std::vector<uint8_t>& compressed,
+                       TestImage* output);
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   size_t start_line, size_t num_lines,
+                   double* max_diff = nullptr);
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   double* max_diff = nullptr);
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       size_t start_line, size_t num_lines, double max_rms,
+                       double max_diff = 255.0);
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       double max_rms, double max_diff = 255.0);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_TEST_UTILS_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/testing.h b/third_party/jpeg-xl/lib/jpegli/testing.h
new file mode 100644
index 0000000000..873a0171e7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/testing.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TESTING_H_
+#define LIB_JPEGLI_TESTING_H_
+
+// GTest/GMock specific macros / wrappers.
+
+// gmock unconditionally redefines those macros (to wrong values).
+// Let's include it only here and mitigate the problem.
+#pragma push_macro("PRIdS")
+#pragma push_macro("PRIuS")
+#include "gmock/gmock.h"
+#pragma pop_macro("PRIuS")
+#pragma pop_macro("PRIdS")
+
+#include "gtest/gtest.h"
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JPEGLI_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JPEGLI_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Ensures that we don't make our test bounds too lax, effectively disabling the
+// tests.
+MATCHER_P(IsSlightlyBelow, max, "") {
+  // Accept values between 75% and 100% of the bound, both ends included.
+  const auto lowest_accepted = max * 0.75;
+  const auto highest_accepted = max * 1.0;
+  return lowest_accepted <= arg && arg <= highest_accepted;
+}
+
+#endif  // LIB_JPEGLI_TESTING_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc
new file mode 100644
index 0000000000..1d99ce37fa
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc
@@ -0,0 +1,133 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+// Re-encodes jpeg_input with jpegli without going through pixels: the
+// coefficients are read with jpegli_read_coefficients() and written with
+// jpegli_write_coefficients(), using the progressive level and
+// optimize_coding setting from jparams. The result is stored in *jpeg_output.
+void TranscodeWithJpegli(const std::vector<uint8_t>& jpeg_input,
+                         const CompressParams& jparams,
+                         std::vector<uint8_t>* jpeg_output) {
+  jpeg_decompress_struct dinfo = {};
+  jpeg_compress_struct cinfo = {};
+  uint8_t* out_buffer = nullptr;
+  unsigned long out_size;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    // The decoder shares the error manager and jump buffer set up for cinfo.
+    dinfo.err = cinfo.err;
+    dinfo.client_data = cinfo.client_data;
+    // Decoder side: parse the input down to its coefficient arrays.
+    jpegli_create_decompress(&dinfo);
+    jpegli_mem_src(&dinfo, jpeg_input.data(), jpeg_input.size());
+    EXPECT_EQ(JPEG_REACHED_SOS,
+              jpegli_read_header(&dinfo, /*require_image=*/TRUE));
+    jvirt_barray_ptr* source_coeffs = jpegli_read_coefficients(&dinfo);
+    JXL_CHECK(source_coeffs != nullptr);
+    // Encoder side: same image parameters, new entropy coding settings.
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &out_buffer, &out_size);
+    jpegli_copy_critical_parameters(&dinfo, &cinfo);
+    jpegli_set_progressive_level(&cinfo, jparams.progressive_mode);
+    cinfo.optimize_coding = jparams.optimize_coding;
+    jpegli_write_coefficients(&cinfo, source_coeffs);
+    jpegli_finish_compress(&cinfo);
+    jpegli_finish_decompress(&dinfo);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&dinfo);
+  jpegli_destroy_compress(&cinfo);
+  if (out_buffer == nullptr) return;
+  // The memory destination buffer is released with free() after copying.
+  jpeg_output->assign(out_buffer, out_buffer + out_size);
+  free(out_buffer);
+}
+
+struct TestConfig {  // one parameterized test case
+  TestImage input;  // dimensions of the generated input image
+  CompressParams jparams;  // encoder settings, including the sampling factors
+};
+
+class TranscodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};  // fixture; cases come from GenerateTests()
+
+TEST_P(TranscodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  CompressParams& jparams = config.jparams;
+  GeneratePixels(&config.input);
+  const DecompressParams dparams;
+
+  // Baseline: a sequential jpeg without optimized coding, and its decoded
+  // pixels as the reference for all later comparisons.
+  jparams.progressive_mode = 0;
+  jparams.optimize_coding = 0;
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithJpegli(config.input, jparams, &compressed));
+  TestImage output0;
+  DecodeWithLibjpeg(jparams, dparams, compressed, &output0);
+
+  // First transcode to a sequential jpeg with optimized coding, then
+  // transcode that result further to a progressive jpeg.
+  for (int progressive_level : {0, 2}) {
+    jparams.progressive_mode = progressive_level;
+    jparams.optimize_coding = 1;
+    std::vector<uint8_t> transcoded;
+    TranscodeWithJpegli(compressed, jparams, &transcoded);
+
+    // Each step is expected to shrink the file by at least 2%.
+    EXPECT_LT(transcoded.size(), compressed.size() * 0.98f);
+
+    // Transcoding must be lossless: the decoded pixels may not change.
+    TestImage output1;
+    DecodeWithLibjpeg(jparams, dparams, transcoded, &output1);
+    ASSERT_EQ(output0.pixels.size(), output1.pixels.size());
+    EXPECT_EQ(0, memcmp(output0.pixels.data(), output1.pixels.data(),
+                        output0.pixels.size()));
+    // The next step starts from this step's output.
+    compressed = std::move(transcoded);
+  }
+}
+
+// Builds the test matrix: image sizes around 1024x768 (exact, one pixel more,
+// one block more, one block and one pixel more) combined with 1x and 2x
+// sampling of the first component in each direction.
+std::vector<TestConfig> GenerateTests() {
+  constexpr size_t kBaseXsize = 1024;
+  constexpr size_t kBaseYsize = 768;
+  std::vector<TestConfig> tests;
+  for (int extra_x : {0, 1, 8, 9}) {
+    for (int extra_y : {0, 1, 8, 9}) {
+      for (int h_factor : {1, 2}) {
+        for (int v_factor : {1, 2}) {
+          TestConfig test;
+          test.input.xsize = kBaseXsize + extra_x;
+          test.input.ysize = kBaseYsize + extra_y;
+          test.jparams.h_sampling = {h_factor, 1, 1};
+          test.jparams.v_sampling = {v_factor, 1, 1};
+          tests.push_back(test);
+        }
+      }
+    }
+  }
+  return tests;
+}
+
+// Streams the input image description followed by the compression parameters.
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  return os << c.input << c.jparams;
+}
+
+// Names a parameterized test after the streamed form of its TestConfig.
+std::string TestDescription(
+    const testing::TestParamInfo<TranscodeAPITestParam::ParamType>& info) {
+  std::ostringstream description;
+  description << info.param;
+  return description.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(TranscodeAPITest, TranscodeAPITestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/transpose-inl.h b/third_party/jpeg-xl/lib/jpegli/transpose-inl.h
new file mode 100644
index 0000000000..9fdd222f4e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/transpose-inl.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
+#undef LIB_JPEGLI_TRANSPOSE_INL_H_
+#else
+#define LIB_JPEGLI_TRANSPOSE_INL_H_
+#endif
+
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+#if HWY_CAP_GE256
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  const HWY_CAPPED(float, 8) d;  // At most 8 lanes; each Load reads one row.
+  auto i0 = Load(d, from);
+  auto i1 = Load(d, from + 1 * 8);
+  auto i2 = Load(d, from + 2 * 8);
+  auto i3 = Load(d, from + 3 * 8);
+  auto i4 = Load(d, from + 4 * 8);
+  auto i5 = Load(d, from + 5 * 8);
+  auto i6 = Load(d, from + 6 * 8);
+  auto i7 = Load(d, from + 7 * 8);
+  // Pass 1: InterleaveLower/Upper on rows two apart (0/2, 1/3, 4/6, 5/7).
+  const auto q0 = InterleaveLower(d, i0, i2);
+  const auto q1 = InterleaveLower(d, i1, i3);
+  const auto q2 = InterleaveUpper(d, i0, i2);
+  const auto q3 = InterleaveUpper(d, i1, i3);
+  const auto q4 = InterleaveLower(d, i4, i6);
+  const auto q5 = InterleaveLower(d, i5, i7);
+  const auto q6 = InterleaveUpper(d, i4, i6);
+  const auto q7 = InterleaveUpper(d, i5, i7);
+  // Pass 2: interleave neighbouring pass-1 results (q0/q1, q2/q3, ...).
+  const auto r0 = InterleaveLower(d, q0, q1);
+  const auto r1 = InterleaveUpper(d, q0, q1);
+  const auto r2 = InterleaveLower(d, q2, q3);
+  const auto r3 = InterleaveUpper(d, q2, q3);
+  const auto r4 = InterleaveLower(d, q4, q5);
+  const auto r5 = InterleaveUpper(d, q4, q5);
+  const auto r6 = InterleaveLower(d, q6, q7);
+  const auto r7 = InterleaveUpper(d, q6, q7);
+  // Pass 3: combine lower halves of (r4..r7, r0..r3), then the upper halves.
+  i0 = ConcatLowerLower(d, r4, r0);
+  i1 = ConcatLowerLower(d, r5, r1);
+  i2 = ConcatLowerLower(d, r6, r2);
+  i3 = ConcatLowerLower(d, r7, r3);
+  i4 = ConcatUpperUpper(d, r4, r0);
+  i5 = ConcatUpperUpper(d, r5, r1);
+  i6 = ConcatUpperUpper(d, r6, r2);
+  i7 = ConcatUpperUpper(d, r7, r3);
+  // Store the rows; intended result: to[8 * n + m] = from[8 * m + n].
+  Store(i0, d, to);
+  Store(i1, d, to + 1 * 8);
+  Store(i2, d, to + 2 * 8);
+  Store(i3, d, to + 3 * 8);
+  Store(i4, d, to + 4 * 8);
+  Store(i5, d, to + 5 * 8);
+  Store(i6, d, to + 6 * 8);
+  Store(i7, d, to + 7 * 8);
+}
+#elif HWY_TARGET != HWY_SCALAR
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  const HWY_CAPPED(float, 4) d;  // At most 4 lanes: handle 4x4 tiles.
+  for (size_t n = 0; n < 8; n += 4) {    // n: first source row of the tile.
+    for (size_t m = 0; m < 8; m += 4) {  // m: first source column.
+      auto p0 = Load(d, from + n * 8 + m);
+      auto p1 = Load(d, from + (n + 1) * 8 + m);
+      auto p2 = Load(d, from + (n + 2) * 8 + m);
+      auto p3 = Load(d, from + (n + 3) * 8 + m);
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+      // Second interleave pass, over neighbouring q vectors.
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      Store(r0, d, to + m * 8 + n);  // Tile (n, m) lands at (m, n) in `to`.
+      Store(r1, d, to + (1 + m) * 8 + n);
+      Store(r2, d, to + (2 + m) * 8 + n);
+      Store(r3, d, to + (3 + m) * 8 + n);
+    }
+  }
+}
+#else
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  for (size_t n = 0; n < 8; ++n) {  // Scalar-target fallback, no vector ops.
+    for (size_t m = 0; m < 8; ++m) {
+      to[8 * n + m] = from[8 * m + n];  // to(n, m) = from(m, n), 8-wide rows.
+    }
+  }
+}
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_TRANSPOSE_INL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/upsample.cc b/third_party/jpeg-xl/lib/jpegli/upsample.cc
new file mode 100644
index 0000000000..5559aa78a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/upsample.cc
@@ -0,0 +1,137 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/upsample.h"
+
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+#if HWY_CAP_GE512
+using hwy::HWY_NAMESPACE::Half;
+using hwy::HWY_NAMESPACE::Vec;
+template <size_t i, class DF, class V>  // i = 0/1: lower half, 2/3: upper.
+HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {  // df is unused.
+  using HF = Half<DF>;   // Half-width tag.
+  using HHF = Half<HF>;  // Quarter-width tag; odd i selects the upper one.
+  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
+  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
+}
+
+template <class DF, class V>  // Called with four Quarter() results.
+HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
+  using HF = Half<DF>;  // Tag for the (v1, v0) and (v3, v2) pairs.
+  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
+}
+
+#endif
+
+// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order: two vectors'
+// worth of 4-byte T starting at mem. Mem must be aligned.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);  // v1 goes directly after v0.
+#elif !HWY_CAP_GE256
+  Store(InterleaveLower(df, v0, v1), df, mem);
+  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));  // Second vector.
+#else
+  if (!HWY_CAP_GE512 || Lanes(df) == 8) {  // No 512-bit support, or 8 lanes.
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(ConcatLowerLower(df, t1, t0), df, mem);  // Lower halves of t0, t1.
+    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));  // Upper halves.
+  } else {  // Remaining case: output is assembled from quarters of t0 and t1.
+#if HWY_CAP_GE512
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
+                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
+          df, mem);
+    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
+                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
+          df, mem + Lanes(df));
+#endif
+  }
+#endif
+}
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out) {
+  HWY_FULL(float) df;  // row is read as input, then overwritten with output.
+  auto threefour = Set(df, 0.75f);
+  auto onefour = Set(df, 0.25f);
+  const size_t len_in = (len_out + 1) >> 1;  // Input count: ceil(len_out / 2).
+  memcpy(scratch_space, row, len_in * sizeof(row[0]));  // Save the inputs.
+  scratch_space[-1] = scratch_space[0];  // Left edge: needs 1 float of room.
+  scratch_space[len_in] = scratch_space[len_in - 1];  // Right edge, likewise.
+  for (size_t x = 0; x < len_in; x += Lanes(df)) {  // Tail may pass len_in.
+    auto current = Mul(Load(df, scratch_space + x), threefour);
+    auto prev = LoadU(df, scratch_space + x - 1);
+    auto next = LoadU(df, scratch_space + x + 1);
+    auto left = MulAdd(onefour, prev, current);    // 0.25 * prev + 0.75 * cur.
+    auto right = MulAdd(onefour, next, current);   // 0.25 * next + 0.75 * cur.
+    StoreInterleaved(df, left, right, row + x * 2);  // out[2x], out[2x + 1].
+  }
+}
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len) {
+  HWY_FULL(float) df;  // Two output rows are made from three input rows.
+  auto threefour = Set(df, 0.75f);
+  auto onefour = Set(df, 0.25f);
+  for (size_t x = 0; x < len; x += Lanes(df)) {  // Tail may pass len.
+    auto it = Load(df, row_top + x);
+    auto im = Load(df, row_mid + x);
+    auto ib = Load(df, row_bot + x);
+    auto im_scaled = Mul(im, threefour);  // Shared 0.75 * mid term.
+    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);  // + 0.25 * top.
+    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);  // + 0.25 * bot.
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(Upsample2Horizontal);  // Presumably feeds HWY_DYNAMIC_DISPATCH.
+HWY_EXPORT(Upsample2Vertical);    // Presumably feeds HWY_DYNAMIC_DISPATCH.
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,  // Declared in upsample.h.
+                         float* JXL_RESTRICT scratch_space, size_t len_out) {
+  return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out);
+}
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,  // See upsample.h.
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len) {
+  return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
+                                                 row_out0, row_out1, len);
+}
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/upsample.h b/third_party/jpeg-xl/lib/jpegli/upsample.h
new file mode 100644
index 0000000000..1a057208dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/upsample.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_UPSAMPLE_H_
+#define LIB_JPEGLI_UPSAMPLE_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,  // Upsampled in place.
+                         float* JXL_RESTRICT scratch_space, size_t len_out);
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,  // 0.75*mid + 0.25*top.
+                       float* JXL_RESTRICT row_out1, size_t len);  // Uses bot.
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_UPSAMPLE_H_
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:32:43 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:32:43 +0000
commit	6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree	a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/jpeg-xl/lib/jpegli
parent	Initial commit. (diff)
download	thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip