path: root/media/libwebp/sharpyuv
Diffstat (limited to 'media/libwebp/sharpyuv')
-rw-r--r--  media/libwebp/sharpyuv/moz.build         |  49
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv.c        | 527
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv.h        | 103
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_cpu.h    |  22
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_csp.c    | 110
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_csp.h    |  60
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_dsp.c    | 104
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_dsp.h    |  28
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_gamma.c  | 113
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_gamma.h  |  35
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_neon.c   | 181
-rw-r--r--  media/libwebp/sharpyuv/sharpyuv_sse2.c   | 201
12 files changed, 1533 insertions(+), 0 deletions(-)
diff --git a/media/libwebp/sharpyuv/moz.build b/media/libwebp/sharpyuv/moz.build
new file mode 100644
index 0000000000..3b498f0bf1
--- /dev/null
+++ b/media/libwebp/sharpyuv/moz.build
@@ -0,0 +1,49 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+SOURCES += [
+ 'sharpyuv.c',
+ 'sharpyuv_csp.c',
+ 'sharpyuv_dsp.c',
+ 'sharpyuv_gamma.c',
+]
+
+LOCAL_INCLUDES += [
+ '/media/libwebp',
+]
+
+if CONFIG['TARGET_CPU'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
+ SOURCES += [
+ 'sharpyuv_neon.c',
+ ]
+    DEFINES['WEBP_HAVE_NEON'] = 1
+ for f in SOURCES:
+ if f.endswith('neon.c'):
+ SOURCES[f].flags += CONFIG['NEON_FLAGS']
+elif CONFIG['TARGET_CPU'] == 'aarch64':
+ SOURCES += [
+ 'sharpyuv_neon.c',
+ ]
+    DEFINES['WEBP_HAVE_NEON'] = 1
+elif CONFIG['INTEL_ARCHITECTURE']:
+ SOURCES += [
+ 'sharpyuv_sse2.c',
+ ]
+    DEFINES['WEBP_HAVE_SSE2'] = 1
+ for f in SOURCES:
+ if f.endswith('sse2.c'):
+ SOURCES[f].flags += CONFIG['SSE2_FLAGS']
+
+if CONFIG['CC_TYPE'] in ('clang', 'clang-cl'):
+ CFLAGS += ['-Wno-unreachable-code']
+
+# Add libFuzzer configuration directives
+include('/tools/fuzzing/libfuzzer-config.mozbuild')
+
+FINAL_LIBRARY = 'gkmedias'
+
+# We allow warnings for third-party code that can be updated from upstream.
+AllowCompilerWarnings()
diff --git a/media/libwebp/sharpyuv/sharpyuv.c b/media/libwebp/sharpyuv/sharpyuv.c
new file mode 100644
index 0000000000..a074564888
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv.c
@@ -0,0 +1,527 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/webp/types.h"
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "sharpyuv/sharpyuv_dsp.h"
+#include "sharpyuv/sharpyuv_gamma.h"
+
+//------------------------------------------------------------------------------
+
+int SharpYuvGetVersion(void) {
+ return SHARPYUV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// Sharp RGB->YUV conversion
+
+static const int kNumIterations = 4;
+
+#define YUV_FIX 16 // fixed-point precision for RGB->YUV
+static const int kYuvHalf = 1 << (YUV_FIX - 1);
+
+// Max bit depth so that intermediate calculations fit in 16 bits.
+static const int kMaxBitDepth = 14;
+
+// Returns the precision shift to use based on the input rgb_bit_depth.
+static int GetPrecisionShift(int rgb_bit_depth) {
+ // Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
+ // bits if needed.
+ return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2
+ : (kMaxBitDepth - rgb_bit_depth);
+}
+
+typedef int16_t fixed_t; // signed type with extra precision for UV
+typedef uint16_t fixed_y_t; // unsigned type with extra precision for W
+
+//------------------------------------------------------------------------------
+
+static uint8_t clip_8b(fixed_t v) {
+ return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static uint16_t clip(fixed_t v, int max) {
+ return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static fixed_y_t clip_bit_depth(int y, int bit_depth) {
+ const int max = (1 << bit_depth) - 1;
+ return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int64_t r, int64_t g, int64_t b) {
+ const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf;
+ return (int)(luma >> YUV_FIX);
+}
+
+static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
+ int rgb_bit_depth) {
+ const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+ const uint32_t A = SharpYuvGammaToLinear(a, bit_depth);
+ const uint32_t B = SharpYuvGammaToLinear(b, bit_depth);
+ const uint32_t C = SharpYuvGammaToLinear(c, bit_depth);
+ const uint32_t D = SharpYuvGammaToLinear(d, bit_depth);
+ return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth);
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
+ int rgb_bit_depth) {
+ const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+ int i;
+ for (i = 0; i < w; ++i) {
+ const uint32_t R = SharpYuvGammaToLinear(src[0 * w + i], bit_depth);
+ const uint32_t G = SharpYuvGammaToLinear(src[1 * w + i], bit_depth);
+ const uint32_t B = SharpYuvGammaToLinear(src[2 * w + i], bit_depth);
+ const uint32_t Y = RGBToGray(R, G, B);
+ dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth);
+ }
+}
+
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+ fixed_t* dst, int uv_w, int rgb_bit_depth) {
+ int i;
+ for (i = 0; i < uv_w; ++i) {
+ const int r =
+ ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0],
+ src2[0 * uv_w + 1], rgb_bit_depth);
+ const int g =
+ ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0],
+ src2[2 * uv_w + 1], rgb_bit_depth);
+ const int b =
+ ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0],
+ src2[4 * uv_w + 1], rgb_bit_depth);
+ const int W = RGBToGray(r, g, b);
+ dst[0 * uv_w] = (fixed_t)(r - W);
+ dst[1 * uv_w] = (fixed_t)(g - W);
+ dst[2 * uv_w] = (fixed_t)(b - W);
+ dst += 1;
+ src1 += 2;
+ src2 += 2;
+ }
+}
+
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+ int i;
+ assert(w > 0);
+ for (i = 0; i < w; ++i) {
+ y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
+ const int v0 = (A * 3 + B + 2) >> 2;
+ return clip_bit_depth(v0 + W0, bit_depth);
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Shift(int v, int shift) {
+ return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+ const uint8_t* const g_ptr,
+ const uint8_t* const b_ptr,
+ int rgb_step,
+ int rgb_bit_depth,
+ int pic_width,
+ fixed_y_t* const dst) {
+ // Convert the rgb_step from a number of bytes to a number of uint8_t or
+  // uint16_t values depending on the bit depth.
+ const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
+ int i;
+ const int w = (pic_width + 1) & ~1;
+ for (i = 0; i < pic_width; ++i) {
+ const int off = i * step;
+ const int shift = GetPrecisionShift(rgb_bit_depth);
+ if (rgb_bit_depth == 8) {
+ dst[i + 0 * w] = Shift(r_ptr[off], shift);
+ dst[i + 1 * w] = Shift(g_ptr[off], shift);
+ dst[i + 2 * w] = Shift(b_ptr[off], shift);
+ } else {
+ dst[i + 0 * w] = Shift(((uint16_t*)r_ptr)[off], shift);
+ dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift);
+ dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift);
+ }
+ }
+ if (pic_width & 1) { // replicate rightmost pixel
+ dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+ dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+ dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
+ }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,
+ const fixed_t* prev_uv,
+ const fixed_t* cur_uv,
+ const fixed_t* next_uv,
+ int w,
+ fixed_y_t* out1,
+ fixed_y_t* out2,
+ int rgb_bit_depth) {
+ const int uv_w = w >> 1;
+ const int len = (w - 1) >> 1; // length to filter
+ int k = 3;
+ const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
+  while (k-- > 0) {   // process each of the R/G/B segments in turn
+ // special boundary case for i==0
+ out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth);
+ out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth);
+
+ SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1,
+ bit_depth);
+ SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1,
+ bit_depth);
+
+ // special boundary case for i == w - 1 when w is even
+ if (!(w & 1)) {
+ out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+ best_y[w - 1 + 0], bit_depth);
+ out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+ best_y[w - 1 + w], bit_depth);
+ }
+ out1 += w;
+ out2 += w;
+ prev_uv += uv_w;
+ cur_uv += uv_w;
+ next_uv += uv_w;
+ }
+}
+
+static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
+ const int coeffs[4], int sfix) {
+ const int srounder = 1 << (YUV_FIX + sfix - 1);
+ const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b +
+ coeffs[3] + srounder;
+ return (luma >> (YUV_FIX + sfix));
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
+ uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
+ int u_stride, uint8_t* v_ptr, int v_stride,
+ int rgb_bit_depth,
+ int yuv_bit_depth, int width, int height,
+ const SharpYuvConversionMatrix* yuv_matrix) {
+ int i, j;
+ const fixed_t* const best_uv_base = best_uv;
+ const int w = (width + 1) & ~1;
+ const int h = (height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ const int sfix = GetPrecisionShift(rgb_bit_depth);
+ const int yuv_max = (1 << yuv_bit_depth) - 1;
+
+ for (best_uv = best_uv_base, j = 0; j < height; ++j) {
+ for (i = 0; i < width; ++i) {
+ const int off = (i >> 1);
+ const int W = best_y[i];
+ const int r = best_uv[off + 0 * uv_w] + W;
+ const int g = best_uv[off + 1 * uv_w] + W;
+ const int b = best_uv[off + 2 * uv_w] + W;
+ const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
+ if (yuv_bit_depth <= 8) {
+ y_ptr[i] = clip_8b(y);
+ } else {
+ ((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
+ }
+ }
+ best_y += w;
+ best_uv += (j & 1) * 3 * uv_w;
+ y_ptr += y_stride;
+ }
+ for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
+ for (i = 0; i < uv_w; ++i) {
+ const int off = i;
+ // Note r, g and b values here are off by W, but a constant offset on all
+ // 3 components doesn't change the value of u and v with a YCbCr matrix.
+ const int r = best_uv[off + 0 * uv_w];
+ const int g = best_uv[off + 1 * uv_w];
+ const int b = best_uv[off + 2 * uv_w];
+ const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
+ const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
+ if (yuv_bit_depth <= 8) {
+ u_ptr[i] = clip_8b(u);
+ v_ptr[i] = clip_8b(v);
+ } else {
+ ((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
+ ((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
+ }
+ }
+ best_uv += 3 * uv_w;
+ u_ptr += u_stride;
+ v_ptr += v_stride;
+ }
+ return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+static void* SafeMalloc(uint64_t nmemb, size_t size) {
+ const uint64_t total_size = nmemb * (uint64_t)size;
+ if (total_size != (size_t)total_size) return NULL;
+ return malloc((size_t)total_size);
+}
+
+#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((W) * (H), sizeof(T)))
+
+static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
+ const uint8_t* b_ptr, int rgb_step, int rgb_stride,
+ int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
+ uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
+ int v_stride, int yuv_bit_depth, int width,
+ int height,
+ const SharpYuvConversionMatrix* yuv_matrix) {
+ // we expand the right/bottom border if needed
+ const int w = (width + 1) & ~1;
+ const int h = (height + 1) & ~1;
+ const int uv_w = w >> 1;
+ const int uv_h = h >> 1;
+ uint64_t prev_diff_y_sum = ~0;
+ int j, iter;
+
+ // TODO(skal): allocate one big memory chunk. But for now, it's easier
+ // for valgrind debugging to have several chunks.
+ fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
+ fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
+ fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+ fixed_y_t* best_y = best_y_base;
+ fixed_y_t* target_y = target_y_base;
+ fixed_t* best_uv = best_uv_base;
+ fixed_t* target_uv = target_uv_base;
+ const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
+ int ok;
+ assert(w > 0);
+ assert(h > 0);
+
+ if (best_y_base == NULL || best_uv_base == NULL ||
+ target_y_base == NULL || target_uv_base == NULL ||
+ best_rgb_y == NULL || best_rgb_uv == NULL ||
+ tmp_buffer == NULL) {
+ ok = 0;
+ goto End;
+ }
+
+ // Import RGB samples to W/RGB representation.
+ for (j = 0; j < height; j += 2) {
+ const int is_last_row = (j == height - 1);
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+ // prepare two rows of input
+ ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
+ src1);
+ if (!is_last_row) {
+ ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
+ rgb_step, rgb_bit_depth, width, src2);
+ } else {
+ memcpy(src2, src1, 3 * w * sizeof(*src2));
+ }
+ StoreGray(src1, best_y + 0, w);
+ StoreGray(src2, best_y + w, w);
+
+ UpdateW(src1, target_y, w, rgb_bit_depth);
+ UpdateW(src2, target_y + w, w, rgb_bit_depth);
+ UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth);
+ memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ r_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
+ }
+
+ // Iterate and resolve clipping conflicts.
+ for (iter = 0; iter < kNumIterations; ++iter) {
+ const fixed_t* cur_uv = best_uv_base;
+ const fixed_t* prev_uv = best_uv_base;
+ uint64_t diff_y_sum = 0;
+
+ best_y = best_y_base;
+ best_uv = best_uv_base;
+ target_y = target_y_base;
+ target_uv = target_uv_base;
+ for (j = 0; j < h; j += 2) {
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
+ fixed_y_t* const src2 = tmp_buffer + 3 * w;
+ {
+ const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
+ InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
+ src1, src2, rgb_bit_depth);
+ prev_uv = cur_uv;
+ cur_uv = next_uv;
+ }
+
+ UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth);
+ UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth);
+ UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth);
+
+ // update two rows of Y and one row of RGB
+ diff_y_sum +=
+ SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w,
+ rgb_bit_depth + GetPrecisionShift(rgb_bit_depth));
+ SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ }
+ // test exit condition
+ if (iter > 0) {
+ if (diff_y_sum < diff_y_threshold) break;
+ if (diff_y_sum > prev_diff_y_sum) break;
+ }
+ prev_diff_y_sum = diff_y_sum;
+ }
+
+ // final reconstruction
+ ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
+ u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
+ width, height, yuv_matrix);
+
+ End:
+ free(best_y_base);
+ free(best_uv_base);
+ free(target_y_base);
+ free(target_uv_base);
+ free(best_rgb_y);
+ free(best_rgb_uv);
+ free(tmp_buffer);
+ return ok;
+}
+#undef SAFE_ALLOC
+
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h> // NOLINT
+
+#define LOCK_ACCESS \
+ static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
+ if (pthread_mutex_lock(&sharpyuv_lock)) return
+#define UNLOCK_ACCESS_AND_RETURN \
+ do { \
+ (void)pthread_mutex_unlock(&sharpyuv_lock); \
+ return; \
+ } while (0)
+#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define LOCK_ACCESS do {} while (0)
+#define UNLOCK_ACCESS_AND_RETURN return
+#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Hidden exported init function.
+// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
+// users can declare it as extern and call it with an alternate VP8CPUInfo
+// function.
+extern VP8CPUInfo SharpYuvGetCPUInfo;
+SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func);
+void SharpYuvInit(VP8CPUInfo cpu_info_func) {
+ static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
+ (VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
+ LOCK_ACCESS;
+ // Only update SharpYuvGetCPUInfo when called from external code to avoid a
+ // race on reading the value in SharpYuvConvert().
+ if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) {
+ SharpYuvGetCPUInfo = cpu_info_func;
+ }
+ if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) {
+ UNLOCK_ACCESS_AND_RETURN;
+ }
+
+ SharpYuvInitDsp();
+ SharpYuvInitGammaTables();
+
+ sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo;
+ UNLOCK_ACCESS_AND_RETURN;
+}
+
+int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
+ const void* b_ptr, int rgb_step, int rgb_stride,
+ int rgb_bit_depth, void* y_ptr, int y_stride,
+ void* u_ptr, int u_stride, void* v_ptr,
+ int v_stride, int yuv_bit_depth, int width,
+ int height, const SharpYuvConversionMatrix* yuv_matrix) {
+ SharpYuvConversionMatrix scaled_matrix;
+ const int rgb_max = (1 << rgb_bit_depth) - 1;
+ const int rgb_round = 1 << (rgb_bit_depth - 1);
+ const int yuv_max = (1 << yuv_bit_depth) - 1;
+ const int sfix = GetPrecisionShift(rgb_bit_depth);
+
+ if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
+ r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
+ u_ptr == NULL || v_ptr == NULL) {
+ return 0;
+ }
+ if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
+ rgb_bit_depth != 16) {
+ return 0;
+ }
+ if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
+ return 0;
+ }
+  if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
+ // Step/stride should be even for uint16_t buffers.
+ return 0;
+ }
+ if (yuv_bit_depth > 8 &&
+ (y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
+ // Stride should be even for uint16_t buffers.
+ return 0;
+ }
+ // The address of the function pointer is used to avoid a read race.
+ SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo);
+
+ // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
+ // rgb->yuv conversion matrix.
+ if (rgb_bit_depth == yuv_bit_depth) {
+ memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
+ } else {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ scaled_matrix.rgb_to_y[i] =
+ (yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
+ scaled_matrix.rgb_to_u[i] =
+ (yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
+ scaled_matrix.rgb_to_v[i] =
+ (yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
+ }
+ }
+ // Also incorporate precision change scaling.
+ scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
+ scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
+ scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
+
+ return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
+ rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
+ v_ptr, v_stride, yuv_bit_depth, width, height,
+ &scaled_matrix);
+}
+
+//------------------------------------------------------------------------------
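
The comment above SharpYuvInit() notes that callers may declare the hidden-exported function themselves and pass an alternate VP8CPUInfo callback. Below is a minimal sketch of doing that, assuming the VP8CPUInfo/CPUFeature definitions from src/dsp/cpu.h; the DisableSimd helper and UsePlainCKernels wrapper are hypothetical names used only for illustration.

  // Sketch only: steer SharpYUV onto its plain C kernels by overriding CPU
  // detection before any conversion. Assumes VP8CPUInfo/CPUFeature from
  // "src/dsp/cpu.h"; DisableSimd is a hypothetical helper, not library API.
  #include "sharpyuv/sharpyuv.h"
  #include "src/dsp/cpu.h"

  extern void SharpYuvInit(VP8CPUInfo cpu_info_func);  // hidden exported init

  static int DisableSimd(CPUFeature feature) {
    (void)feature;
    return 0;  // report that no SIMD feature is available
  }

  static void UsePlainCKernels(void) {
    // SSE2 is then skipped; NEON is also skipped unless the build omits the
    // C code entirely (WEBP_NEON_OMIT_C_CODE), per SharpYuvInitDsp() above.
    SharpYuvInit(DisableSimd);
  }
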
diff --git a/media/libwebp/sharpyuv/sharpyuv.h b/media/libwebp/sharpyuv/sharpyuv.h
new file mode 100644
index 0000000000..7b9904d6f9
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv.h
@@ -0,0 +1,103 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Sharp RGB to YUV conversion.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_H_
+#define WEBP_SHARPYUV_SHARPYUV_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SHARPYUV_EXTERN
+#ifdef WEBP_EXTERN
+#define SHARPYUV_EXTERN WEBP_EXTERN
+#else
+// This explicitly marks library functions and allows for changing the
+// signature for e.g., Windows DLL builds.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define SHARPYUV_EXTERN extern __attribute__((visibility("default")))
+#else
+#if defined(_MSC_VER) && defined(WEBP_DLL)
+#define SHARPYUV_EXTERN __declspec(dllexport)
+#else
+#define SHARPYUV_EXTERN extern
+#endif /* _MSC_VER && WEBP_DLL */
+#endif /* __GNUC__ >= 4 */
+#endif /* WEBP_EXTERN */
+#endif /* SHARPYUV_EXTERN */
+
+// SharpYUV API version following the convention from semver.org
+#define SHARPYUV_VERSION_MAJOR 0
+#define SHARPYUV_VERSION_MINOR 2
+#define SHARPYUV_VERSION_PATCH 1
+// Version as a uint32_t. The major number is the high 8 bits.
+// The minor number is the middle 8 bits. The patch number is the low 16 bits.
+#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
+ (((MAJOR) << 24) | ((MINOR) << 16) | (PATCH))
+#define SHARPYUV_VERSION \
+ SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
+ SHARPYUV_VERSION_PATCH)
+
+// Returns the library's version number, packed in hexadecimal. See
+// SHARPYUV_VERSION.
+SHARPYUV_EXTERN int SharpYuvGetVersion(void);
+
+// RGB to YUV conversion matrix, in 16 bit fixed point.
+// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then y, u and v values are divided by 1<<16 and rounded.
+typedef struct {
+ int rgb_to_y[4];
+ int rgb_to_u[4];
+ int rgb_to_v[4];
+} SharpYuvConversionMatrix;
+
+// Converts RGB to YUV420 using a downsampling algorithm that minimizes
+// artefacts caused by chroma subsampling.
+// This is slower than standard downsampling (averaging of 4 UV values).
+// Assumes that the image will be upsampled using a bilinear filter. If nearest
+// neighbor is used instead, the upsampled image might look worse than with
+// standard downsampling.
+// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
+// to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
+// rgb_step: distance in bytes between two horizontally adjacent pixels on the
+// r, g and b channels. If rgb_bit_depth is > 8, it should be a
+// multiple of 2.
+// rgb_stride: distance in bytes between two vertically adjacent pixels on the
+// r, g, and b channels. If rgb_bit_depth is > 8, it should be a
+// multiple of 2.
+// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
+// Note: 16 bit input is truncated to 14 bits before conversion to yuv.
+// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
+// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels. Should
+// point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
+// otherwise.
+// y_stride, u_stride, v_stride: distance in bytes between two vertically
+// adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
+// should be multiples of 2.
+// width, height: width and height of the image in pixels
+SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
+ const void* b_ptr, int rgb_step,
+ int rgb_stride, int rgb_bit_depth,
+ void* y_ptr, int y_stride, void* u_ptr,
+ int u_stride, void* v_ptr, int v_stride,
+ int yuv_bit_depth, int width, int height,
+ const SharpYuvConversionMatrix* yuv_matrix);
+
+// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
+// support (it's rarely used in practice, especially for images).
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // WEBP_SHARPYUV_SHARPYUV_H_
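
Taken together, the parameter documentation above maps onto a fairly small call. Here is a minimal sketch (not the library's own sample code) converting an 8-bit interleaved RGB buffer to 8-bit YUV 4:2:0 with the predefined WebP matrix from sharpyuv_csp.h; the buffer layout, strides and function name are illustrative assumptions.

  // Sketch: interleaved 8-bit RGB -> 8-bit YUV420. rgb_step is 3 bytes per
  // pixel and rgb_stride is 3 * width bytes per row, per the comments above.
  #include <stdint.h>
  #include "sharpyuv/sharpyuv.h"
  #include "sharpyuv/sharpyuv_csp.h"

  int ConvertRgbToYuv420(const uint8_t* rgb, int width, int height,
                         uint8_t* y, uint8_t* u, uint8_t* v) {
    const SharpYuvConversionMatrix* matrix =
        SharpYuvGetConversionMatrix(kSharpYuvMatrixWebp);
    // Returns 0 if the parameter checks in sharpyuv.c fail, 1 on success.
    return SharpYuvConvert(rgb + 0, rgb + 1, rgb + 2, /*rgb_step=*/3,
                           /*rgb_stride=*/3 * width, /*rgb_bit_depth=*/8,
                           y, /*y_stride=*/width,
                           u, /*u_stride=*/(width + 1) / 2,
                           v, /*v_stride=*/(width + 1) / 2,
                           /*yuv_bit_depth=*/8, width, height, matrix);
  }

For yuv_bit_depth above 8 the destination buffers would be uint16_t and the strides multiples of 2, as the documentation above requires.
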
diff --git a/media/libwebp/sharpyuv/sharpyuv_cpu.h b/media/libwebp/sharpyuv/sharpyuv_cpu.h
new file mode 100644
index 0000000000..176ca3eb16
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_cpu.h
@@ -0,0 +1,22 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_
+#define WEBP_SHARPYUV_SHARPYUV_CPU_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds.
+// SharpYuvInit() replaces the use of the function pointer.
+#undef WEBP_EXTERN
+#define WEBP_EXTERN extern
+#define VP8GetCPUInfo SharpYuvGetCPUInfo
+#include "src/dsp/cpu.h"
+
+#endif // WEBP_SHARPYUV_SHARPYUV_CPU_H_
diff --git a/media/libwebp/sharpyuv/sharpyuv_csp.c b/media/libwebp/sharpyuv/sharpyuv_csp.c
new file mode 100644
index 0000000000..0ad22be945
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_csp.c
@@ -0,0 +1,110 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#include "sharpyuv/sharpyuv_csp.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+
+static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); }
+
+void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
+ SharpYuvConversionMatrix* matrix) {
+ const float kr = yuv_color_space->kr;
+ const float kb = yuv_color_space->kb;
+ const float kg = 1.0f - kr - kb;
+ const float cr = 0.5f / (1.0f - kb);
+ const float cb = 0.5f / (1.0f - kr);
+
+ const int shift = yuv_color_space->bit_depth - 8;
+
+ const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
+ float scale_y = 1.0f;
+ float add_y = 0.0f;
+ float scale_u = cr;
+ float scale_v = cb;
+ float add_uv = (float)(128 << shift);
+ assert(yuv_color_space->bit_depth >= 8);
+
+ if (yuv_color_space->range == kSharpYuvRangeLimited) {
+ scale_y *= (219 << shift) / denom;
+ scale_u *= (224 << shift) / denom;
+ scale_v *= (224 << shift) / denom;
+ add_y = (float)(16 << shift);
+ }
+
+ matrix->rgb_to_y[0] = ToFixed16(kr * scale_y);
+ matrix->rgb_to_y[1] = ToFixed16(kg * scale_y);
+ matrix->rgb_to_y[2] = ToFixed16(kb * scale_y);
+ matrix->rgb_to_y[3] = ToFixed16(add_y);
+
+ matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u);
+ matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u);
+ matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u);
+ matrix->rgb_to_u[3] = ToFixed16(add_uv);
+
+ matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v);
+ matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v);
+ matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v);
+ matrix->rgb_to_v[3] = ToFixed16(add_uv);
+}
+
+// Matrices are in YUV_FIX fixed point precision.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+static const SharpYuvConversionMatrix kWebpMatrix = {
+ {16839, 33059, 6420, 16 << 16},
+ {-9719, -19081, 28800, 128 << 16},
+ {28800, -24116, -4684, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
+ {16829, 33039, 6416, 16 << 16},
+ {-9714, -19071, 28784, 128 << 16},
+ {28784, -24103, -4681, 128 << 16},
+};
+// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec601FullMatrix = {
+ {19595, 38470, 7471, 0},
+ {-11058, -21710, 32768, 128 << 16},
+ {32768, -27439, -5329, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
+ {11966, 40254, 4064, 16 << 16},
+ {-6596, -22189, 28784, 128 << 16},
+ {28784, -26145, -2639, 128 << 16},
+};
+// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+static const SharpYuvConversionMatrix kRec709FullMatrix = {
+ {13933, 46871, 4732, 0},
+ {-7509, -25259, 32768, 128 << 16},
+ {32768, -29763, -3005, 128 << 16},
+};
+
+const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+ SharpYuvMatrixType matrix_type) {
+ switch (matrix_type) {
+ case kSharpYuvMatrixWebp:
+ return &kWebpMatrix;
+ case kSharpYuvMatrixRec601Limited:
+ return &kRec601LimitedMatrix;
+ case kSharpYuvMatrixRec601Full:
+ return &kRec601FullMatrix;
+ case kSharpYuvMatrixRec709Limited:
+ return &kRec709LimitedMatrix;
+ case kSharpYuvMatrixRec709Full:
+ return &kRec709FullMatrix;
+ case kSharpYuvMatrixNum:
+ return NULL;
+ }
+ return NULL;
+}
diff --git a/media/libwebp/sharpyuv/sharpyuv_csp.h b/media/libwebp/sharpyuv/sharpyuv_csp.h
new file mode 100644
index 0000000000..3214e3ac60
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_csp.h
@@ -0,0 +1,60 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Colorspace utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_CSP_H_
+
+#include "sharpyuv/sharpyuv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Range of YUV values.
+typedef enum {
+ kSharpYuvRangeFull, // YUV values between [0;255] (for 8 bit)
+ kSharpYuvRangeLimited // Y in [16;235], YUV in [16;240] (for 8 bit)
+} SharpYuvRange;
+
+// Constants that define a YUV color space.
+typedef struct {
+ // Kr and Kb are defined such that:
+ // Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
+ float kr;
+ float kb;
+ int bit_depth; // 8, 10 or 12
+ SharpYuvRange range;
+} SharpYuvColorSpace;
+
+// Fills in 'matrix' for the given YUVColorSpace.
+SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
+ const SharpYuvColorSpace* yuv_color_space,
+ SharpYuvConversionMatrix* matrix);
+
+// Enums for precomputed conversion matrices.
+typedef enum {
+ kSharpYuvMatrixWebp = 0,
+ kSharpYuvMatrixRec601Limited,
+ kSharpYuvMatrixRec601Full,
+ kSharpYuvMatrixRec709Limited,
+ kSharpYuvMatrixRec709Full,
+ kSharpYuvMatrixNum
+} SharpYuvMatrixType;
+
+// Returns a pointer to a matrix for one of the predefined colorspaces.
+SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
+ SharpYuvMatrixType matrix_type);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // WEBP_SHARPYUV_SHARPYUV_CSP_H_
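
A small sketch of filling a matrix from a SharpYuvColorSpace rather than taking a predefined one; the constants follow the "Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited" comment in sharpyuv_csp.c, so the result should reproduce the precomputed kRec709LimitedMatrix (for instance rgb_to_y[0] == 11966). The helper name is made up for illustration.

  // Sketch: derive a BT.709, limited-range, 8-bit matrix.
  #include "sharpyuv/sharpyuv_csp.h"

  static void MakeRec709Limited(SharpYuvConversionMatrix* matrix) {
    SharpYuvColorSpace csp;
    csp.kr = 0.2126f;
    csp.kb = 0.0722f;
    csp.bit_depth = 8;
    csp.range = kSharpYuvRangeLimited;
    SharpYuvComputeConversionMatrix(&csp, matrix);
  }
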
diff --git a/media/libwebp/sharpyuv/sharpyuv_dsp.c b/media/libwebp/sharpyuv/sharpyuv_dsp.c
new file mode 100644
index 0000000000..0da3efc0b8
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_dsp.c
@@ -0,0 +1,104 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "sharpyuv/sharpyuv_cpu.h"
+
+//-----------------------------------------------------------------------------
+
+#if !WEBP_NEON_OMIT_C_CODE
+static uint16_t clip(int v, int max) {
+ return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len, int bit_depth) {
+ uint64_t diff = 0;
+ int i;
+ const int max_y = (1 << bit_depth) - 1;
+ for (i = 0; i < len; ++i) {
+ const int diff_y = ref[i] - src[i];
+ const int new_y = (int)dst[i] + diff_y;
+ dst[i] = clip(new_y, max_y);
+ diff += (uint64_t)abs(diff_y);
+ }
+ return diff;
+}
+
+static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len) {
+ int i;
+ for (i = 0; i < len; ++i) {
+ const int diff_uv = ref[i] - src[i];
+ dst[i] += diff_uv;
+ }
+}
+
+static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out,
+ int bit_depth) {
+ int i;
+ const int max_y = (1 << bit_depth) - 1;
+ for (i = 0; i < len; ++i, ++A, ++B) {
+ const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+ const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+ out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y);
+ out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y);
+ }
+}
+#endif // !WEBP_NEON_OMIT_C_CODE
+
+//-----------------------------------------------------------------------------
+
+uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+ uint16_t* dst, int len, int bit_depth);
+void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
+ int len);
+void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out,
+ int bit_depth);
+
+extern VP8CPUInfo SharpYuvGetCPUInfo;
+extern void InitSharpYuvSSE2(void);
+extern void InitSharpYuvNEON(void);
+
+void SharpYuvInitDsp(void) {
+#if !WEBP_NEON_OMIT_C_CODE
+ SharpYuvUpdateY = SharpYuvUpdateY_C;
+ SharpYuvUpdateRGB = SharpYuvUpdateRGB_C;
+ SharpYuvFilterRow = SharpYuvFilterRow_C;
+#endif
+
+ if (SharpYuvGetCPUInfo != NULL) {
+#if defined(WEBP_HAVE_SSE2)
+ if (SharpYuvGetCPUInfo(kSSE2)) {
+ InitSharpYuvSSE2();
+ }
+#endif // WEBP_HAVE_SSE2
+ }
+
+#if defined(WEBP_HAVE_NEON)
+ if (WEBP_NEON_OMIT_C_CODE ||
+ (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) {
+ InitSharpYuvNEON();
+ }
+#endif // WEBP_HAVE_NEON
+
+ assert(SharpYuvUpdateY != NULL);
+ assert(SharpYuvUpdateRGB != NULL);
+ assert(SharpYuvFilterRow != NULL);
+}
diff --git a/media/libwebp/sharpyuv/sharpyuv_dsp.h b/media/libwebp/sharpyuv/sharpyuv_dsp.h
new file mode 100644
index 0000000000..805fbadbf6
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_dsp.h
@@ -0,0 +1,28 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_
+#define WEBP_SHARPYUV_SHARPYUV_DSP_H_
+
+#include "sharpyuv/sharpyuv_cpu.h"
+#include "src/webp/types.h"
+
+extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
+ uint16_t* dst, int len, int bit_depth);
+extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref,
+ int16_t* dst, int len);
+extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out,
+ int bit_depth);
+
+void SharpYuvInitDsp(void);
+
+#endif // WEBP_SHARPYUV_SHARPYUV_DSP_H_
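
These pointers are internal to the library (SharpYuvInit() ends up installing them via SharpYuvInitDsp()), but the contract is easy to illustrate. The sketch below exercises SharpYuvUpdateRGB on throwaway arrays, following the C implementation in sharpyuv_dsp.c where the third buffer is incremented by the difference of the first two; the values and helper name are illustrative only.

  // Sketch: the function pointers must be initialized before use;
  // SharpYuvInitDsp() selects the C, SSE2 or NEON kernels.
  #include <assert.h>
  #include <stdint.h>
  #include "sharpyuv/sharpyuv_dsp.h"

  static void UpdateRgbExample(void) {
    const int16_t target[4]  = {10, 20, 30, 40};
    const int16_t current[4] = { 8, 22, 30, 44};
    int16_t dst[4]           = { 0,  0,  0,  0};
    SharpYuvInitDsp();
    SharpYuvUpdateRGB(target, current, dst, 4);  // dst[i] += target[i] - current[i]
    assert(dst[0] == 2 && dst[1] == -2 && dst[2] == 0 && dst[3] == -4);
  }
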
diff --git a/media/libwebp/sharpyuv/sharpyuv_gamma.c b/media/libwebp/sharpyuv/sharpyuv_gamma.c
new file mode 100644
index 0000000000..20ab2da6bc
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_gamma.c
@@ -0,0 +1,113 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#include "sharpyuv/sharpyuv_gamma.h"
+
+#include <assert.h>
+#include <math.h>
+
+#include "src/webp/types.h"
+
+// Gamma correction compensates for the loss of resolution during chroma subsampling.
+// Size of pre-computed table for converting from gamma to linear.
+#define GAMMA_TO_LINEAR_TAB_BITS 10
+#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS)
+static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2];
+#define LINEAR_TO_GAMMA_TAB_BITS 9
+#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS)
+static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2];
+
+static const double kGammaF = 1. / 0.45;
+#define GAMMA_TO_LINEAR_BITS 16
+
+static volatile int kGammaTablesSOk = 0;
+void SharpYuvInitGammaTables(void) {
+ assert(GAMMA_TO_LINEAR_BITS <= 16);
+ if (!kGammaTablesSOk) {
+ int v;
+ const double a = 0.09929682680944;
+ const double thresh = 0.018053968510807;
+ const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
+ // Precompute gamma to linear table.
+ {
+ const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE;
+ const double a_rec = 1. / (1. + a);
+ for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) {
+ const double g = norm * v;
+ double value;
+ if (g <= thresh * 4.5) {
+ value = g / 4.5;
+ } else {
+ value = pow(a_rec * (g + a), kGammaF);
+ }
+ kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
+ }
+      // to prevent small rounding errors from causing a read overflow:
+ kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] =
+ kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE];
+ }
+ // Precompute linear to gamma table.
+ {
+ const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE;
+ for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) {
+ const double g = scale * v;
+ double value;
+ if (g <= thresh) {
+ value = 4.5 * g;
+ } else {
+ value = (1. + a) * pow(g, 1. / kGammaF) - a;
+ }
+ kLinearToGammaTabS[v] =
+ (uint32_t)(final_scale * value + 0.5);
+ }
+      // to prevent small rounding errors from causing a read overflow:
+ kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] =
+ kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE];
+ }
+ kGammaTablesSOk = 1;
+ }
+}
+
+static WEBP_INLINE int Shift(int v, int shift) {
+ return (shift >= 0) ? (v << shift) : (v >> -shift);
+}
+
+static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
+ int tab_pos_shift_right,
+ int tab_value_shift) {
+ const uint32_t tab_pos = Shift(v, -tab_pos_shift_right);
+ // fractional part, in 'tab_pos_shift' fixed-point precision
+ const uint32_t x = v - (tab_pos << tab_pos_shift_right); // fractional part
+  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+ const uint32_t v0 = Shift(tab[tab_pos + 0], tab_value_shift);
+ const uint32_t v1 = Shift(tab[tab_pos + 1], tab_value_shift);
+ // Final interpolation.
+ const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
+ const int half =
+ (tab_pos_shift_right > 0) ? 1 << (tab_pos_shift_right - 1) : 0;
+ const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift_right);
+ return result;
+}
+
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth) {
+ const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
+ if (shift > 0) {
+ return kGammaToLinearTabS[v << shift];
+ }
+ return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
+}
+
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth) {
+ return FixedPointInterpolation(
+ value, kLinearToGammaTabS,
+ (GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS),
+ bit_depth - GAMMA_TO_LINEAR_BITS);
+}
diff --git a/media/libwebp/sharpyuv/sharpyuv_gamma.h b/media/libwebp/sharpyuv/sharpyuv_gamma.h
new file mode 100644
index 0000000000..d13aff59e1
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_gamma.h
@@ -0,0 +1,35 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Gamma correction utilities.
+
+#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
+
+#include "src/webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initializes precomputed tables. Must be called once before calling
+// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
+void SharpYuvInitGammaTables(void);
+
+// Converts a gamma color value on 'bit_depth' bits to a 16 bit linear value.
+uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth);
+
+// Converts a 16 bit linear color value to a gamma value on 'bit_depth' bits.
+uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
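
A sketch of the intended call order: initialize the tables once, then map between the 'bit_depth'-bit gamma domain and the 16-bit linear domain. The round trip is only approximate because both directions go through interpolated lookup tables (see sharpyuv_gamma.c); the helper name and bit depth are illustrative.

  // Sketch: gamma -> linear -> gamma for a 10-bit value.
  #include <stdint.h>
  #include "sharpyuv/sharpyuv_gamma.h"

  static uint16_t RoundTrip10Bit(uint16_t gamma_value) {
    SharpYuvInitGammaTables();  // must run before the conversions below
    const uint32_t linear = SharpYuvGammaToLinear(gamma_value, /*bit_depth=*/10);
    return SharpYuvLinearToGamma(linear, /*bit_depth=*/10);  // ~= gamma_value
  }
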
diff --git a/media/libwebp/sharpyuv/sharpyuv_neon.c b/media/libwebp/sharpyuv/sharpyuv_neon.c
new file mode 100644
index 0000000000..5840914865
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_neon.c
@@ -0,0 +1,181 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_NEON)
+#include <assert.h>
+#include <stdlib.h>
+#include <arm_neon.h>
+
+static uint16_t clip_NEON(int v, int max) {
+ return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len, int bit_depth) {
+ const int max_y = (1 << bit_depth) - 1;
+ int i;
+ const int16x8_t zero = vdupq_n_s16(0);
+ const int16x8_t max = vdupq_n_s16(max_y);
+ uint64x2_t sum = vdupq_n_u64(0);
+ uint64_t diff;
+
+ for (i = 0; i + 8 <= len; i += 8) {
+ const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+ const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+ const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+ const int16x8_t D = vsubq_s16(A, B); // diff_y
+ const int16x8_t F = vaddq_s16(C, D); // new_y
+ const uint16x8_t H =
+ vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
+ const int16x8_t I = vabsq_s16(D); // abs(diff_y)
+ vst1q_u16(dst + i, H);
+ sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
+ }
+ diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
+ for (; i < len; ++i) {
+ const int diff_y = ref[i] - src[i];
+ const int new_y = (int)(dst[i]) + diff_y;
+ dst[i] = clip_NEON(new_y, max_y);
+ diff += (uint64_t)(abs(diff_y));
+ }
+ return diff;
+}
+
+static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len) {
+ int i;
+ for (i = 0; i + 8 <= len; i += 8) {
+ const int16x8_t A = vld1q_s16(ref + i);
+ const int16x8_t B = vld1q_s16(src + i);
+ const int16x8_t C = vld1q_s16(dst + i);
+ const int16x8_t D = vsubq_s16(A, B); // diff_uv
+ const int16x8_t E = vaddq_s16(C, D); // new_uv
+ vst1q_s16(dst + i, E);
+ }
+ for (; i < len; ++i) {
+ const int diff_uv = ref[i] - src[i];
+ dst[i] += diff_uv;
+ }
+}
+
+static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
+ int len, const uint16_t* best_y,
+ uint16_t* out, int bit_depth) {
+ const int max_y = (1 << bit_depth) - 1;
+ int i;
+ const int16x8_t max = vdupq_n_s16(max_y);
+ const int16x8_t zero = vdupq_n_s16(0);
+ for (i = 0; i + 8 <= len; i += 8) {
+ const int16x8_t a0 = vld1q_s16(A + i + 0);
+ const int16x8_t a1 = vld1q_s16(A + i + 1);
+ const int16x8_t b0 = vld1q_s16(B + i + 0);
+ const int16x8_t b1 = vld1q_s16(B + i + 1);
+ const int16x8_t a0b1 = vaddq_s16(a0, b1);
+ const int16x8_t a1b0 = vaddq_s16(a1, b0);
+ const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
+ const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
+ const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
+ const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+ const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+ const int16x8_t e0 = vrhaddq_s16(c1, a0);
+ const int16x8_t e1 = vrhaddq_s16(c0, a1);
+ const int16x8x2_t f = vzipq_s16(e0, e1);
+ const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+ const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+ const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+ const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+ const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
+ const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+ vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+ vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+ }
+ for (; i < len; ++i) {
+ const int a0b1 = A[i + 0] + B[i + 1];
+ const int a1b0 = A[i + 1] + B[i + 0];
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+ out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+ out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+ }
+}
+
+static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
+ int len, const uint16_t* best_y,
+ uint16_t* out, int bit_depth) {
+ const int max_y = (1 << bit_depth) - 1;
+ int i;
+ const uint16x8_t max = vdupq_n_u16(max_y);
+ for (i = 0; i + 4 <= len; i += 4) {
+ const int16x4_t a0 = vld1_s16(A + i + 0);
+ const int16x4_t a1 = vld1_s16(A + i + 1);
+ const int16x4_t b0 = vld1_s16(B + i + 0);
+ const int16x4_t b1 = vld1_s16(B + i + 1);
+ const int32x4_t a0b1 = vaddl_s16(a0, b1);
+ const int32x4_t a1b0 = vaddl_s16(a1, b0);
+ const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0); // A0+A1+B0+B1
+ const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1); // 2*(A0+B1)
+ const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0); // 2*(A1+B0)
+ const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
+ const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
+ const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
+ const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
+ const int32x4x2_t f = vzipq_s32(e0, e1);
+
+ const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
+ const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
+ const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
+ const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
+ const uint16x8_t i_clamped = vminq_u16(i_16, max);
+ vst1q_u16(out + 2 * i + 0, i_clamped);
+ }
+ for (; i < len; ++i) {
+ const int a0b1 = A[i + 0] + B[i + 1];
+ const int a1b0 = A[i + 1] + B[i + 0];
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+ out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
+ out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
+ }
+}
+
+static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out,
+ int bit_depth) {
+ if (bit_depth <= 10) {
+ SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
+ } else {
+ SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
+ SharpYuvUpdateY = SharpYuvUpdateY_NEON;
+ SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
+ SharpYuvFilterRow = SharpYuvFilterRow_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+extern void InitSharpYuvNEON(void);
+
+void InitSharpYuvNEON(void) {}
+
+#endif // WEBP_USE_NEON
diff --git a/media/libwebp/sharpyuv/sharpyuv_sse2.c b/media/libwebp/sharpyuv/sharpyuv_sse2.c
new file mode 100644
index 0000000000..9744d1bb6c
--- /dev/null
+++ b/media/libwebp/sharpyuv/sharpyuv_sse2.c
@@ -0,0 +1,201 @@
+// Copyright 2022 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical functions for Sharp YUV.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "sharpyuv/sharpyuv_dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <stdlib.h>
+#include <emmintrin.h>
+
+static uint16_t clip_SSE2(int v, int max) {
+ return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
+}
+
+static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len, int bit_depth) {
+ const int max_y = (1 << bit_depth) - 1;
+ uint64_t diff = 0;
+ uint32_t tmp[4];
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16(max_y);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i sum = zero;
+
+ for (i = 0; i + 8 <= len; i += 8) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+ const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+ const __m128i D = _mm_sub_epi16(A, B); // diff_y
+ const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
+ const __m128i F = _mm_add_epi16(C, D); // new_y
+ const __m128i G = _mm_or_si128(E, one); // -1 or 1
+ const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
+ const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
+ _mm_storeu_si128((__m128i*)(dst + i), H);
+ sum = _mm_add_epi32(sum, I);
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+ for (; i < len; ++i) {
+ const int diff_y = ref[i] - src[i];
+ const int new_y = (int)dst[i] + diff_y;
+ dst[i] = clip_SSE2(new_y, max_y);
+ diff += (uint64_t)abs(diff_y);
+ }
+ return diff;
+}
+
+static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len) {
+ int i = 0;
+ for (i = 0; i + 8 <= len; i += 8) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+ const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+ const __m128i D = _mm_sub_epi16(A, B); // diff_uv
+ const __m128i E = _mm_add_epi16(C, D); // new_uv
+ _mm_storeu_si128((__m128i*)(dst + i), E);
+ }
+ for (; i < len; ++i) {
+ const int diff_uv = ref[i] - src[i];
+ dst[i] += diff_uv;
+ }
+}
+
+static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
+ int len, const uint16_t* best_y,
+ uint16_t* out, int bit_depth) {
+ const int max_y = (1 << bit_depth) - 1;
+ int i;
+ const __m128i kCst8 = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(max_y);
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 8 <= len; i += 8) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+ const __m128i a0b1 = _mm_add_epi16(a0, b1);
+ const __m128i a1b0 = _mm_add_epi16(a1, b0);
+ const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
+ const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+ const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
+ const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
+ const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+ const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+ const __m128i d0 = _mm_add_epi16(c1, a0);
+ const __m128i d1 = _mm_add_epi16(c0, a1);
+ const __m128i e0 = _mm_srai_epi16(d0, 1);
+ const __m128i e1 = _mm_srai_epi16(d1, 1);
+ const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
+ const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+ const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+ const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+ const __m128i h0 = _mm_add_epi16(g0, f0);
+ const __m128i h1 = _mm_add_epi16(g1, f1);
+ const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
+ const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+ _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+ _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+ }
+ for (; i < len; ++i) {
+ // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+ // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+ // We reuse the common sub-expressions.
+ const int a0b1 = A[i + 0] + B[i + 1];
+ const int a1b0 = A[i + 1] + B[i + 0];
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+ out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+ out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+ }
+}
+
+static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
+}
+
+static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
+ int len, const uint16_t* best_y,
+ uint16_t* out, int bit_depth) {
+ const int max_y = (1 << bit_depth) - 1;
+ int i;
+ const __m128i kCst8 = _mm_set1_epi32(8);
+ const __m128i max = _mm_set1_epi16(max_y);
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 4 <= len; i += 4) {
+ const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
+ const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
+ const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
+ const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
+ const __m128i a0b1 = _mm_add_epi32(a0, b1);
+ const __m128i a1b0 = _mm_add_epi32(a1, b0);
+ const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0); // A0+A1+B0+B1
+ const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
+ const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1); // 2*(A0+B1)
+ const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0); // 2*(A1+B0)
+ const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
+ const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
+ const __m128i d0 = _mm_add_epi32(c1, a0);
+ const __m128i d1 = _mm_add_epi32(c0, a1);
+ const __m128i e0 = _mm_srai_epi32(d0, 1);
+ const __m128i e1 = _mm_srai_epi32(d1, 1);
+ const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
+ const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
+ const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+ const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
+ const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
+ _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
+ }
+ for (; i < len; ++i) {
+ // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
+ // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+ // We reuse the common sub-expressions.
+ const int a0b1 = A[i + 0] + B[i + 1];
+ const int a1b0 = A[i + 1] + B[i + 0];
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+ out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
+ out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
+ }
+}
+
+static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out,
+ int bit_depth) {
+ if (bit_depth <= 10) {
+ SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
+ } else {
+ SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+extern void InitSharpYuvSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
+ SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
+ SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
+ SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
+}
+#else // !WEBP_USE_SSE2
+
+extern void InitSharpYuvSSE2(void);
+
+void InitSharpYuvSSE2(void) {}
+
+#endif // WEBP_USE_SSE2
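
The scalar fallbacks in both the SSE2 and NEON files rely on the rewrite spelled out in the comments above: (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 equals (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4, since the second form expands back to the first. Below is a small self-contained check of that identity; the helper name is made up for illustration.

  // Sketch: verify the rewrite used by the SharpYuvFilterRow16/32 fallbacks.
  #include <assert.h>

  static void CheckFilterRewrite(int A0, int A1, int B0, int B1) {
    const int direct = (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4;
    const int a0b1 = A0 + B1;
    const int a1b0 = A1 + B0;
    const int a0a1b0b1 = a0b1 + a1b0 + 8;                 // A0+A1+B0+B1+8
    const int rewritten = (8 * A0 + 2 * a1b0 + a0a1b0b1) >> 4;
    assert(direct == rewritten);
  }
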