diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 05:35:29 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 05:35:29 +0000 |
commit | 59203c63bb777a3bacec32fb8830fba33540e809 (patch) | |
tree | 58298e711c0ff0575818c30485b44a2f21bf28a0 /media/libwebp/src/dsp | |
parent | Adding upstream version 126.0.1. (diff) | |
download | firefox-59203c63bb777a3bacec32fb8830fba33540e809.tar.xz firefox-59203c63bb777a3bacec32fb8830fba33540e809.zip |
Adding upstream version 127.0.upstream/127.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/libwebp/src/dsp')
29 files changed, 346 insertions, 251 deletions
diff --git a/media/libwebp/src/dsp/alpha_processing_sse2.c b/media/libwebp/src/dsp/alpha_processing_sse2.c index f0843d0feb..aa0cc2848a 100644 --- a/media/libwebp/src/dsp/alpha_processing_sse2.c +++ b/media/libwebp/src/dsp/alpha_processing_sse2.c @@ -144,6 +144,46 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, return (alpha_and == 0xff); } +static void ExtractGreen_SSE2(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT alpha, int size) { + int i; + const __m128i mask = _mm_set1_epi32(0xff); + const __m128i* src = (const __m128i*)argb; + + for (i = 0; i + 16 <= size; i += 16, src += 4) { + const __m128i a0 = _mm_loadu_si128(src + 0); + const __m128i a1 = _mm_loadu_si128(src + 1); + const __m128i a2 = _mm_loadu_si128(src + 2); + const __m128i a3 = _mm_loadu_si128(src + 3); + const __m128i b0 = _mm_srli_epi32(a0, 8); + const __m128i b1 = _mm_srli_epi32(a1, 8); + const __m128i b2 = _mm_srli_epi32(a2, 8); + const __m128i b3 = _mm_srli_epi32(a3, 8); + const __m128i c0 = _mm_and_si128(b0, mask); + const __m128i c1 = _mm_and_si128(b1, mask); + const __m128i c2 = _mm_and_si128(b2, mask); + const __m128i c3 = _mm_and_si128(b3, mask); + const __m128i d0 = _mm_packs_epi32(c0, c1); + const __m128i d1 = _mm_packs_epi32(c2, c3); + const __m128i e = _mm_packus_epi16(d0, d1); + // store + _mm_storeu_si128((__m128i*)&alpha[i], e); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128(src + 0); + const __m128i a1 = _mm_loadu_si128(src + 1); + const __m128i b0 = _mm_srli_epi32(a0, 8); + const __m128i b1 = _mm_srli_epi32(a1, 8); + const __m128i c0 = _mm_and_si128(b0, mask); + const __m128i c1 = _mm_and_si128(b1, mask); + const __m128i d = _mm_packs_epi32(c0, c1); + const __m128i e = _mm_packus_epi16(d, d); + _mm_storel_epi64((__m128i*)&alpha[i], e); + i += 8; + } + for (; i < size; ++i) alpha[i] = argb[i] >> 8; +} + //------------------------------------------------------------------------------ // Non-dither premultiplied modes @@ -354,6 +394,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { WebPDispatchAlpha = DispatchAlpha_SSE2; WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2; WebPExtractAlpha = ExtractAlpha_SSE2; + WebPExtractGreen = ExtractGreen_SSE2; WebPHasAlpha8b = HasAlpha8b_SSE2; WebPHasAlpha32b = HasAlpha32b_SSE2; diff --git a/media/libwebp/src/dsp/dec.c b/media/libwebp/src/dsp/dec.c index 33d8df8a62..451d649d58 100644 --- a/media/libwebp/src/dsp/dec.c +++ b/media/libwebp/src/dsp/dec.c @@ -37,9 +37,6 @@ static WEBP_INLINE uint8_t clip_8b(int v) { STORE(3, y, DC - (d)); \ } while (0) -#define MUL1(a) ((((a) * 20091) >> 16) + (a)) -#define MUL2(a) (((a) * 35468) >> 16) - #if !WEBP_NEON_OMIT_C_CODE static void TransformOne_C(const int16_t* in, uint8_t* dst) { int C[4 * 4], *tmp; @@ -48,8 +45,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) { for (i = 0; i < 4; ++i) { // vertical pass const int a = in[0] + in[8]; // [-4096, 4094] const int b = in[0] - in[8]; // [-4095, 4095] - const int c = MUL2(in[4]) - MUL1(in[12]); // [-3783, 3783] - const int d = MUL1(in[4]) + MUL2(in[12]); // [-3785, 3781] + const int c = WEBP_TRANSFORM_AC3_MUL2(in[4]) - + WEBP_TRANSFORM_AC3_MUL1(in[12]); // [-3783, 3783] + const int d = WEBP_TRANSFORM_AC3_MUL1(in[4]) + + WEBP_TRANSFORM_AC3_MUL2(in[12]); // [-3785, 3781] tmp[0] = a + d; // [-7881, 7875] tmp[1] = b + c; // [-7878, 7878] tmp[2] = b - c; // [-7878, 7878] @@ -69,8 +68,10 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) { const int dc = tmp[0] + 4; const int a = dc + tmp[8]; const int b = dc - tmp[8]; - const int c = MUL2(tmp[4]) - MUL1(tmp[12]); - const int d = MUL1(tmp[4]) + MUL2(tmp[12]); + const int c = + WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]); + const int d = + WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]); STORE(0, 0, a + d); STORE(1, 0, b + c); STORE(2, 0, b - c); @@ -83,17 +84,15 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) { // Simplified transform when only in[0], in[1] and in[4] are non-zero static void TransformAC3_C(const int16_t* in, uint8_t* dst) { const int a = in[0] + 4; - const int c4 = MUL2(in[4]); - const int d4 = MUL1(in[4]); - const int c1 = MUL2(in[1]); - const int d1 = MUL1(in[1]); + const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); + const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); + const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]); + const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]); STORE2(0, a + d4, d1, c1); STORE2(1, a + c4, d1, c1); STORE2(2, a - c4, d1, c1); STORE2(3, a - d4, d1, c1); } -#undef MUL1 -#undef MUL2 #undef STORE2 static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) { diff --git a/media/libwebp/src/dsp/dec_mips32.c b/media/libwebp/src/dsp/dec_mips32.c index e4e70966d2..f0e7de4ac4 100644 --- a/media/libwebp/src/dsp/dec_mips32.c +++ b/media/libwebp/src/dsp/dec_mips32.c @@ -18,8 +18,8 @@ #include "src/dsp/mips_macro.h" -static const int kC1 = 20091 + (1 << 16); -static const int kC2 = 35468; +static const int kC1 = WEBP_TRANSFORM_AC3_C1; +static const int kC2 = WEBP_TRANSFORM_AC3_C2; static WEBP_INLINE int abs_mips32(int x) { const int sign = x >> 31; @@ -219,7 +219,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { int temp0, temp1, temp2, temp3, temp4; int temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14; - int temp15, temp16, temp17, temp18; + int temp15, temp16, temp17, temp18, temp19; int16_t* p_in = (int16_t*)in; // loops unrolled and merged to avoid usage of tmp buffer @@ -233,16 +233,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { "addu %[temp16], %[temp0], %[temp8] \n\t" "subu %[temp0], %[temp0], %[temp8] \n\t" "mul %[temp8], %[temp4], %[kC2] \n\t" - "mul %[temp17], %[temp12], %[kC1] \n\t" - "mul %[temp4], %[temp4], %[kC1] \n\t" + MUL_SHIFT_C1(temp17, temp12) + MUL_SHIFT_C1_IO(temp4, temp19) "mul %[temp12], %[temp12], %[kC2] \n\t" "lh %[temp1], 2(%[in]) \n\t" "lh %[temp5], 10(%[in]) \n\t" "lh %[temp9], 18(%[in]) \n\t" "lh %[temp13], 26(%[in]) \n\t" "sra %[temp8], %[temp8], 16 \n\t" - "sra %[temp17], %[temp17], 16 \n\t" - "sra %[temp4], %[temp4], 16 \n\t" "sra %[temp12], %[temp12], 16 \n\t" "lh %[temp2], 4(%[in]) \n\t" "lh %[temp6], 12(%[in]) \n\t" @@ -261,49 +259,43 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { "addu %[temp12], %[temp0], %[temp17] \n\t" "subu %[temp0], %[temp0], %[temp17] \n\t" "mul %[temp9], %[temp5], %[kC2] \n\t" - "mul %[temp17], %[temp13], %[kC1] \n\t" - "mul %[temp5], %[temp5], %[kC1] \n\t" + MUL_SHIFT_C1(temp17, temp13) + MUL_SHIFT_C1_IO(temp5, temp19) "mul %[temp13], %[temp13], %[kC2] \n\t" "sra %[temp9], %[temp9], 16 \n\t" - "sra %[temp17], %[temp17], 16 \n\t" "subu %[temp17], %[temp9], %[temp17] \n\t" - "sra %[temp5], %[temp5], 16 \n\t" "sra %[temp13], %[temp13], 16 \n\t" "addu %[temp5], %[temp5], %[temp13] \n\t" "addu %[temp13], %[temp1], %[temp17] \n\t" "subu %[temp1], %[temp1], %[temp17] \n\t" - "mul %[temp17], %[temp14], %[kC1] \n\t" + MUL_SHIFT_C1(temp17, temp14) "mul %[temp14], %[temp14], %[kC2] \n\t" "addu %[temp9], %[temp16], %[temp5] \n\t" "subu %[temp5], %[temp16], %[temp5] \n\t" "addu %[temp16], %[temp2], %[temp10] \n\t" "subu %[temp2], %[temp2], %[temp10] \n\t" "mul %[temp10], %[temp6], %[kC2] \n\t" - "mul %[temp6], %[temp6], %[kC1] \n\t" - "sra %[temp17], %[temp17], 16 \n\t" + MUL_SHIFT_C1_IO(temp6, temp19) "sra %[temp14], %[temp14], 16 \n\t" "sra %[temp10], %[temp10], 16 \n\t" - "sra %[temp6], %[temp6], 16 \n\t" "subu %[temp17], %[temp10], %[temp17] \n\t" "addu %[temp6], %[temp6], %[temp14] \n\t" "addu %[temp10], %[temp16], %[temp6] \n\t" "subu %[temp6], %[temp16], %[temp6] \n\t" "addu %[temp14], %[temp2], %[temp17] \n\t" "subu %[temp2], %[temp2], %[temp17] \n\t" - "mul %[temp17], %[temp15], %[kC1] \n\t" + MUL_SHIFT_C1(temp17, temp15) "mul %[temp15], %[temp15], %[kC2] \n\t" "addu %[temp16], %[temp3], %[temp11] \n\t" "subu %[temp3], %[temp3], %[temp11] \n\t" "mul %[temp11], %[temp7], %[kC2] \n\t" - "mul %[temp7], %[temp7], %[kC1] \n\t" + MUL_SHIFT_C1_IO(temp7, temp19) "addiu %[temp8], %[temp8], 4 \n\t" "addiu %[temp12], %[temp12], 4 \n\t" "addiu %[temp0], %[temp0], 4 \n\t" "addiu %[temp4], %[temp4], 4 \n\t" - "sra %[temp17], %[temp17], 16 \n\t" "sra %[temp15], %[temp15], 16 \n\t" "sra %[temp11], %[temp11], 16 \n\t" - "sra %[temp7], %[temp7], 16 \n\t" "subu %[temp17], %[temp11], %[temp17] \n\t" "addu %[temp7], %[temp7], %[temp15] \n\t" "addu %[temp15], %[temp3], %[temp17] \n\t" @@ -313,48 +305,40 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { "addu %[temp16], %[temp8], %[temp10] \n\t" "subu %[temp8], %[temp8], %[temp10] \n\t" "mul %[temp10], %[temp9], %[kC2] \n\t" - "mul %[temp17], %[temp11], %[kC1] \n\t" - "mul %[temp9], %[temp9], %[kC1] \n\t" + MUL_SHIFT_C1(temp17, temp11) + MUL_SHIFT_C1_IO(temp9, temp19) "mul %[temp11], %[temp11], %[kC2] \n\t" "sra %[temp10], %[temp10], 16 \n\t" - "sra %[temp17], %[temp17], 16 \n\t" - "sra %[temp9], %[temp9], 16 \n\t" "sra %[temp11], %[temp11], 16 \n\t" "subu %[temp17], %[temp10], %[temp17] \n\t" "addu %[temp11], %[temp9], %[temp11] \n\t" "addu %[temp10], %[temp12], %[temp14] \n\t" "subu %[temp12], %[temp12], %[temp14] \n\t" "mul %[temp14], %[temp13], %[kC2] \n\t" - "mul %[temp9], %[temp15], %[kC1] \n\t" - "mul %[temp13], %[temp13], %[kC1] \n\t" + MUL_SHIFT_C1(temp9, temp15) + MUL_SHIFT_C1_IO(temp13, temp19) "mul %[temp15], %[temp15], %[kC2] \n\t" "sra %[temp14], %[temp14], 16 \n\t" - "sra %[temp9], %[temp9], 16 \n\t" - "sra %[temp13], %[temp13], 16 \n\t" "sra %[temp15], %[temp15], 16 \n\t" "subu %[temp9], %[temp14], %[temp9] \n\t" "addu %[temp15], %[temp13], %[temp15] \n\t" "addu %[temp14], %[temp0], %[temp2] \n\t" "subu %[temp0], %[temp0], %[temp2] \n\t" "mul %[temp2], %[temp1], %[kC2] \n\t" - "mul %[temp13], %[temp3], %[kC1] \n\t" - "mul %[temp1], %[temp1], %[kC1] \n\t" + MUL_SHIFT_C1(temp13, temp3) + MUL_SHIFT_C1_IO(temp1, temp19) "mul %[temp3], %[temp3], %[kC2] \n\t" "sra %[temp2], %[temp2], 16 \n\t" - "sra %[temp13], %[temp13], 16 \n\t" - "sra %[temp1], %[temp1], 16 \n\t" "sra %[temp3], %[temp3], 16 \n\t" "subu %[temp13], %[temp2], %[temp13] \n\t" "addu %[temp3], %[temp1], %[temp3] \n\t" "addu %[temp2], %[temp4], %[temp6] \n\t" "subu %[temp4], %[temp4], %[temp6] \n\t" "mul %[temp6], %[temp5], %[kC2] \n\t" - "mul %[temp1], %[temp7], %[kC1] \n\t" - "mul %[temp5], %[temp5], %[kC1] \n\t" + MUL_SHIFT_C1(temp1, temp7) + MUL_SHIFT_C1_IO(temp5, temp19) "mul %[temp7], %[temp7], %[kC2] \n\t" "sra %[temp6], %[temp6], 16 \n\t" - "sra %[temp1], %[temp1], 16 \n\t" - "sra %[temp5], %[temp5], 16 \n\t" "sra %[temp7], %[temp7], 16 \n\t" "subu %[temp1], %[temp6], %[temp1] \n\t" "addu %[temp7], %[temp5], %[temp7] \n\t" @@ -542,7 +526,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17), - [temp18]"=&r"(temp18) + [temp18]"=&r"(temp18), [temp19]"=&r"(temp19) : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst) : "memory", "hi", "lo" ); diff --git a/media/libwebp/src/dsp/dec_mips_dsp_r2.c b/media/libwebp/src/dsp/dec_mips_dsp_r2.c index b0936bc46e..0ba706a2ef 100644 --- a/media/libwebp/src/dsp/dec_mips_dsp_r2.c +++ b/media/libwebp/src/dsp/dec_mips_dsp_r2.c @@ -18,10 +18,8 @@ #include "src/dsp/mips_macro.h" -static const int kC1 = 20091 + (1 << 16); -static const int kC2 = 35468; - -#define MUL(a, b) (((a) * (b)) >> 16) +static const int kC1 = WEBP_TRANSFORM_AC3_C1; +static const int kC2 = WEBP_TRANSFORM_AC3_C2; static void TransformDC(const int16_t* in, uint8_t* dst) { int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10; @@ -49,10 +47,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { static void TransformAC3(const int16_t* in, uint8_t* dst) { const int a = in[0] + 4; - int c4 = MUL(in[4], kC2); - const int d4 = MUL(in[4], kC1); - const int c1 = MUL(in[1], kC2); - const int d1 = MUL(in[1], kC1); + int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); + const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); + const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]); + const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]); int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; @@ -479,8 +477,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } -#undef MUL - //------------------------------------------------------------------------------ // Simple In-loop filtering (Paragraph 15.2) diff --git a/media/libwebp/src/dsp/dec_msa.c b/media/libwebp/src/dsp/dec_msa.c index 8090622b7b..58d1730192 100644 --- a/media/libwebp/src/dsp/dec_msa.c +++ b/media/libwebp/src/dsp/dec_msa.c @@ -37,8 +37,6 @@ d1_m = d_tmp1_m + d_tmp2_m; \ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ } -#define MULT1(a) ((((a) * 20091) >> 16) + (a)) -#define MULT2(a) (((a) * 35468) >> 16) static void TransformOne(const int16_t* in, uint8_t* dst) { v8i16 input0, input1; @@ -124,10 +122,10 @@ static void TransformDC(const int16_t* in, uint8_t* dst) { static void TransformAC3(const int16_t* in, uint8_t* dst) { const int a = in[0] + 4; - const int c4 = MULT2(in[4]); - const int d4 = MULT1(in[4]); - const int in2 = MULT2(in[1]); - const int in3 = MULT1(in[1]); + const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); + const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); + const int in2 = WEBP_TRANSFORM_AC3_MUL2(in[1]); + const int in3 = WEBP_TRANSFORM_AC3_MUL1(in[1]); v4i32 tmp0 = { 0 }; v4i32 out0 = __msa_fill_w(a + d4); v4i32 out1 = __msa_fill_w(a + c4); diff --git a/media/libwebp/src/dsp/dec_neon.c b/media/libwebp/src/dsp/dec_neon.c index 22784cf15a..83b3a1f970 100644 --- a/media/libwebp/src/dsp/dec_neon.c +++ b/media/libwebp/src/dsp/dec_neon.c @@ -1000,8 +1000,9 @@ static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride, // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the // same issue with kC1 and vqdmulh that we work around by down shifting kC2 -static const int16_t kC1 = 20091; -static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. +static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1; +static const int16_t kC2 = + WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above. #if defined(WEBP_USE_INTRINSICS) static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0, @@ -1255,15 +1256,12 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) { //------------------------------------------------------------------------------ -#define MUL(a, b) (((a) * (b)) >> 16) static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { - static const int kC1_full = 20091 + (1 << 16); - static const int kC2_full = 35468; const int16x4_t A = vld1_dup_s16(in); - const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); - const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); - const int c1 = MUL(in[1], kC2_full); - const int d1 = MUL(in[1], kC1_full); + const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4])); + const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4])); + const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]); + const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]); const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | (uint64_t)( c1 & 0xffff) << 16 | (uint64_t)(-c1 & 0xffff) << 32 | @@ -1274,7 +1272,6 @@ static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) { const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); Add4x4_NEON(m0_m1, m2_m3, dst); } -#undef MUL //------------------------------------------------------------------------------ // 4x4 diff --git a/media/libwebp/src/dsp/dec_sse2.c b/media/libwebp/src/dsp/dec_sse2.c index 01e6bcb636..ff3a28555b 100644 --- a/media/libwebp/src/dsp/dec_sse2.c +++ b/media/libwebp/src/dsp/dec_sse2.c @@ -196,15 +196,13 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { } #if (USE_TRANSFORM_AC3 == 1) -#define MUL(a, b) (((a) * (b)) >> 16) + static void TransformAC3(const int16_t* in, uint8_t* dst) { - static const int kC1 = 20091 + (1 << 16); - static const int kC2 = 35468; const __m128i A = _mm_set1_epi16(in[0] + 4); - const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2)); - const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1)); - const int c1 = MUL(in[1], kC2); - const int d1 = MUL(in[1], kC1); + const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4])); + const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4])); + const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]); + const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]); const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1); const __m128i B = _mm_adds_epi16(A, CD); const __m128i m0 = _mm_adds_epi16(B, d4); @@ -238,7 +236,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } -#undef MUL + #endif // USE_TRANSFORM_AC3 //------------------------------------------------------------------------------ @@ -259,15 +257,15 @@ static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) { *x = _mm_packs_epi16(lo_1, hi_1); } -#define FLIP_SIGN_BIT2(a, b) { \ +#define FLIP_SIGN_BIT2(a, b) do { \ (a) = _mm_xor_si128(a, sign_bit); \ (b) = _mm_xor_si128(b, sign_bit); \ -} +} while (0) -#define FLIP_SIGN_BIT4(a, b, c, d) { \ +#define FLIP_SIGN_BIT4(a, b, c, d) do { \ FLIP_SIGN_BIT2(a, b); \ FLIP_SIGN_BIT2(c, d); \ -} +} while (0) // input/output is uint8_t static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1, @@ -645,12 +643,12 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) { (m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \ } while (0) -#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \ +#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) do { \ (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \ (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \ (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \ (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \ -} +} while (0) #define LOADUV_H_EDGE(p, u, v, stride) do { \ const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \ @@ -658,18 +656,18 @@ static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) { (p) = _mm_unpacklo_epi64(U, V); \ } while (0) -#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \ +#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) do { \ LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \ LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \ LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \ LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \ -} +} while (0) -#define STOREUV(p, u, v, stride) { \ +#define STOREUV(p, u, v, stride) do { \ _mm_storel_epi64((__m128i*)&(u)[(stride)], p); \ (p) = _mm_srli_si128(p, 8); \ _mm_storel_epi64((__m128i*)&(v)[(stride)], p); \ -} +} while (0) static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1, const __m128i* const p0, diff --git a/media/libwebp/src/dsp/dsp.h b/media/libwebp/src/dsp/dsp.h index d2000b8efc..23bc296514 100644 --- a/media/libwebp/src/dsp/dsp.h +++ b/media/libwebp/src/dsp/dsp.h @@ -203,6 +203,11 @@ extern VP8DecIdct VP8TransformDC; extern VP8DecIdct VP8TransformDCUV; extern VP8WHT VP8TransformWHT; +#define WEBP_TRANSFORM_AC3_C1 20091 +#define WEBP_TRANSFORM_AC3_C2 35468 +#define WEBP_TRANSFORM_AC3_MUL1(a) ((((a) * WEBP_TRANSFORM_AC3_C1) >> 16) + (a)) +#define WEBP_TRANSFORM_AC3_MUL2(a) (((a) * WEBP_TRANSFORM_AC3_C2) >> 16) + // *dst is the destination block, with stride BPS. Boundary samples are // assumed accessible when needed. typedef void (*VP8PredFunc)(uint8_t* dst); diff --git a/media/libwebp/src/dsp/enc.c b/media/libwebp/src/dsp/enc.c index 2ba97ba8d6..395ad05b0b 100644 --- a/media/libwebp/src/dsp/enc.c +++ b/media/libwebp/src/dsp/enc.c @@ -109,10 +109,6 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) { #define STORE(x, y, v) \ dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) -static const int kC1 = 20091 + (1 << 16); -static const int kC2 = 35468; -#define MUL(a, b) (((a) * (b)) >> 16) - static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, uint8_t* dst) { int C[4 * 4], *tmp; @@ -121,8 +117,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, for (i = 0; i < 4; ++i) { // vertical pass const int a = in[0] + in[8]; const int b = in[0] - in[8]; - const int c = MUL(in[4], kC2) - MUL(in[12], kC1); - const int d = MUL(in[4], kC1) + MUL(in[12], kC2); + const int c = + WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]); + const int d = + WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]); tmp[0] = a + d; tmp[1] = b + c; tmp[2] = b - c; @@ -134,10 +132,12 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, tmp = C; for (i = 0; i < 4; ++i) { // horizontal pass const int dc = tmp[0] + 4; - const int a = dc + tmp[8]; - const int b = dc - tmp[8]; - const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1); - const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2); + const int a = dc + tmp[8]; + const int b = dc - tmp[8]; + const int c = + WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]); + const int d = + WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]); STORE(0, i, a + d); STORE(1, i, b + c); STORE(2, i, b - c); @@ -222,7 +222,6 @@ static void FTransformWHT_C(const int16_t* in, int16_t* out) { } #endif // !WEBP_NEON_OMIT_C_CODE -#undef MUL #undef STORE //------------------------------------------------------------------------------ diff --git a/media/libwebp/src/dsp/enc_mips32.c b/media/libwebp/src/dsp/enc_mips32.c index 618f0fc0ee..50518a5f1a 100644 --- a/media/libwebp/src/dsp/enc_mips32.c +++ b/media/libwebp/src/dsp/enc_mips32.c @@ -21,8 +21,8 @@ #include "src/enc/vp8i_enc.h" #include "src/enc/cost_enc.h" -static const int kC1 = 20091 + (1 << 16); -static const int kC2 = 35468; +static const int kC1 = WEBP_TRANSFORM_AC3_C1; +static const int kC2 = WEBP_TRANSFORM_AC3_C2; // macro for one vertical pass in ITransformOne // MUL macro inlined @@ -30,7 +30,7 @@ static const int kC2 = 35468; // A..D - offsets in bytes to load from in buffer // TEMP0..TEMP3 - registers for corresponding tmp elements // TEMP4..TEMP5 - temporary registers -#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \ +#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \ "lh %[temp16], " #A "(%[temp20]) \n\t" \ "lh %[temp18], " #B "(%[temp20]) \n\t" \ "lh %[temp17], " #C "(%[temp20]) \n\t" \ @@ -38,12 +38,10 @@ static const int kC2 = 35468; "addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \ "subu %[temp16], %[temp16], %[temp18] \n\t" \ "mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \ - "mul %[temp18], %[temp19], %[kC1] \n\t" \ - "mul %[temp17], %[temp17], %[kC1] \n\t" \ + MUL_SHIFT_C1_IO(temp17, temp18) \ + MUL_SHIFT_C1(temp18, temp19) \ "mul %[temp19], %[temp19], %[kC2] \n\t" \ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \ - "sra %[temp18], %[temp18], 16 \n\n" \ - "sra %[temp17], %[temp17], 16 \n\n" \ "sra %[temp19], %[temp19], 16 \n\n" \ "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \ "addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \ @@ -58,17 +56,15 @@ static const int kC2 = 35468; // temp0..temp15 holds tmp[0]..tmp[15] // A - offset in bytes to load from ref and store to dst buffer // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements -#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \ +#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \ "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \ "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \ "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \ - "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \ - "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \ + MUL_SHIFT_C1_IO(TEMP4, TEMP8) \ + MUL_SHIFT_C1(TEMP8, TEMP12) \ "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \ - "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \ - "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \ "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \ "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \ diff --git a/media/libwebp/src/dsp/enc_mips_dsp_r2.c b/media/libwebp/src/dsp/enc_mips_dsp_r2.c index 9ddd895086..e1431f3bef 100644 --- a/media/libwebp/src/dsp/enc_mips_dsp_r2.c +++ b/media/libwebp/src/dsp/enc_mips_dsp_r2.c @@ -20,8 +20,8 @@ #include "src/enc/cost_enc.h" #include "src/enc/vp8i_enc.h" -static const int kC1 = 20091 + (1 << 16); -static const int kC2 = 35468; +static const int kC1 = WEBP_TRANSFORM_AC3_C1; +static const int kC2 = WEBP_TRANSFORM_AC3_C2; // O - output // I - input (macro doesn't change it) diff --git a/media/libwebp/src/dsp/enc_neon.c b/media/libwebp/src/dsp/enc_neon.c index 714800367b..6f641c9a76 100644 --- a/media/libwebp/src/dsp/enc_neon.c +++ b/media/libwebp/src/dsp/enc_neon.c @@ -27,8 +27,9 @@ // This code is pretty much the same as TransformOne in the dec_neon.c, except // for subtraction to *ref. See the comments there for algorithmic explanations. -static const int16_t kC1 = 20091; -static const int16_t kC2 = 17734; // half of kC2, actually. See comment above. +static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1; +static const int16_t kC2 = + WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above. // This code works but is *slower* than the inlined-asm version below // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to diff --git a/media/libwebp/src/dsp/filters.c b/media/libwebp/src/dsp/filters.c index 85eee5098f..c9232ff16a 100644 --- a/media/libwebp/src/dsp/filters.c +++ b/media/libwebp/src/dsp/filters.c @@ -19,14 +19,16 @@ //------------------------------------------------------------------------------ // Helpful macro. -# define SANITY_CHECK(in, out) \ - assert((in) != NULL); \ - assert((out) != NULL); \ - assert(width > 0); \ - assert(height > 0); \ - assert(stride >= width); \ - assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ - (void)height; // Silence unused warning. +#define DCHECK(in, out) \ + do { \ + assert((in) != NULL); \ + assert((out) != NULL); \ + assert(width > 0); \ + assert(height > 0); \ + assert(stride >= width); \ + assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ + (void)height; /* Silence unused warning. */ \ + } while (0) #if !WEBP_NEON_OMIT_C_CODE static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred, @@ -49,7 +51,7 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in, const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; preds = inverse ? out : in; @@ -86,7 +88,7 @@ static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in, const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; preds = inverse ? out : in; @@ -131,7 +133,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; preds = inverse ? out : in; @@ -165,7 +167,7 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in, } #endif // !WEBP_NEON_OMIT_C_CODE -#undef SANITY_CHECK +#undef DCHECK //------------------------------------------------------------------------------ @@ -189,6 +191,12 @@ static void GradientFilter_C(const uint8_t* data, int width, int height, //------------------------------------------------------------------------------ +static void NoneUnfilter_C(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + (void)prev; + if (out != in) memcpy(out, in, width * sizeof(*out)); +} + static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in, uint8_t* out, int width) { uint8_t pred = (prev == NULL) ? 0 : prev[0]; @@ -240,7 +248,7 @@ extern void VP8FiltersInitNEON(void); extern void VP8FiltersInitSSE2(void); WEBP_DSP_INIT_FUNC(VP8FiltersInit) { - WebPUnfilters[WEBP_FILTER_NONE] = NULL; + WebPUnfilters[WEBP_FILTER_NONE] = NoneUnfilter_C; #if !WEBP_NEON_OMIT_C_CODE WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C; WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C; @@ -279,6 +287,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) { } #endif + assert(WebPUnfilters[WEBP_FILTER_NONE] != NULL); assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL); assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL); assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL); diff --git a/media/libwebp/src/dsp/filters_mips_dsp_r2.c b/media/libwebp/src/dsp/filters_mips_dsp_r2.c index 9382b12823..eca866f578 100644 --- a/media/libwebp/src/dsp/filters_mips_dsp_r2.c +++ b/media/libwebp/src/dsp/filters_mips_dsp_r2.c @@ -24,14 +24,16 @@ //------------------------------------------------------------------------------ // Helpful macro. -# define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ - assert(width > 0); \ - assert(height > 0); \ - assert(stride >= width); \ - assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ - (void)height; // Silence unused warning. +#define DCHECK(in, out) \ + do { \ + assert(in != NULL); \ + assert(out != NULL); \ + assert(width > 0); \ + assert(height > 0); \ + assert(stride >= width); \ + assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ + (void)height; /* Silence unused warning. */ \ + } while (0) #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \ const uint8_t* psrc = (uint8_t*)(SRC); \ @@ -200,7 +202,7 @@ static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in, const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; preds = in; @@ -248,7 +250,7 @@ static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in, const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; preds = in; @@ -316,7 +318,7 @@ static void DoGradientFilter_MIPSdspR2(const uint8_t* in, const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; preds = in; @@ -378,7 +380,7 @@ static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in, #undef DO_PREDICT_LINE_VERTICAL #undef PREDICT_LINE_ONE_PASS #undef DO_PREDICT_LINE -#undef SANITY_CHECK +#undef DCHECK //------------------------------------------------------------------------------ // Entry point diff --git a/media/libwebp/src/dsp/filters_msa.c b/media/libwebp/src/dsp/filters_msa.c index 14c437d141..33a1b20b70 100644 --- a/media/libwebp/src/dsp/filters_msa.c +++ b/media/libwebp/src/dsp/filters_msa.c @@ -56,12 +56,14 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src, //------------------------------------------------------------------------------ // Helpful macro. -#define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ - assert(width > 0); \ - assert(height > 0); \ - assert(stride >= width); +#define DCHECK(in, out) \ + do { \ + assert(in != NULL); \ + assert(out != NULL); \ + assert(width > 0); \ + assert(height > 0); \ + assert(stride >= width); \ + } while (0) //------------------------------------------------------------------------------ // Horrizontal filter @@ -72,7 +74,7 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height, const uint8_t* in = data; uint8_t* out = filtered_data; int row = 1; - SANITY_CHECK(in, out); + DCHECK(in, out); // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; @@ -135,7 +137,7 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height, const uint8_t* preds = data; uint8_t* out = filtered_data; int row = 1; - SANITY_CHECK(in, out); + DCHECK(in, out); // left prediction for top scan-line out[0] = in[0]; @@ -163,7 +165,7 @@ static void VerticalFilter_MSA(const uint8_t* data, int width, int height, const uint8_t* preds = data; uint8_t* out = filtered_data; int row = 1; - SANITY_CHECK(in, out); + DCHECK(in, out); // Very first top-left pixel is copied. out[0] = in[0]; @@ -182,7 +184,7 @@ static void VerticalFilter_MSA(const uint8_t* data, int width, int height, } } -#undef SANITY_CHECK +#undef DCHECK //------------------------------------------------------------------------------ // Entry point diff --git a/media/libwebp/src/dsp/filters_neon.c b/media/libwebp/src/dsp/filters_neon.c index 3e6a578ea7..b49e515af1 100644 --- a/media/libwebp/src/dsp/filters_neon.c +++ b/media/libwebp/src/dsp/filters_neon.c @@ -21,14 +21,16 @@ //------------------------------------------------------------------------------ // Helpful macros. -# define SANITY_CHECK(in, out) \ - assert(in != NULL); \ - assert(out != NULL); \ - assert(width > 0); \ - assert(height > 0); \ - assert(stride >= width); \ - assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ - (void)height; // Silence unused warning. +#define DCHECK(in, out) \ + do { \ + assert(in != NULL); \ + assert(out != NULL); \ + assert(width > 0); \ + assert(height > 0); \ + assert(stride >= width); \ + assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ + (void)height; /* Silence unused warning. */ \ + } while (0) // load eight u8 and widen to s16 #define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A)) @@ -71,7 +73,7 @@ static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; @@ -110,7 +112,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; @@ -172,7 +174,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; @@ -201,7 +203,7 @@ static void GradientFilter_NEON(const uint8_t* data, int width, int height, filtered_data); } -#undef SANITY_CHECK +#undef DCHECK //------------------------------------------------------------------------------ // Inverse transforms diff --git a/media/libwebp/src/dsp/filters_sse2.c b/media/libwebp/src/dsp/filters_sse2.c index 5c33ec15e2..bb4b5d5874 100644 --- a/media/libwebp/src/dsp/filters_sse2.c +++ b/media/libwebp/src/dsp/filters_sse2.c @@ -23,14 +23,16 @@ //------------------------------------------------------------------------------ // Helpful macro. -# define SANITY_CHECK(in, out) \ - assert((in) != NULL); \ - assert((out) != NULL); \ - assert(width > 0); \ - assert(height > 0); \ - assert(stride >= width); \ - assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ - (void)height; // Silence unused warning. +#define DCHECK(in, out) \ + do { \ + assert((in) != NULL); \ + assert((out) != NULL); \ + assert(width > 0); \ + assert(height > 0); \ + assert(stride >= width); \ + assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ + (void)height; /* Silence unused warning. */ \ + } while (0) static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred, uint8_t* dst, int length) { @@ -78,7 +80,7 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; @@ -111,7 +113,7 @@ static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; @@ -174,7 +176,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; - SANITY_CHECK(in, out); + DCHECK(in, out); in += start_offset; out += start_offset; @@ -197,7 +199,7 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in, } } -#undef SANITY_CHECK +#undef DCHECK //------------------------------------------------------------------------------ diff --git a/media/libwebp/src/dsp/lossless.h b/media/libwebp/src/dsp/lossless.h index de60d95d0b..0bf10a1a3d 100644 --- a/media/libwebp/src/dsp/lossless.h +++ b/media/libwebp/src/dsp/lossless.h @@ -182,9 +182,9 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16]; // ----------------------------------------------------------------------------- // Huffman-cost related functions. -typedef float (*VP8LCostFunc)(const uint32_t* population, int length); -typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, - int length); +typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length); +typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, + int length); typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256], const int Y[256]); diff --git a/media/libwebp/src/dsp/lossless_common.h b/media/libwebp/src/dsp/lossless_common.h index 6a2f736b5e..d6139b2b57 100644 --- a/media/libwebp/src/dsp/lossless_common.h +++ b/media/libwebp/src/dsp/lossless_common.h @@ -16,9 +16,9 @@ #ifndef WEBP_DSP_LOSSLESS_COMMON_H_ #define WEBP_DSP_LOSSLESS_COMMON_H_ -#include "src/webp/types.h" - +#include "src/dsp/cpu.h" #include "src/utils/utils.h" +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { @@ -166,7 +166,7 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) { } //------------------------------------------------------------------------------ -// Transform-related functions use din both encoding and decoding. +// Transform-related functions used in both encoding and decoding. // Macros used to create a batch predictor that iteratively uses a // one-pixel predictor. diff --git a/media/libwebp/src/dsp/lossless_enc.c b/media/libwebp/src/dsp/lossless_enc.c index cde1280617..997d56c2ad 100644 --- a/media/libwebp/src/dsp/lossless_enc.c +++ b/media/libwebp/src/dsp/lossless_enc.c @@ -636,20 +636,25 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, //------------------------------------------------------------------------------ -static float ExtraCost_C(const uint32_t* population, int length) { +static uint32_t ExtraCost_C(const uint32_t* population, int length) { int i; - float cost = 0.f; - for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2]; + uint32_t cost = population[4] + population[5]; + assert(length % 2 == 0); + for (i = 2; i < length / 2 - 1; ++i) { + cost += i * (population[2 * i + 2] + population[2 * i + 3]); + } return cost; } -static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, - int length) { +static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, + int length) { int i; - float cost = 0.f; - for (i = 2; i < length - 2; ++i) { - const int xy = X[i + 2] + Y[i + 2]; - cost += (i >> 1) * xy; + uint32_t cost = X[4] + Y[4] + X[5] + Y[5]; + assert(length % 2 == 0); + for (i = 2; i < length / 2 - 1; ++i) { + const int xy0 = X[2 * i + 2] + Y[2 * i + 2]; + const int xy1 = X[2 * i + 3] + Y[2 * i + 3]; + cost += i * (xy0 + xy1); } return cost; } diff --git a/media/libwebp/src/dsp/lossless_enc_mips32.c b/media/libwebp/src/dsp/lossless_enc_mips32.c index 639f786631..e10f12da9d 100644 --- a/media/libwebp/src/dsp/lossless_enc_mips32.c +++ b/media/libwebp/src/dsp/lossless_enc_mips32.c @@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) { // cost += i * *(pop + 1); // pop += 2; // } -// return (float)cost; -static float ExtraCost_MIPS32(const uint32_t* const population, int length) { +// return cost; +static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) { int i, temp0, temp1; const uint32_t* pop = &population[4]; const uint32_t* const LoopEnd = &population[length]; @@ -130,7 +130,7 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) { : "memory", "hi", "lo" ); - return (float)((int64_t)temp0 << 32 | temp1); + return ((int64_t)temp0 << 32 | temp1); } // C version of this function: @@ -148,9 +148,9 @@ static float ExtraCost_MIPS32(const uint32_t* const population, int length) { // pX += 2; // pY += 2; // } -// return (float)cost; -static float ExtraCostCombined_MIPS32(const uint32_t* const X, - const uint32_t* const Y, int length) { +// return cost; +static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X, + const uint32_t* const Y, int length) { int i, temp0, temp1, temp2, temp3; const uint32_t* pX = &X[4]; const uint32_t* pY = &Y[4]; @@ -183,7 +183,7 @@ static float ExtraCostCombined_MIPS32(const uint32_t* const X, : "memory", "hi", "lo" ); - return (float)((int64_t)temp0 << 32 | temp1); + return ((int64_t)temp0 << 32 | temp1); } #define HUFFMAN_COST_PASS \ diff --git a/media/libwebp/src/dsp/lossless_enc_sse41.c b/media/libwebp/src/dsp/lossless_enc_sse41.c index ad358a6f25..7ab83c2604 100644 --- a/media/libwebp/src/dsp/lossless_enc_sse41.c +++ b/media/libwebp/src/dsp/lossless_enc_sse41.c @@ -18,8 +18,53 @@ #include <smmintrin.h> #include "src/dsp/lossless.h" -// For sign-extended multiplying constants, pre-shifted by 5: -#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) +//------------------------------------------------------------------------------ +// Cost operations. + +static WEBP_INLINE uint32_t HorizontalSum_SSE41(__m128i cost) { + cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 8)); + cost = _mm_add_epi32(cost, _mm_srli_si128(cost, 4)); + return _mm_cvtsi128_si32(cost); +} + +static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) { + int i; + __m128i cost = _mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]); + assert(length % 8 == 0); + + for (i = 8; i + 8 <= length; i += 8) { + const int j = (i - 2) >> 1; + const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); + const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); + const __m128i a2 = _mm_hadd_epi32(a0, a1); + const __m128i mul = _mm_mullo_epi32(a2, w); + cost = _mm_add_epi32(mul, cost); + } + return HorizontalSum_SSE41(cost); +} + +static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a, + const uint32_t* const b, int length) { + int i; + __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]), + _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4])); + assert(length % 8 == 0); + + for (i = 8; i + 8 <= length; i += 8) { + const int j = (i - 2) >> 1; + const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); + const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]); + const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); + const __m128i w = _mm_set_epi32(j + 3, j + 2, j + 1, j); + const __m128i a2 = _mm_hadd_epi32(a0, a1); + const __m128i b2 = _mm_hadd_epi32(b0, b1); + const __m128i mul = _mm_mullo_epi32(_mm_add_epi32(a2, b2), w); + cost = _mm_add_epi32(mul, cost); + } + return HorizontalSum_SSE41(cost); +} //------------------------------------------------------------------------------ // Subtract-Green Transform @@ -44,6 +89,9 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, //------------------------------------------------------------------------------ // Color Transform +// For sign-extended multiplying constants, pre-shifted by 5: +#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5) + #define MK_CST_16(HI, LO) \ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) @@ -143,6 +191,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, extern void VP8LEncDspInitSSE41(void); WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) { + VP8LExtraCost = ExtraCost_SSE41; + VP8LExtraCostCombined = ExtraCostCombined_SSE41; VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41; VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41; VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41; diff --git a/media/libwebp/src/dsp/lossless_neon.c b/media/libwebp/src/dsp/lossless_neon.c index ddc9b61711..e9960db38a 100644 --- a/media/libwebp/src/dsp/lossless_neon.c +++ b/media/libwebp/src/dsp/lossless_neon.c @@ -146,9 +146,9 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src, #define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN))) #define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN))) #define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN))) -#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0); -#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0); -#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN))); +#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0) +#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0) +#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN))) #define ROTATE32_LEFT(L) vextq_u8((L), (L), 12) // D|C|B|A -> C|B|A|D static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) { diff --git a/media/libwebp/src/dsp/mips_macro.h b/media/libwebp/src/dsp/mips_macro.h index 44aba9b71d..e810d3d382 100644 --- a/media/libwebp/src/dsp/mips_macro.h +++ b/media/libwebp/src/dsp/mips_macro.h @@ -45,28 +45,38 @@ "ulw %[" #O2 "], " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "]) \n\t" \ "ulw %[" #O3 "], " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "]) \n\t" + +// O - output +// I - input (macro doesn't change it so it should be different from I) +#define MUL_SHIFT_C1(O, I) \ + "mul %[" #O "], %[" #I "], %[kC1] \n\t" \ + "sra %[" #O "], %[" #O "], 16 \n\t" \ + "addu %[" #O "], %[" #O "], %[" #I "] \n\t" +#define MUL_SHIFT_C2(O, I) \ + "mul %[" #O "], %[" #I "], %[kC2] \n\t" \ + "sra %[" #O "], %[" #O "], 16 \n\t" + +// Same as #define MUL_SHIFT_C1 but I and O are the same. It stores the +// intermediary result in TMP. +#define MUL_SHIFT_C1_IO(IO, TMP) \ + "mul %[" #TMP "], %[" #IO "], %[kC1] \n\t" \ + "sra %[" #TMP "], %[" #TMP "], 16 \n\t" \ + "addu %[" #IO "], %[" #TMP "], %[" #IO "] \n\t" + // O - output // IO - input/output // I - input (macro doesn't change it) #define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7, \ IO0, IO1, IO2, IO3, \ I0, I1, I2, I3, I4, I5, I6, I7) \ - "mul %[" #O0 "], %[" #I0 "], %[kC2] \n\t" \ - "mul %[" #O1 "], %[" #I0 "], %[kC1] \n\t" \ - "mul %[" #O2 "], %[" #I1 "], %[kC2] \n\t" \ - "mul %[" #O3 "], %[" #I1 "], %[kC1] \n\t" \ - "mul %[" #O4 "], %[" #I2 "], %[kC2] \n\t" \ - "mul %[" #O5 "], %[" #I2 "], %[kC1] \n\t" \ - "mul %[" #O6 "], %[" #I3 "], %[kC2] \n\t" \ - "mul %[" #O7 "], %[" #I3 "], %[kC1] \n\t" \ - "sra %[" #O0 "], %[" #O0 "], 16 \n\t" \ - "sra %[" #O1 "], %[" #O1 "], 16 \n\t" \ - "sra %[" #O2 "], %[" #O2 "], 16 \n\t" \ - "sra %[" #O3 "], %[" #O3 "], 16 \n\t" \ - "sra %[" #O4 "], %[" #O4 "], 16 \n\t" \ - "sra %[" #O5 "], %[" #O5 "], 16 \n\t" \ - "sra %[" #O6 "], %[" #O6 "], 16 \n\t" \ - "sra %[" #O7 "], %[" #O7 "], 16 \n\t" \ + MUL_SHIFT_C2(O0, I0) \ + MUL_SHIFT_C1(O1, I0) \ + MUL_SHIFT_C2(O2, I1) \ + MUL_SHIFT_C1(O3, I1) \ + MUL_SHIFT_C2(O4, I2) \ + MUL_SHIFT_C1(O5, I2) \ + MUL_SHIFT_C2(O6, I3) \ + MUL_SHIFT_C1(O7, I3) \ "addu %[" #IO0 "], %[" #IO0 "], %[" #I4 "] \n\t" \ "addu %[" #IO1 "], %[" #IO1 "], %[" #I5 "] \n\t" \ "subu %[" #IO2 "], %[" #IO2 "], %[" #I6 "] \n\t" \ diff --git a/media/libwebp/src/dsp/msa_macro.h b/media/libwebp/src/dsp/msa_macro.h index 51f6c643ab..90adbbc319 100644 --- a/media/libwebp/src/dsp/msa_macro.h +++ b/media/libwebp/src/dsp/msa_macro.h @@ -73,27 +73,25 @@ #define ST_UW(...) ST_W(v4u32, __VA_ARGS__) #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) -#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \ - static inline TYPE FUNC_NAME(const void* const psrc) { \ - const uint8_t* const psrc_m = (const uint8_t*)psrc; \ - TYPE val_m; \ - asm volatile ( \ - "" #INSTR " %[val_m], %[psrc_m] \n\t" \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m)); \ - return val_m; \ +#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline TYPE FUNC_NAME(const void* const psrc) { \ + const uint8_t* const psrc_m = (const uint8_t*)psrc; \ + TYPE val_m; \ + __asm__ volatile("" #INSTR " %[val_m], %[psrc_m] \n\t" \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + return val_m; \ } #define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc) -#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \ - static inline void FUNC_NAME(TYPE val, void* const pdst) { \ - uint8_t* const pdst_m = (uint8_t*)pdst; \ - TYPE val_m = val; \ - asm volatile ( \ - " " #INSTR " %[val_m], %[pdst_m] \n\t" \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m)); \ +#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline void FUNC_NAME(TYPE val, void* const pdst) { \ + uint8_t* const pdst_m = (uint8_t*)pdst; \ + TYPE val_m = val; \ + __asm__ volatile(" " #INSTR " %[val_m], %[pdst_m] \n\t" \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ } #define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst) diff --git a/media/libwebp/src/dsp/quant.h b/media/libwebp/src/dsp/quant.h index bf7734cb11..dcbc11c77c 100644 --- a/media/libwebp/src/dsp/quant.h +++ b/media/libwebp/src/dsp/quant.h @@ -36,8 +36,9 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, int thresh) { const int16x8_t tst_ones = vdupq_n_s16(-1); uint32x4_t sum = vdupq_n_u32(0); + int i; - for (int i = 0; i < num_blocks; ++i) { + for (i = 0; i < num_blocks; ++i) { // Set DC to zero. const int16x8_t a_0 = vsetq_lane_s16(0, vld1q_s16(levels), 0); const int16x8_t a_1 = vld1q_s16(levels + 8); diff --git a/media/libwebp/src/dsp/rescaler_neon.c b/media/libwebp/src/dsp/rescaler_neon.c index b976a852cf..957a92dbc9 100644 --- a/media/libwebp/src/dsp/rescaler_neon.c +++ b/media/libwebp/src/dsp/rescaler_neon.c @@ -32,7 +32,7 @@ #define STORE_32x8(SRC0, SRC1, DST) do { \ vst1q_u32((DST) + 0, SRC0); \ vst1q_u32((DST) + 4, SRC1); \ -} while (0); +} while (0) #if (WEBP_RESCALER_RFIX == 32) #define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1)) diff --git a/media/libwebp/src/dsp/upsampling_sse2.c b/media/libwebp/src/dsp/upsampling_sse2.c index 08b6d0b1cf..77b4f7221e 100644 --- a/media/libwebp/src/dsp/upsampling_sse2.c +++ b/media/libwebp/src/dsp/upsampling_sse2.c @@ -58,7 +58,7 @@ } while (0) // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. -#define UPSAMPLE_32PIXELS(r1, r2, out) { \ +#define UPSAMPLE_32PIXELS(r1, r2, out) do { \ const __m128i one = _mm_set1_epi8(1); \ const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \ const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \ @@ -85,7 +85,7 @@ /* pack the alternate pixels */ \ PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \ PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \ -} +} while (0) // Turn the macro into a function for reducing code-size when non-critical static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[], @@ -229,11 +229,11 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ } \ } -YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4); -YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4); +YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4) +YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4) #if !defined(WEBP_REDUCE_CSP) -YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3); -YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3); +YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3) +YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3) YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4) YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \ WebPYuv444ToRgba4444_C, 2) diff --git a/media/libwebp/src/dsp/upsampling_sse41.c b/media/libwebp/src/dsp/upsampling_sse41.c index 648d456027..e38c88d5e6 100644 --- a/media/libwebp/src/dsp/upsampling_sse41.c +++ b/media/libwebp/src/dsp/upsampling_sse41.c @@ -60,7 +60,7 @@ } while (0) // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. -#define UPSAMPLE_32PIXELS(r1, r2, out) { \ +#define UPSAMPLE_32PIXELS(r1, r2, out) do { \ const __m128i one = _mm_set1_epi8(1); \ const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \ const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \ @@ -87,7 +87,7 @@ /* pack the alternate pixels */ \ PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \ PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \ -} +} while (0) // Turn the macro into a function for reducing code-size when non-critical static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[], @@ -217,8 +217,8 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ } #if !defined(WEBP_REDUCE_CSP) -YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3); -YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3); +YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3) +YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3) #endif // WEBP_REDUCE_CSP WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) { |