From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 15 May 2024 05:35:49 +0200
Subject: Merging upstream version 126.0.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/ppc/cdef_tmpl.c | 399 +++++++++++++++++++++++-----------
 1 file changed, 274 insertions(+), 125 deletions(-)

(limited to 'third_party/dav1d/src/ppc/cdef_tmpl.c')

diff --git a/third_party/dav1d/src/ppc/cdef_tmpl.c b/third_party/dav1d/src/ppc/cdef_tmpl.c
index e2e759810f..6ef87ad448 100644
--- a/third_party/dav1d/src/ppc/cdef_tmpl.c
+++ b/third_party/dav1d/src/ppc/cdef_tmpl.c
@@ -29,11 +29,10 @@
 #if BITDEPTH == 8
 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
-                               const int damping)
+                               const uint16_t shift)
 {
     const i16x8 zero = vec_splat_s16(0);
     if (!threshold) return zero;
-    const uint16_t shift = imax(0, damping - ulog2(threshold));
     const i16x8 abs_diff = vec_abs(diff);
     const b16x8 mask = vec_cmplt(diff, zero);
     const i16x8 thr = vec_splats(threshold);
@@ -44,7 +43,7 @@ static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
     return vec_sel(min, neg, mask);
 }
 
-static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy4xN(uint16_t *tmp,
                            const uint8_t *src, const ptrdiff_t src_stride,
                            const uint8_t (*left)[2], const uint8_t *const top,
                            const uint8_t *const bottom, const int w, const int h,
@@ -114,7 +113,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
     }
 }
 
-static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy8xN(uint16_t *tmp,
                            const uint8_t *src, const ptrdiff_t src_stride,
                            const uint8_t (*left)[2], const uint8_t *const top,
                            const uint8_t *const bottom, const int w, const int h,
@@ -218,16 +217,12 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
 
 #define LOAD_PIX(addr) \
     const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
-    i16x8 max = px; \
-    i16x8 min = px; \
     i16x8 sum = vec_splat_s16(0);
 
 #define LOAD_PIX4(addr) \
     const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
-    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \
     const i16x8 px = vec_xxpermdi(a, b, 0); \
-    i16x8 max = px; \
-    i16x8 min = px; \
     i16x8 sum = vec_splat_s16(0);
 
 #define LOAD_DIR(p, addr, o0, o1) \
@@ -238,22 +233,26 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
 
 #define LOAD_DIR4(p, addr, o0, o1) \
     LOAD_DIR(p ## a, addr, o0, o1) \
-    LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+    LOAD_DIR(p ## b, addr + 8, o0, o1) \
     const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
     const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
     const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
     const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
 
-#define CONSTRAIN(p, strength) \
+#define CONSTRAIN(p, strength, shift) \
     const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
     const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
     const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
     const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
 \
-    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
-    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
-    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
-    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \
+    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \
+    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \
+    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift);
+
+#define SETUP_MINMAX \
+    i16x8 max = px; \
+    i16x8 min = px; \
 
 #define MIN_MAX(p) \
     max = max_mask(p ## 0, max); \
@@ -265,19 +264,16 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
     max = max_mask(p ## 3, max); \
     min = vec_min(p ## 3, min);
 
-#define PRI_0(p) \
-    p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
-    p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+#define MAKE_TAPS \
+    const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \
+    const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \
+    const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd));
 
-#define PRI_1(p) \
-    p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
-    p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
-
-#define SEC_0(p) \
-    p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
-    p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
-    p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
-    p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+#define PRI_0_UPDATE_SUM(p) \
+    sum = vec_madd(tap0, p ## _c0, sum); \
+    sum = vec_madd(tap0, p ## _c1, sum); \
+    sum = vec_madd(tap1, p ## _c2, sum); \
+    sum = vec_madd(tap1, p ## _c3, sum);
 
 #define UPDATE_SUM(p) \
     const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
@@ -285,92 +281,198 @@ static inline i16x8 max_mask(i16x8 a, i16x8 b) {
     sum = vec_add(sum, p ## sum0); \
     sum = vec_add(sum, p ## sum1);
 
+#define SEC_0_UPDATE_SUM(p) \
+    sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \
+    sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \
+    sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \
+    sum = vec_madd(vec_splat_s16(2), p ## _c3, sum);
+
+#define BIAS \
+    i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \
+    bias = vec_sub(vec_splat_s16(8), bias); \
+
+#define STORE4 \
+    dst[0] = vdst[0]; \
+    dst[1] = vdst[1]; \
+    dst[2] = vdst[2]; \
+    dst[3] = vdst[3]; \
+\
+    tmp += 8; \
+    dst += PXSTRIDE(dst_stride); \
+    dst[0] = vdst[4]; \
+    dst[1] = vdst[5]; \
+    dst[2] = vdst[6]; \
+    dst[3] = vdst[7]; \
+\
+    tmp += 8; \
+    dst += PXSTRIDE(dst_stride);
+
+#define STORE4_CLAMPED \
+    BIAS \
+    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+    STORE4
+
+#define STORE4_UNCLAMPED \
+    BIAS \
+    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    STORE4
+
+#define STORE8 \
+    dst[0] = vdst[0]; \
+    dst[1] = vdst[1]; \
+    dst[2] = vdst[2]; \
+    dst[3] = vdst[3]; \
+    dst[4] = vdst[4]; \
+    dst[5] = vdst[5]; \
+    dst[6] = vdst[6]; \
+    dst[7] = vdst[7]; \
+\
+    tmp += 16; \
+    dst += PXSTRIDE(dst_stride);
+
+#define STORE8_CLAMPED \
+    BIAS \
+    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+    STORE8
+
+#define STORE8_UNCLAMPED \
+    BIAS \
+    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    STORE8
+
+#define DIRECTIONS(w, tmp_stride) \
+    static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \
+        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \
+        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \
+        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 }, \
+        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 }, \
+        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 }, \
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 }, \
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 }, \
+        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 } \
+    };
+
+DIRECTIONS(4, 8)
+DIRECTIONS(8, 16)
+
 static inline void
 filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
            const pixel (*left)[2], const pixel *const top,
            const pixel *const bottom, const int w, const int h,
            const int pri_strength, const int sec_strength, const int dir,
-           const int damping, const enum CdefEdgeFlags edges,
-           const ptrdiff_t tmp_stride, uint16_t *tmp)
+           const int pri_shift, const int sec_shift,
+           const enum CdefEdgeFlags edges, uint16_t *tmp)
 {
-    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
-        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
-        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
-    };
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
-    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
-    const int off1 = cdef_directions[dir][0];
-    const int off1_1 = cdef_directions[dir][1];
+    const int off1 = cdef_directions4[dir][0];
+    const int off1_1 = cdef_directions4[dir][1];
 
-    const int off2 = cdef_directions[(dir + 2) & 7][0];
-    const int off3 = cdef_directions[(dir + 6) & 7][0];
+    const int off2 = cdef_directions4[(dir + 2) & 7][0];
+    const int off3 = cdef_directions4[(dir + 6) & 7][0];
 
-    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
-    const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
 
-    copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+    MAKE_TAPS
 
     for (int y = 0; y < h / 2; y++) {
         LOAD_PIX4(tmp)
 
+        SETUP_MINMAX
+
         // Primary pass
         LOAD_DIR4(p, tmp, off1, off1_1)
 
-        CONSTRAIN(p, pri_strength)
+        CONSTRAIN(p, pri_strength, pri_shift)
 
         MIN_MAX(p)
 
-        PRI_0(p)
-        PRI_1(p)
-
-        UPDATE_SUM(p)
+        PRI_0_UPDATE_SUM(p)
 
         // Secondary pass 1
         LOAD_DIR4(s, tmp, off2, off3)
 
-        CONSTRAIN(s, sec_strength)
+        CONSTRAIN(s, sec_strength, sec_shift)
 
         MIN_MAX(s)
 
-        SEC_0(s)
-
-        UPDATE_SUM(s)
+        SEC_0_UPDATE_SUM(s)
 
         // Secondary pass 2
         LOAD_DIR4(s2, tmp, off2_1, off3_1)
 
-        CONSTRAIN(s2, sec_strength)
+        CONSTRAIN(s2, sec_strength, sec_shift)
 
         MIN_MAX(s2)
 
         UPDATE_SUM(s2)
 
         // Store
-        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
-        bias = vec_sub(vec_splat_s16(8), bias);
-        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
-        i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
-        dst[0] = vdst[0];
-        dst[1] = vdst[1];
-        dst[2] = vdst[2];
-        dst[3] = vdst[3];
-
-        tmp += tmp_stride;
-        dst += PXSTRIDE(dst_stride);
-        dst[0] = vdst[4];
-        dst[1] = vdst[5];
-        dst[2] = vdst[6];
-        dst[3] = vdst[7];
-
-        tmp += tmp_stride;
-        dst += PXSTRIDE(dst_stride);
+        STORE4_CLAMPED
+    }
+}
+
+static inline void
+filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+               const pixel (*left)[2], const pixel *const top,
+               const pixel *const bottom, const int w, const int h,
+               const int pri_strength, const int dir,
+               const int pri_shift, const enum CdefEdgeFlags edges,
+               uint16_t *tmp)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int off1 = cdef_directions4[dir][0];
+    const int off1_1 = cdef_directions4[dir][1];
+
+    MAKE_TAPS
+
+    for (int y = 0; y < h / 2; y++) {
+        LOAD_PIX4(tmp)
+
+        // Primary pass
+        LOAD_DIR4(p, tmp, off1, off1_1)
+
+        CONSTRAIN(p, pri_strength, pri_shift)
+
+        PRI_0_UPDATE_SUM(p)
+
+        STORE4_UNCLAMPED
+    }
+}
+
+static inline void
+filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+               const pixel (*left)[2], const pixel *const top,
+               const pixel *const bottom, const int w, const int h,
+               const int sec_strength, const int dir,
+               const int sec_shift, const enum CdefEdgeFlags edges,
+               uint16_t *tmp)
+{
+    const int off2 = cdef_directions4[(dir + 2) & 7][0];
+    const int off3 = cdef_directions4[(dir + 6) & 7][0];
+
+    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
+
+    for (int y = 0; y < h / 2; y++) {
+        LOAD_PIX4(tmp)
+        // Secondary pass 1
+        LOAD_DIR4(s, tmp, off2, off3)
+
+        CONSTRAIN(s, sec_strength, sec_shift)
+
+        SEC_0_UPDATE_SUM(s)
+
+        // Secondary pass 2
+        LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+        CONSTRAIN(s2, sec_strength, sec_shift)
+
+        UPDATE_SUM(s2)
+
+        STORE4_UNCLAMPED
     }
 }
 
@@ -379,88 +481,121 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
            const pixel (*left)[2], const pixel *const top,
            const pixel *const bottom, const int w, const int h,
            const int pri_strength, const int sec_strength, const int dir,
-           const int damping, const enum CdefEdgeFlags edges,
-           const ptrdiff_t tmp_stride, uint16_t *tmp)
+           const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges,
+           uint16_t *tmp)
 {
-    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
-        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
-        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
-    };
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int off1 = cdef_directions8[dir][0];
+    const int off1_1 = cdef_directions8[dir][1];
 
-    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
-    const int off1 = cdef_directions[dir][0];
-    const int off1_1 = cdef_directions[dir][1];
+    const int off2 = cdef_directions8[(dir + 2) & 7][0];
+    const int off3 = cdef_directions8[(dir + 6) & 7][0];
 
-    const int off2 = cdef_directions[(dir + 2) & 7][0];
-    const int off3 = cdef_directions[(dir + 6) & 7][0];
+    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
 
-    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
-    const int off3_1 = cdef_directions[(dir + 6) & 7][1];
-
-    copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+    MAKE_TAPS
 
     for (int y = 0; y < h; y++) {
         LOAD_PIX(tmp)
 
+        SETUP_MINMAX
+
         // Primary pass
         LOAD_DIR(p, tmp, off1, off1_1)
 
-        CONSTRAIN(p, pri_strength)
+        CONSTRAIN(p, pri_strength, pri_shift)
 
         MIN_MAX(p)
 
-        PRI_0(p)
-        PRI_1(p)
-
-        UPDATE_SUM(p)
+        PRI_0_UPDATE_SUM(p)
 
         // Secondary pass 1
         LOAD_DIR(s, tmp, off2, off3)
 
-        CONSTRAIN(s, sec_strength)
+        CONSTRAIN(s, sec_strength, sec_shift)
 
         MIN_MAX(s)
 
-        SEC_0(s)
-
-        UPDATE_SUM(s)
+        SEC_0_UPDATE_SUM(s)
 
         // Secondary pass 2
         LOAD_DIR(s2, tmp, off2_1, off3_1)
 
-        CONSTRAIN(s2, sec_strength)
+        CONSTRAIN(s2, sec_strength, sec_shift)
 
         MIN_MAX(s2)
 
         UPDATE_SUM(s2)
 
         // Store
-        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
-        bias = vec_sub(vec_splat_s16(8), bias);
-        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
-        i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
-        dst[0] = vdst[0];
-        dst[1] = vdst[1];
-        dst[2] = vdst[2];
-        dst[3] = vdst[3];
-        dst[4] = vdst[4];
-        dst[5] = vdst[5];
-        dst[6] = vdst[6];
-        dst[7] = vdst[7];
-
-        tmp += tmp_stride;
-        dst += PXSTRIDE(dst_stride);
+        STORE8_CLAMPED
+    }
+
+}
+
+static inline void
+filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+               const pixel (*left)[2], const pixel *const top,
+               const pixel *const bottom, const int w, const int h,
+               const int pri_strength, const int dir,
+               const int pri_shift, const enum CdefEdgeFlags edges,
+               uint16_t *tmp)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int off1 = cdef_directions8[dir][0];
+    const int off1_1 = cdef_directions8[dir][1];
+
+    MAKE_TAPS
+
+    for (int y = 0; y < h; y++) {
+        LOAD_PIX(tmp)
+
+        // Primary pass
+        LOAD_DIR(p, tmp, off1, off1_1)
+
+        CONSTRAIN(p, pri_strength, pri_shift)
+
+        PRI_0_UPDATE_SUM(p)
+
+        STORE8_UNCLAMPED
+    }
+}
+
+static inline void
+filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+               const pixel (*left)[2], const pixel *const top,
+               const pixel *const bottom, const int w, const int h,
+               const int sec_strength, const int dir,
+               const int sec_shift, const enum CdefEdgeFlags edges,
+               uint16_t *tmp)
+{
+    const int off2 = cdef_directions8[(dir + 2) & 7][0];
+    const int off3 = cdef_directions8[(dir + 6) & 7][0];
+
+    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
+
+    for (int y = 0; y < h; y++) {
+        LOAD_PIX(tmp)
+
+        // Secondary pass 1
+        LOAD_DIR(s, tmp, off2, off3)
+        CONSTRAIN(s, sec_strength, sec_shift)
+
+        SEC_0_UPDATE_SUM(s)
+
+        // Secondary pass 2
+        LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+        CONSTRAIN(s2, sec_strength, sec_shift)
+
+        UPDATE_SUM(s2)
+
+        STORE8_UNCLAMPED
+
     }
 }
 
 #define cdef_fn(w, h, tmp_stride) \
@@ -477,8 +612,22 @@ void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
 { \
     ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
     uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
-    filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
-                   sec_strength, dir, damping, edges, tmp_stride, tmp); \
+    copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \
+    if (pri_strength) { \
+        const int pri_shift = imax(0, damping - ulog2(pri_strength)); \
+        if (sec_strength) { \
+            const int sec_shift = damping - ulog2(sec_strength); \
+            filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+                           sec_strength, dir, pri_shift, sec_shift, edges, tmp); \
+        } else { \
+            filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+                               dir, pri_shift, edges, tmp); \
+        } \
+    } else { \
+        const int sec_shift = damping - ulog2(sec_strength); \
+        filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \
+                           dir, sec_shift, edges, tmp); \
+    } \
 }
 
 cdef_fn(4, 4, 8);
-- 
cgit v1.2.3